In [2]:
import pandas as pd
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import seaborn as sns
import tqdm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
from scipy import spatial
import scipy
# Register tqdm's pandas integration so that Series gain `.progress_apply`,
# which the preprocessing cell relies on.
# NOTE: `tqdm` here is the class from `tqdm.notebook`; that from-import rebinds
# the name and shadows the plain `import tqdm` module import just before it.
tqdm.pandas()

In [3]:
# Load the Medium articles CSV into a DataFrame and report its size.
articles_csv = "../input/medium-articles/medium_articles.csv"
data = pd.read_csv(articles_csv)
print(f"Data has {len(data)} rows")
# Bare last expression so the notebook renders a preview of the first rows.
data.head()

Data has 192368 rows


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [4]:
def RemoveNewLines(text):
    """Return ``text`` with every ``"\\n"`` replaced by a single space.

    Only the line-feed character is touched; other whitespace (including
    ``"\\r"``) is left as-is.
    """
    # Splitting on "\n" and re-joining with " " is a one-for-one substitution.
    return " ".join(text.split("\n"))

def CheckTag(tags):
    """Report whether an article carries at least one tag of interest.

    Args:
        tags: A string holding a Python list literal of tag names, e.g.
            ``"['Health', 'Science']"`` (the form shown in the ``tags``
            column preview of the loaded CSV).

    Returns:
        True if any of the wanted tags is contained in the parsed value,
        False otherwise.

    Raises:
        ValueError, SyntaxError: if ``tags`` is not a valid Python literal.
    """
    # Function-scope import keeps this definition self-contained.
    import ast

    # The tag strings come straight from a CSV file, so they must not be run
    # through eval(): literal_eval only accepts plain literals and refuses
    # function calls or other executable expressions.
    # Parse once instead of once per wanted tag.
    parsed_tags = ast.literal_eval(tags)

    wanted_tags = ('Science', "Machine Learning", "Artificial Intelligence", "Health", "Coronavirus")
    return any(tag in parsed_tags for tag in wanted_tags)

# Flag every article whose tag list contains at least one tag of interest.
data["contains_tag"] = data["tags"].progress_apply(CheckTag)

# Keep only the flagged rows. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original DataFrame (the pattern behind SettingWithCopyWarning).
data = data[data["contains_tag"] == True].copy()

# Flatten each article onto a single line, then derive whitespace-separated
# tokens and the per-article token count.
data["text"] = data["text"].progress_apply(RemoveNewLines)
data["tokens"] = data["text"].progress_apply(str.split)
data["text_len"] = data["tokens"].progress_apply(len)

  0%|          | 0/192368 [00:00<?, ?it/s]

  0%|          | 0/14260 [00:00<?, ?it/s]

  0%|          | 0/14260 [00:00<?, ?it/s]

  0%|          | 0/14260 [00:00<?, ?it/s]

In [5]:
# Dump the filtered articles to a plain-text corpus, one article per line
# (newlines inside each article were already replaced by spaces above).
# The encoding is pinned to UTF-8 because the articles contain non-ASCII
# characters (e.g. curly apostrophes) and the default encoding of open() is
# platform/locale dependent, which can raise UnicodeEncodeError or produce a
# file the downstream reader decodes differently.
with open("corpus.txt", "w", encoding="utf-8") as f:
    for text in data["text"].values:
        f.write(text + "\n")

In [6]:
# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer and the masked-LM weights are loaded from the same local directory.
pretrained_dir = "../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir)
model = AutoModelForMaskedLM.from_pretrained(pretrained_dir)
model = model.to(device)

In [7]:
# Build the training dataset from the corpus file written earlier.
# NOTE(review): block_size presumably caps the tokenized length of each line,
# meaning only the start of long articles is used -- confirm against the
# transformers documentation.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="corpus.txt",
    block_size=128,
)



In [8]:
# Collator for masked-language-model training, built on the same tokenizer.
# NOTE(review): mlm_probability is presumably the fraction of tokens selected
# for masking in each batch -- confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [9]:
# Training configuration: 5 epochs at 64 examples per device, writing
# checkpoints into the working directory every 200 steps and keeping at most
# the two most recent ones.
training_args = TrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=64,
    save_steps=200,
    save_total_limit=2,
)

# Wire the model, the masking collator and the line-by-line dataset together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 14260
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1115
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mryanbarretto[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
500,1.9386
1000,1.8291


Saving model checkpoint to ./checkpoint-200
Configuration saved in ./checkpoint-200/config.json
Model weights saved in ./checkpoint-200/pytorch_model.bin
Saving model checkpoint to ./checkpoint-400
Configuration saved in ./checkpoint-400/config.json
Model weights saved in ./checkpoint-400/pytorch_model.bin
Saving model checkpoint to ./checkpoint-600
Configuration saved in ./checkpoint-600/config.json
Model weights saved in ./checkpoint-600/pytorch_model.bin
Deleting older checkpoint [checkpoint-200] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-800
Configuration saved in ./checkpoint-800/config.json
Model weights saved in ./checkpoint-800/pytorch_model.bin
Deleting older checkpoint [checkpoint-400] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-600] due to args.save_total_limit


Training 

TrainOutput(global_step=1115, training_loss=1.8757108953501611, metrics={'train_runtime': 736.208, 'train_samples_per_second': 96.848, 'train_steps_per_second': 1.515, 'total_flos': 2363983702502400.0, 'train_loss': 1.8757108953501611, 'epoch': 5.0})