In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 31.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 71.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
Collecting aiohttp
  Downl

In [3]:
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer, TrainerCallback
import pandas as pd
from datasets import load_dataset
import torch


In [4]:
def read_dataset(dataset_path, label):
  if '.csv' in dataset_path:
    original_df = pd.read_csv(dataset_path, encoding = 'utf-8')

  original_df = original_df.dropna()
  # original_df = original_df[:30] ######################################

  if label:
    data = original_df[original_df.label == label]
  else:
    data = original_df[original_df.label == label]

  label_length = len(data)
  train = data[:int(80/100*label_length)] # commenttext, label
  test = data[int(80/100*label_length):]
  return {'train': train,  'test': test}

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

In [6]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [7]:
def prepare_text_files(label):
  d = read_dataset('cleaned.csv', label)
  with open('train.lm', 'w') as f:
    f.writelines([row['commenttext'] + '\n' for _, row in d['train'].iterrows()])

  with open('test.lm', 'w') as f:
    f.writelines([row['commenttext'] + '\n' for _, row in d['test'].iterrows()])


In [8]:
model_checkpoint = 'distilroberta-base'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [11]:
for label in [0, 1]:
  prepare_text_files(label)

  dataset = load_dataset('text', data_files={'train': ['train.lm'],
                                          'test' : ['test.lm']})


  tokenized_dataset = dataset.map(
      preprocess_function,
      batched=True,
      num_proc=4,
      remove_columns=dataset["train"].column_names)

  block_size = 128

  lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=5)
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


  model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

  training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    per_device_train_batch_size=32,
  )

  class SaveCallBack(TrainerCallback):
    "A callback that prints a message at the beginning of training"

    def on_epoch_end(self, args, state, control, **kwargs):
        global model
        torch.save(model, str(label)+'.bert_lm')
        print("saved model")

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=lm_dataset["train"],
      eval_dataset=lm_dataset["test"],
      data_collator=data_collator,
      callbacks=[SaveCallBack]
  )  

  trainer.train()

Using custom data configuration default-5afa33622172e4df


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-5afa33622172e4df/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-5afa33622172e4df/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]



     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 2860
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 450


Epoch,Training Loss,Validation Loss
1,6.4641,5.443405
2,8.3077,
3,5.0189,
4,6.7857,
5,4.1746,


***** Running Evaluation *****
  Num examples = 716
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 716
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 716
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 716
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 716
  Batch size = 8


saved model




Training completed. Do not forget to share your model on huggingface.co/models =)


Using custom data configuration default-2a005e1cfcd6da12


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-2a005e1cfcd6da12/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-2a005e1cfcd6da12/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/dis

Epoch,Training Loss,Validation Loss
1,13.0786,5.993432
2,6.3804,6.281621
3,7.5116,5.948568
4,6.9494,5.410285
5,4.459,5.048048


***** Running Evaluation *****
  Num examples = 84
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 84
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 84
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 84
  Batch size = 8


saved model


***** Running Evaluation *****
  Num examples = 84
  Batch size = 8


saved model




Training completed. Do not forget to share your model on huggingface.co/models =)




In [12]:
prefs = ['i', 'hey', 'what', 'why', 'you', 'my', 'uh', 'bro', 'saba',
         'usa', 'sohrab', 'i like', 'i hate', 'i can', 'can you',
         'i would', 'i love', '']

model.to('cpu')
model.eval()

for pref in prefs:
  inp = tokenizer.encode(pref, return_tensors="pt")
  # inp = inp[0][:-1]
  # inp = inp.unsqueeze(0)
  outputs = model.generate(inp, max_length=10, 
                           top_p=0.95, top_k=50,
                           no_repeat_ngram_size=1,
                           do_sample=True, 
                           )
  
  generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(generated)

i.65 5 6 six 4
hey.SC CMW W
what.theTheThatHeYouShe
why.ccck kK K
you.I I And Me Him Of
my.53aA A An
uh.2 2 0 3 1 9
bro.2 2 3 three 23 16
saba.50,: I
usa.16 6,D D
sohrab.6 6 CC
i like.a a amamad
i hate.., D I W
i can.the The the other one
can you.bkK K T
i would IIY y Y K
i love.0 0 1 3 13
The The New new school state's is
