# Refine a pretrained model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd

device  = torch.device("cpu")
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("GPT2")
model = AutoModelForCausalLM.from_pretrained("GPT2")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [2]:
def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors='pt', padding="max_length", truncation=True)


## Load dataset

In [62]:
pd.set_option('display.max_colwidth', 200)
path = 'depressed.csv' 
df = pd.read_csv(path)
df.head()

Unnamed: 0,name,title,text
0,t3_11nazcf,i just want to be someone's favourite person.,"i feel so worthless all the time. i'm not hated, but i'm always just... off to the side. forgettable, dismissed. i'm the kind of person people look at and see as their last choice. i'm never as im..."
1,t3_11n278p,Does it actually ever get better?,"I have been on autopilot for a while now. Wake up, force myself to attend lecture because I have to, come home, and attempt to study most of the day. Everything at this point feels pointless I gue..."
2,t3_11n8aht,put me out of my fucking misery,"(15f) no amount of therapy or pills are going to make the people at school hate me less, and my mom literally told me that maybe she should “start looking for mental hospitals instead of vacation ..."
3,t3_11n3506,I found a fitting therapist,That is all. \n\nI FINALLY found a therapist that works with me and is so lovely and understanding! I had my first session and I can already tell she will be amazing! I could not be happier.
4,t3_11ncojh,"I feel barely functional most of the time, but apparently I'm ok at seeming ok","I get up every day, I go to work, I eat food, I laugh and make jokes, and I don't cry in front of my coworkers or family most of the time. I'm so used to thinking about driving my car off the brid..."


In [63]:
import re
# regex to replace (number) from text
regex = re.compile(r'\(\d+\w\)')
# remove (number) from text and title
df['text'] = df['text'].apply(lambda x: regex.sub('', x))
df['title'] = df['title'].apply(lambda x: regex.sub('', x))

# Add punctuation to end of title
df['title'] = df['title'].apply(lambda x: x + '.' if x[-1] not in ['.', '?', '!'] else x)

# combine title and text
df['text'] = df['title'] + ' ' + df['text']
train_data = df.text.to_list()

In [5]:
df.text.map(lambda x: len(x.split())).describe()

count    5050.000000
mean      141.467327
std       108.898441
min        34.000000
25%        68.000000
50%       109.000000
75%       174.000000
max       578.000000
Name: text, dtype: float64

In [6]:
# train_data = [{'text': x} for x in train_data]
from transformers import TextDataset, DataCollatorForLanguageModeling
# save training data to file to be used by TextDataset
with open('train_data.txt', 'w') as f:
    f.write('\s'.join(train_data))


dataset = TextDataset(tokenizer=tokenizer, file_path='train_data.txt', block_size=1024)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [7]:
# # create text dataset with pytorch
# class DepressedData(torch.utils.data.Dataset):
#     def __init__(self, texts):
#         self.tokenizer = tokenizer
#         self.input_ids = []
#         self.attn_masks = []
#         for text in texts:
#             encodings_dict = tokenize_function(text)
#             self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
#             self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
#     def __getitem__(self, idx):
#         return self.input_ids[idx], self.attn_masks[idx]
#     def __len__(self):
#         return len(self.input_ids)
    
# dataset = DepressedData(train_data)

In [8]:
dataset.__dict__.keys()

dict_keys(['examples'])

In [9]:
# import data loader
from torch.utils.data import DataLoader

# create data loader
batch_size = 20
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator, num_workers=8)

In [10]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [11]:
import os
output_dir = 'depressed_model'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Use CPU instead of GPU
training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    learning_rate=1e-5,
    seed=42,
    no_cuda=True,
)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset,
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 1221
  Num Epochs = 2
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 124
  Number of trainable parameters = 124439808
100%|██████████| 124/124 [7:23:59<00:00, 192.88s/it]  

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 124/124 [7:23:59<00:00, 192.88s/it]

{'train_runtime': 26639.0522, 'train_samples_per_second': 0.092, 'train_steps_per_second': 0.005, 'train_loss': 2.5173878823557208, 'epoch': 2.0}


100%|██████████| 124/124 [7:24:00<00:00, 214.84s/it]


TrainOutput(global_step=124, training_loss=2.5173878823557208, metrics={'train_runtime': 26639.0522, 'train_samples_per_second': 0.092, 'train_steps_per_second': 0.005, 'train_loss': 2.5173878823557208, 'epoch': 2.0})

In [14]:
trainer.save_model("unpopular_model")

Saving model checkpoint to unpopular_model
Configuration saved in unpopular_model\config.json
Model weights saved in unpopular_model\pytorch_model.bin
