### About this file
- Load train_data and plug into Huggingface libraries
- Casual Language Modeling using distillgpt2
 1. load texts and concatenate them after tokenization.
 2. split them in examples of sequence length
- This file is all about preprocessing and fine-tuning


In [1]:
# install transformers
# ! pip install transformers
#!pip install accelerate -U
#!pip install transformers[torch]
# import libraries
import numpy as np
import pandas as pd
import transformers
import torch

In [2]:
# load train_data as datasets
# read text file put in DataFrame
datasets = pd.read_fwf('../data/train_data.txt', header = None)
df_train = pd.DataFrame(datasets)

# drop all except conversations
# save convo by row 
# row1 - convo 1
# row2 - convo 2
df_train = df_train.iloc[:,0]


# sneak peak
#print(df_train.describe())
df_train.head(10)

0    agent: Hi! agent: How can I help you? customer...
1    agent: good afternoon, how can I help you? cus...
2    customer: HEY HO! agent: good afternoon, how c...
3    agent: Welcome to AcmeBrands! How can I help y...
4    agent: Hello, how can i help you customer: Hel...
5    agent: hello! How can I help you today? custom...
6    agent: Hello how can I help you today? custome...
7    customer: Hello agent: Hello! How can I help y...
8    agent: Thank you for contacting Acme Brands! H...
9    agent: Thank you for contacting Acme.  How may...
Name: 0, dtype: object

In [3]:
# Edited to drop first column of train_data
f = open('../data/train_data.txt', "r")
g = open('../data/train_data_edited.txt', "w")

for line in f:
    if line.strip():
        g.write("\t".join(line.split()[1:]) + "\n")

f.close()
g.close()

# Data load _2
from datasets import load_dataset
dataset = 'data/train_data_edited.txt'

In [4]:
# pretrained model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') #plug bert instead of distilgpt2

# pass data to tokenizer
encoded_input = tokenizer(dataset) #datasets (train_data) convert into string format
print(encoded_input)

# tokenizer returns ['input_ids','attention_mask','token_type_ids']
# decode input_ids
tokenizer.decode(encoded_input['input_ids'])

{'input_ids': [101, 2233, 120, 2669, 168, 2233, 168, 5045, 119, 189, 1775, 1204, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] data / train _ data _ edited. txt [SEP]'

In [5]:
#0927_finetuning
#1024_fixed
#load distilgpt2 model
#model_name = 'distilgpt2'

In [6]:
print(transformers.__version__)

4.31.0


In [7]:
model_name = 'roberta-base'

In [8]:
# load tokenizer
# to tokenize all texts with same vocabs
from transformers import RobertaTokenizer
# Tokenizer: splits text into tokens, firstly into numbers and then tensors, which become the model inputs
tokenizer = RobertaTokenizer.from_pretrained(model_name, use_fast=True)

# After cleaning data, instantiate Trainer
# Use model distilGPT2
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name)
#CLM
datasets = load_dataset('text', data_files={'train': "../data/train_data_edited.txt", 'validation': "../data/test_data.txt"})

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Downloading data files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 74.60it/s]
Generating train split: 8034 examples [00:00, 29559.76 examples/s]
Generating validation split: 1004 examples [00:00, 40144.53 examples/s]


In [9]:
def tokenize_function(examples, tokenizer=tokenizer):
    return tokenizer(examples['text'])

In [10]:
tokenized_datasets= datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=['text'])
tokenized_datasets['train'][1]

Map (num_proc=4): 100%|██████████| 8034/8034 [00:18<00:00, 435.63 examples/s]
Map (num_proc=4): 100%|██████████| 1004/1004 [00:15<00:00, 65.36 examples/s]


{'input_ids': [0,
  8396,
  50117,
  10669,
  20466,
  6,
  50117,
  9178,
  50117,
  7424,
  50117,
  100,
  50117,
  19178,
  50117,
  6968,
  116,
  50117,
  31458,
  254,
  35,
  50117,
  8987,
  50117,
  605,
  17184,
  50117,
  560,
  50117,
  15954,
  50117,
  261,
  50117,
  627,
  50117,
  29552,
  50117,
  1116,
  50117,
  102,
  50117,
  13043,
  3194,
  50117,
  23224,
  35,
  50117,
  41404,
  6,
  50117,
  14656,
  50117,
  6968,
  50117,
  26650,
  50117,
  1794,
  50117,
  16625,
  50117,
  16320,
  50117,
  13650,
  50117,
  368,
  50117,
  36617,
  50117,
  2688,
  50117,
  31458,
  254,
  35,
  50117,
  250,
  1672,
  22967,
  50117,
  41932,
  50117,
  31458,
  254,
  35,
  50117,
  8258,
  42956,
  466,
  3416,
  50117,
  23224,
  35,
  50117,
  4917,
  24176,
  50117,
  560,
  50117,
  9226,
  50117,
  6968,
  50117,
  14656,
  50117,
  26650,
  50117,
  1794,
  50117,
  627,
  50117,
  10337,
  50117,
  2688,
  50117,
  463,
  50117,
  10555,
  50117,
  23224,
  

In [11]:
def group_texts(examples):
    # Concatenate all texts.
    block_size=128
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [12]:

"""
#Training Configuration
# Add AutoModelForCasualLM-

from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
model = GPT2Tokenizer.from_pretrained(model_name)

# Tokenizer is not defined
from transformers import AutoTokenizer
# Tokenizer: splits text into tokens, firstly into numbers and then tensors, which become the model inputs
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def tokenize_function(examples,tokenizer=tokenizer):
    return tokenizer(examples['text'])

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4,remove_columns=['text'])
lm_dataset = tokenized_datasets.map(group_texts, batch_size=1000, batched=True, num_proc=4)

training_args = TrainingArguments(
    f"{model}-Finetuned-train-data",
    evaluation_strategy='epoch',
    learning_rate=2e-5, 
    weight_decay= .01, 
    push_to_hub=True,
)
    
# Create Trainer instance and pass it all
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset['train'],
    eval_dataset=lm_dataset['validation'],
    tokenizer=tokenizer,
)
# Train data
trainer.train()

"""

'\n#Training Configuration\n# Add AutoModelForCasualLM-\n\nfrom transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer\nmodel = GPT2Tokenizer.from_pretrained(model_name)\n\n# Tokenizer is not defined\nfrom transformers import AutoTokenizer\n# Tokenizer: splits text into tokens, firstly into numbers and then tensors, which become the model inputs\ntokenizer = GPT2Tokenizer.from_pretrained(model_name)\n\ndef tokenize_function(examples,tokenizer=tokenizer):\n    return tokenizer(examples[\'text\'])\n\ntokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4,remove_columns=[\'text\'])\nlm_dataset = tokenized_datasets.map(group_texts, batch_size=1000, batched=True, num_proc=4)\n\ntraining_args = TrainingArguments(\n    f"{model}-Finetuned-train-data",\n    evaluation_strategy=\'epoch\',\n    learning_rate=2e-5, \n    weight_decay= .01, \n    push_to_hub=True,\n)\n    \n# Create Trainer instance and pass it all\ntrainer = Trainer(\n    model=model

In [13]:
# Model Training and Evaluation
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    #per_device_train_batch_size=2048,
    #per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
   load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",  
    metric_for_best_model="accuracy",
)

from transformers import Trainer
from sklearn.metrics import accuracy_score

lm_dataset = tokenized_datasets.map(group_texts, batched=True, num_proc=4)


# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset['train'],
    eval_dataset=lm_dataset['validation'],
    compute_metrics=lambda eval_pred: {'accuracy': accuracy_score(eval_pred.label_ids, np.argmax(eval_pred.predictions, axis=1))}
)

trainer.train()

  0%|          | 0/3242 [02:12<?, ?it/s]


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1610612736 bytes.

In [18]:
import math

eval_results=trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

ValueError: Expected input batch_size (64) to match target batch_size (8192).

In [None]:
#Save Fine-tuned 
#trainer.save_model('./fine_tuned_1024')
trainer.save_model(',/fine_tuning_1128')
trainer.push_to_hub()

NameError: name 'trainer' is not defined