### About this file
- Load train_data and plug into Huggingface libraries
- Casual Language Modeling using distillgpt2
 1. load texts and concatenate them after tokenization.
 2. split them in examples of sequence length
- This file is all about preprocessing and fine-tuning


In [1]:
# install transformers
#! pip install transformers
#!pip install accelerate -U
#!pip install transformers[torch]
# import libraries
import numpy as np
import pandas as pd
import transformers
import torch

In [2]:
# load train_data as datasets
# read text file put in DataFrame
datasets = pd.read_fwf('data/train_data.txt', header = None)
df_train = pd.DataFrame(datasets)

# drop all except conversations
# save convo by row 
# row1 - convo 1
# row2 - convo 2
df_train = df_train.iloc[:,0]


# sneak peak
#print(df_train.describe())
df_train.head(10)

0    agent: Hi! agent: How can I help you? customer...
1    agent: good afternoon, how can I help you? cus...
2    customer: HEY HO! agent: good afternoon, how c...
3    agent: Welcome to AcmeBrands! How can I help y...
4    agent: Hello, how can i help you customer: Hel...
5    agent: hello! How can I help you today? custom...
6    agent: Hello how can I help you today? custome...
7    customer: Hello agent: Hello! How can I help y...
8    agent: Thank you for contacting Acme Brands! H...
9    agent: Thank you for contacting Acme.  How may...
Name: 0, dtype: object

In [12]:
# Edited to drop first column of train_data
f = open('data/train_data.txt', "r")
g = open('data/train_data_edited.txt', "w")

for line in f:
    if line.strip():
        g.write("\t".join(line.split()[1:]) + "\n")

f.close()
g.close()

# Data load _2
from datasets import load_dataset
dataset = 'data/train_data_edited.txt'

In [13]:
# preprocessing
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') #plug bert instead of distilgpt2

# pass data to tokenizer
encoded_input = tokenizer(dataset) #datasets (train_data) convert into string format
print(encoded_input)

# tokenizer returns ['input_ids','attention_mask','token_type_ids']
# decode input_ids
tokenizer.decode(encoded_input['input_ids'])

{'input_ids': [101, 2233, 120, 2669, 168, 2233, 168, 5045, 119, 189, 1775, 1204, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] data / train _ data _ edited. txt [SEP]'

In [48]:
#0927_finetuning
#load distilgpt2 model
model_name = 'distilgpt2'

# load tokenizer
# to tokenize all texts with same vocabs
from transformers import AutoTokenizer
# Tokenizer: splits text into tokens, firstly into numbers and then tensors, which become the model inputs
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# After cleaning data, instantiate Trainer
# Use model distilGPT2
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name)

# prepare dataset
text_data = dataset
# tokenize concatenated text data
input_ids = tokenizer(text_data, truncation= True, max_length=125).input_ids

# create dataset
from transformers import TextDataset
dataset_tr = TextDataset(
    tokenizer = tokenizer,
    file_path = 'data/train_data.txt',
    block_size = 128,  #adjust block size after discussion
)

In [53]:
#Training Configuration
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    f"{model}-Finetuned-train-data",
    evaluation_strategy='epoch',
    learning_rate=2e-5, 
    weight_decay= .01, 
    push_to_hub=True,
    
)
# Create Trainer instance 
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset_tr,  
)
# Train data
trainer.train()
trainer.evaluate()


LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [None]:
#Save Fine-tuned 
trainer.save_model('./fine_tuned_0927')