# Fine-tuning DistilGPT2 (GPT-2 family) on a Quotes Dataset
#### Dataset: https://github.com/ShivaliGoel/Quotes-500K



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np

# Download Dataset

In [None]:
!wget https://github.com/Quotify-Bot/model-training/releases/download/dataset/quotes_dataset.csv
!mkdir models

In [None]:
# Load the dataset and keep only its first three columns.
df = pd.read_csv('./quotes_dataset.csv')

df = df.iloc[:, 0:3]
df.columns = ['Quote', 'Author', 'Categories']

# Keep only rows whose category string mentions at least one of these topics.
# The list is joined into one regex alternation; rows with a missing category
# are dropped (na=False).
searchfor = ['inspiration', 'motivation', 'life-lesson', 'love', 'hope', 'friendship', 'life', 'faith', 'universe', 'nature']
# .copy() detaches the filtered rows from the original frame so that adding the
# 'Length' column below does not trigger pandas' chained-assignment warning.
df = df[df['Categories'].str.contains('|'.join(searchfor), na=False)].copy()
# Word count per quote. .str.len() leaves a missing quote as NaN, whereas
# apply(len) would raise TypeError on it.
df['Length'] = df['Quote'].str.split().str.len()
df.describe()


# Prepare Data

In [None]:
# Fraction of the filtered rows that goes to the training split; the remainder
# becomes the test split. random_state pins the shuffle so the split is
# reproducible.
train_test_ratio = 0.85
# Only used by the (currently disabled) train/validation split below.
train_valid_ratio = 70/85
df_full_train, df_test = train_test_split(
    df,
    train_size=train_test_ratio,
    random_state=1,
)
# df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

In [None]:
!pip install transformers
!pip install contractions

from transformers import AutoTokenizer,AutoModelWithLMHead
import contractions



# Tokenizer used by prepare_dataset() below, which only reads its
# special_tokens_map to get the BOS/EOS marker strings. The name `tokenizer`
# is rebound to the 'distilgpt2' tokenizer in a later cell before training.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def prepare_dataset(df, dest_path):
    """Clean the quotes in ``df`` and write them to ``dest_path``, one per line.

    Each value of ``df['Quote']`` is lower-cased, whitespace-normalised and has
    its contractions expanded. Quotes with fewer than 10 or more than 100
    words, or containing ``~`` or ``-``, are skipped. The surviving quotes are
    stripped of non-ASCII characters, re-spaced around punctuation, stripped
    of ``<...>`` tag-like spans, and wrapped in the tokenizer's BOS/EOS tokens.

    Args:
        df: DataFrame with a ``'Quote'`` column.
        dest_path: Path of the text file to (over)write.

    Relies on the module-level ``tokenizer`` for the BOS/EOS token strings.
    """
    bos_token = tokenizer.special_tokens_map['bos_token']
    eos_token = tokenizer.special_tokens_map['eos_token']

    # A quote containing any of these is dropped entirely. ('--' used to be
    # listed as well, but every string containing '--' also contains '-'.)
    banned_substrings = ('~', '-')

    lines = []
    for quote in df['Quote'].tolist():
        # Lower-case everything; str() also turns missing values into text,
        # which the word-count filter below then discards.
        quote = str(quote).strip().lower()

        # Turn every whitespace character (tab, newline, ...) into a plain
        # space. Runs of whitespace are NOT collapsed.
        quote = re.sub(r"\s", " ", quote)

        # Expand contractions
        quote = contractions.fix(quote)

        # Keep only quotes of 10 to 100 words (inclusive)
        word_count = len(quote.split())
        if word_count < 10 or word_count > 100:
            continue

        # Remove non-ASCII characters
        quote = quote.encode("ascii", "ignore").decode()

        if any(banned in quote for banned in banned_substrings):
            continue

        # Insert whitespace between word and punctuation
        quote = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", quote)
        # Drop anything that looks like a <...> tag
        quote = re.sub(r'<[^>]+>', r"", quote)

        lines.append(bos_token + quote + eos_token + '\n')

    # The context manager guarantees the file is flushed and closed before the
    # function returns; the explicit encoding keeps the output independent of
    # the platform default.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(''.join(lines))

In [None]:
# Write the cleaned training and test quotes to the text files that the
# training cells read back.
prepare_dataset(df=df_full_train, dest_path='train.txt')
prepare_dataset(df=df_test, dest_path='test.txt')
# prepare_dataset(df_valid, 'valid.txt')

In [None]:
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement
# for causal language models such as (Distil)GPT-2.
from transformers import AutoModelForCausalLM

# Tokenizer and pretrained weights of the model that gets fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# Train Model

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal LM training.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the test text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` passed to each ``TextDataset``.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _text_dataset(path):
        # Both splits are built with identical settings; only the file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)
    # mlm=False: collate for causal (not masked) language modelling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Build the datasets and collator from the files written by prepare_dataset().
train_dataset, test_dataset, data_collator = load_dataset(
    train_path='train.txt',
    test_path='test.txt',
    tokenizer=tokenizer,
)


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./models",  # directory for checkpoints and the final model
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=15,  # number of training epochs
    # per_device_* replaces the deprecated per_gpu_* spelling of these options.
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    logging_steps=500,  # number of update steps between two log entries
    save_steps=5000,  # number of update steps between two checkpoint saves
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
)
# NOTE(review): no evaluation schedule is configured above, so eval_dataset is
# presumably only used when trainer.evaluate() is called explicitly - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # prediction_loss_only=True,
    # compute_metrics=True
)

## Use [HyperDash](https://hyperdash.io/) for monitoring and start training

In [None]:
!pip install hyperdash
!hyperdash login --email

from hyperdash import Experiment

# Report the epoch count that is actually configured on the trainer instead of
# a hard-coded number, so the experiment log cannot drift from the real run.
num_epochs = training_args.num_train_epochs
exp = Experiment(
    "Distil GPT2 Quotes Motivational 10-100, {:g} epochs v1".format(num_epochs)
)
epochs = exp.param("Epochs", num_epochs)
try:
    trainer.train()
    # Save the fine-tuned weights and the tokenizer side by side so that the
    # inference cell can load both from ./models.
    trainer.save_model("./models")
    tokenizer.save_pretrained("./models")
finally:
    # Always close the experiment, even when training is interrupted or fails.
    exp.end()

from google.colab import files
# Download the saved model weights from the Colab runtime to the local machine.
# NOTE(review): assumes trainer.save_model() wrote pytorch_model.bin; some
# transformers releases may write model.safetensors instead - verify.
files.download('./models/pytorch_model.bin') 



# Inference

In [None]:
!pip install transformers
from transformers import *

# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement
# for causal language models such as (Distil)GPT-2.
from transformers import AutoModelForCausalLM

# Reload the fine-tuned tokenizer and weights saved to ./models by training.
tokenizer = AutoTokenizer.from_pretrained("./models")
model = AutoModelForCausalLM.from_pretrained("./models")

gpt2_finetune = pipeline('text-generation', model=model, tokenizer=tokenizer)


In [None]:
def generate_quote(starting_text, min_length, max_length):
    """Generate one quote that continues ``starting_text``.

    Args:
        starting_text: Prompt handed to the ``gpt2_finetune`` pipeline.
        min_length: ``min_length`` forwarded to the pipeline.
        max_length: ``max_length`` forwarded to the pipeline.

    Returns:
        The ``'generated_text'`` field of the first result.
    """
    outputs = gpt2_finetune(
        starting_text,
        min_length=min_length,
        max_length=max_length,
        # top_k / top_p / temperature only take effect when sampling; request
        # it explicitly instead of relying on the saved model configuration.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.5,
    )
    return outputs[0]['generated_text']





In [None]:
# Seed texts the fine-tuned model is asked to continue.
prompts = [
    'why am i here',
    'i am cool and good ',
    'how are thou',
    'Life is a journey',
    'In the end ',
    'Happiness is ',
]

# Ten generations per prompt (10 to 50 tokens each), kept in prompt order.
all_quotes = [
    generate_quote(seed_text, 10, 50)
    for seed_text in prompts
    for _ in range(10)
]


In [None]:
# Show every generated quote on its own line.
for generated_quote in all_quotes:
    print(generated_quote)