In [1]:
from my_gpt2 import GPT2_poem
from gpt_neo import generate_poemNeo
import kaggle
import os
import pandas as pd
import matplotlib.pyplot as plt


## Dataset

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser("~/.kaggle")

kaggle.api.dataset_download_files('michaelarman/poemsdataset', path='./poemsdataset', unzip=True)

print("Dataset downloaded and unzipped successfully!")


In [3]:
topics=os.listdir('poemsdataset/topics/')
data=pd.DataFrame(columns=['title','content','topic'])
for topic in topics:
    path='poemsdataset/topics/'+topic

    for f in os.listdir(path):
        file_path=path+'/'+f
        with open(file_path, 'rb') as file:
            raw_text =file.read()
            text = raw_text.decode('utf-8', errors='ignore')



        df=pd.DataFrame([[f,text,topic]], columns=['title','content','topic'])

        data=pd.concat([data,df],ignore_index=True)

data

Unnamed: 0,title,content,topic
0,RememberPoemsRememberBodyPoembyConstantinePCav...,"Body, remember not only how much you were love...",remember
1,RememberPoemsRememberPoembyChristinaGeorginaRo...,"Remember me when I am gone away,\r\nGone far a...",remember
2,RememberPoemsRememberThenAndNowPoembyMissyLynn...,Remember little jack sprat\r\nRemember three l...,remember
3,RememberPoemsItWasAugustIRememberIRememberWhen...,It was August I remember\r\nI remember when I ...,remember
4,RememberPoemsRememberPoembyLindaOri.txt,Do you not remember me\r\nThe one who set your...,remember
...,...,...,...
14329,HairPoemsSonnetXiiiTheLightThatRisesFromYourFe...,The light that rises from your feet to your ha...,hair
14330,HairPoemsHairPoembyGlenRuff.txt,I don't care for hair\r\nIf I was bare of hair...,hair
14331,HairPoemsAPinHasAHeadButHasNoHairPoembyChristi...,"A pin has a head, but has no hair;\r\nA clock ...",hair
14332,HairPoemsHairInTheWindPoembyFrederickKesner.txt,`\r\nBrown in the sun\r\nof the midday born\r\...,hair


In [4]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data.loc[:, 'cleaned_content'] = data['content'].apply(clean_text)


In [5]:
poems = data[data['content'].apply(lambda x: len(x.split(' ')) < 250)]
test_set = poems.sample(n = 20)
poems = poems.loc[~poems.index.isin(test_set.index)]
test_set = test_set.reset_index()
poems = poems.reset_index()
poems

Unnamed: 0,index,title,content,topic,cleaned_content
0,0,RememberPoemsRememberBodyPoembyConstantinePCav...,"Body, remember not only how much you were love...",remember,body remember not only how much you were loved...
1,1,RememberPoemsRememberPoembyChristinaGeorginaRo...,"Remember me when I am gone away,\r\nGone far a...",remember,remember me when i am gone away gone far away ...
2,2,RememberPoemsRememberThenAndNowPoembyMissyLynn...,Remember little jack sprat\r\nRemember three l...,remember,remember little jack sprat remember three litt...
3,3,RememberPoemsItWasAugustIRememberIRememberWhen...,It was August I remember\r\nI remember when I ...,remember,it was august i remember i remember when i met...
4,4,RememberPoemsRememberPoembyLindaOri.txt,Do you not remember me\r\nThe one who set your...,remember,do you not remember me the one who set your fe...
...,...,...,...,...,...
12671,14329,HairPoemsSonnetXiiiTheLightThatRisesFromYourFe...,The light that rises from your feet to your ha...,hair,the light that rises from your feet to your ha...
12672,14330,HairPoemsHairPoembyGlenRuff.txt,I don't care for hair\r\nIf I was bare of hair...,hair,i dont care for hair if i was bare of hair i w...
12673,14331,HairPoemsAPinHasAHeadButHasNoHairPoembyChristi...,"A pin has a head, but has no hair;\r\nA clock ...",hair,a pin has a head but has no hair a clock has a...
12674,14332,HairPoemsHairInTheWindPoembyFrederickKesner.txt,`\r\nBrown in the sun\r\nof the midday born\r\...,hair,brown in the sun of the midday born silken str...


## Fine tuning GPT2

In [6]:
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os


class GPT2_poem:
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.model = GPT2LMHeadModel.from_pretrained(gpt2_type)
        self.text = []
        for row in poems['cleaned_content']:
            self.text.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
        if truncate:
            self.text = self.text[:20000]
        self.text_count = len(self.text)

    def pack_tensor(self, new_tensor, packed_tensor, max_seq_len):
        if packed_tensor is None:
            return new_tensor, True, None
        if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
            return packed_tensor, False, new_tensor
        else:
            packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
            return packed_tensor, True, None

    def train(self, batch_size=16, epochs=5, lr=2e-5, max_seq_len=400, warmup_steps=200, gpt2_type="gpt2", output_dir="/modeles", output_prefix="wreckgar", test_mode=False, save_model_on_epoch=False):
        acc_steps = 100
        device = torch.device("cuda")
        self.model = self.model.to(device)
        self.model.train()

        optimizer = AdamW(self.model.parameters(), lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
        )

        train_dataloader = DataLoader(self.text, batch_size=1, shuffle=True)

        loss = 0
        accumulating_batch_count = 0
        input_tensor = None

        for epoch in range(epochs):
            print(f"Training epoch {epoch}")
            print(loss)
            for idx, entry in tqdm(enumerate(train_dataloader)):
                (input_tensor, carry_on, remainder) = self.pack_tensor(entry, input_tensor, max_seq_len)

                if carry_on and idx != len(train_dataloader) - 1:
                    continue

                input_tensor = input_tensor.to(device)
                outputs = self.model(input_tensor, labels=input_tensor)
                loss = outputs[0]
                loss.backward()

                if (accumulating_batch_count % batch_size) == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    self.model.zero_grad()

                accumulating_batch_count += 1
                input_tensor = None

            if save_model_on_epoch:
                torch.save(
                    self.model.state_dict(),
                    os.path.join(output_dir, "gpt2_fineTuned.pt"),
                )
        return self.model


In [14]:
gpt2_poem_generator = GPT2_poem(control_code="poem", max_length=250)
gpt2_poem_generator.train(epochs=5, batch_size=8, lr=2e-5, save_model_on_epoch=True, output_dir="/content/")

Training epoch 0
0


12676it [03:57, 53.42it/s]


Training epoch 1
tensor(4.3474, device='cuda:0', grad_fn=<NllLossBackward0>)


12676it [03:56, 53.55it/s]


Training epoch 2
tensor(3.9799, device='cuda:0', grad_fn=<NllLossBackward0>)


12676it [03:57, 53.36it/s]


Training epoch 3
tensor(4.2258, device='cuda:0', grad_fn=<NllLossBackward0>)


12676it [03:57, 53.45it/s]


Training epoch 4
tensor(4.7992, device='cuda:0', grad_fn=<NllLossBackward0>)


12676it [03:58, 53.23it/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [27]:
from google.colab import files

files.download("/content/gpt2_fineTuned.pt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Test it

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch 

model_path = 'models/gpt2_fineTuned.pt'

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.load_state_dict(torch.load(model_path))
print("Model loaded successfully!")


In [65]:
def generate_poem(model, tokenizer, prompt, max_length=250):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    input_ids = input_ids.to(model.device)

    attention_mask = torch.ones(input_ids.shape, device=model.device)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.9,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_poem = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_poem


In [61]:
prompt = "nature"
generated_poem = generate_poem(model,tokenizer,prompt)
print("Output:\n" + 100 * '-')
print(generated_poem)


Output:
----------------------------------------------------------------------------------------------------
nature, in the mind of the wise man there are few who do not feel the sting of death.

The most common and rare of sins in the world are all sins are grave. For the more severe will be every sin and the more grave will be the crime of sin itself. For the more committed among the people of the earth will be guilt and the more guilty among the people of the earth will be the guilt of sin itself. For these are the sins that lead to death as it is said in the Qur'an:

"Thou shalt not commit any sin to the world or upon the earth," while others have been more severe on the human heart.


In [58]:
prompt= "daylight"
generated_poem = generate_poem(model,tokenizer,prompt)
print(generated_poem)


daylight on the other side of the bridge on the left in front of the city's famous old church and on the right in front of the village hospital.


In [71]:
prompt= "cute love"
generated_poem = generate_poem(model,tokenizer,prompt,max_length=160)
print(generated_poem)



cute love is a beautiful thing to see. If the lady was to have a son her father would have to be quite a different kind of man than he is today. She is not an angel with the angel wings she has a father who has lived a very different life in her life.

You can read your own thoughts on this topic here


# SCore GPT2 and GPT_neo


1. **BLEU (Bilingual Evaluation Understudy)** : Mesure la correspondance des n-grams (groupes de mots) entre le texte généré et le texte de référence. Plus le score est élevé, plus les deux textes sont similaires sur le plan lexical.

2. **ROUGE (Recall-Oriented Understudy for Gisting Evaluation)** : Évalue la similarité en termes de mots et de séquences entre deux textes, mais met l'accent sur le rappel (le nombre de n-grams générés présents dans le texte de référence). Il inclut plusieurs sous-scores comme ROUGE-1 (unigrammes), ROUGE-2 (bigrammes), et ROUGE-L (longest common subsequence).

In [None]:
import nltk
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

nltk.download('punkt')

gpt_neo_poem = generate_poemNeo(prompt)
def calculate_scores(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference.lower())
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    
    bleu_smooth = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=bleu_smooth)

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, candidate)
    
    return bleu_score, rouge_scores

calculate_scores(gpt_neo_poem,generated_poem)