<div align="left">
  <h1>GPT2</h1> <a name="0-bullet"></a>
</div>


---

In [None]:
# in order to run the notebook it is mandatory to install transformers
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.3MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     

In [None]:
# libraries used for data preparation, fine-tuning and text generation

import csv
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

# GPT2 Fine Tuning

### Prepare data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files and the saved
# model used in this notebook are read from paths under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the lyrics table from Drive and keep only the rows whose 'Idiom' is ENGLISH
lyrics = (
    pd.read_csv('/content/drive/MyDrive/Text_Analytics/Data/lyrics-data.csv')
    .loc[lambda frame: frame['Idiom'] == 'ENGLISH']
)

In [None]:
# Read the artists table and keep only Rock artists with a popularity above 5
artists = (
    pd.read_csv('/content/drive/MyDrive/Text_Analytics/Data/artists-data.csv')
    .loc[lambda frame: frame['Genre'].isin(['Rock']) & (frame['Popularity'] > 5)]
)

In [None]:
# Inner-join every lyric with its artist (lyrics 'ALink' == artists 'Link');
# lyrics whose artist was filtered out above are dropped by the inner join
df = pd.merge(
    lyrics,
    artists[['Artist', 'Genre', 'Link']],
    how='inner',
    left_on='ALink',
    right_on='Link',
)

In [None]:
# Remove the link and language columns, which are not needed for fine-tuning
df = df.drop(['ALink','SLink','Idiom','Link'], axis=1)

In [None]:
# Keep only lyrics with fewer than 350 space-separated words (not characters);
# shorter texts make the fine-tuning phase simpler.
# The vectorised `.str` accessor yields NaN for a missing lyric, and `NaN < 350`
# is False, so rows without text are dropped instead of crashing on `.split`.
df = df[df['Lyric'].str.split(' ').str.len() < 350]

In [None]:
# Preview the first rows of the merged and filtered table
df.head()

Unnamed: 0,SName,Lyric,Artist,Genre
0,What's Up,Twenty-five years and my life is still. Trying...,4 Non Blondes,Rock
1,Spaceman,Starry night bring me down. Till I realize the...,4 Non Blondes,Rock
2,Pleasantly Blue,Every time you wake in the mornin'. And you st...,4 Non Blondes,Rock
3,Train,What ya gonna do child. When your thoughts are...,4 Non Blondes,Rock
4,Calling All The People,"How can you tell, when your wellness is not we...",4 Non Blondes,Rock


### Prepare the dataset

In [None]:
class SongLyrics(Dataset):
    """Tokenised song lyrics exposed as a torch ``Dataset`` for GPT-2 fine-tuning.

    Every lyric is turned into the text
    ``<|{control_code}|>{lyric[:max_length]}<|endoftext|>``, encoded with the
    GPT-2 tokenizer and stored as a 1-D tensor of token ids.

    Args:
        control_code: Short string placed between ``<|`` and ``|>`` at the start
            of every sample. It is interpolated into an f-string, so it should
            be a plain string, not a Series/DataFrame.
        truncate: When true, only the first 20000 lyrics are used.
        gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* (not tokens) kept from each
            lyric before encoding.
        lyrics: Optional iterable of lyric strings. When omitted, the
            notebook-level ``df['Lyric']`` column is used, which is what the
            class always did before this parameter existed.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, lyrics=None):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

        if lyrics is None:
            # Backward-compatible default: fall back to the notebook's dataframe.
            lyrics = df['Lyric']

        rows = list(lyrics)
        if truncate:
            # Cap BEFORE tokenising so rows that will be discarded are never encoded.
            rows = rows[:20000]

        self.lyrics = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in rows
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Number of encoded lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor of the lyric at position ``item``."""
        return self.lyrics[item]

In [None]:
# Build the training set. The first argument of SongLyrics is the control-code
# *string* that prefixes every sample as "<|code|>"; the lyrics themselves are
# picked up inside the class. Passing the whole df['Lyric'] Series here would
# embed the printed representation of that Series into every training example.
dataset = SongLyrics("lyrics", truncate=True, gpt2_type="gpt2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




Token indices sequence length is longer than the specified maximum sequence length for this model (252536 > 1024). Running this sequence through the model will result in indexing errors


### Prepare training

In [None]:
# Load the tokenizer and the pretrained language model from the 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to add ``new_tensor`` to the sequence packed so far.

    Both tensors are indexed as 2-D with the sequence along dim 1
    (``size()[1]``, ``[:, 1:]``, ``dim=1``).

    Returns a triple ``(tensor, packed_ok, remainder)``:
      * nothing packed yet          -> ``(new_tensor, True, None)``
      * combined length fits        -> ``(new ++ packed[:, 1:], True, None)``;
        the new tensor is placed in front and the first position of the
        previously packed tensor is dropped
      * combined length > max_seq_len -> ``(packed_tensor, False, new_tensor)``,
        i.e. the pack is returned unchanged and the new tensor is handed back
    """
    # First entry of a new pack: it becomes the pack as-is.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size()[1] + packed_tensor.size()[1]
    if combined_len <= max_seq_len:
        merged = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return merged, True, None

    # No room left: give the caller the finished pack and the entry that did not fit.
    return packed_tensor, False, new_tensor

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=128, epochs=5, lr=2e-5,
    max_seq_len=500, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` using packed sequences and gradient accumulation.

    Samples are read one at a time, packed together with ``pack_tensor`` and fed
    to the model with the inputs as labels. Gradients are accumulated (summed,
    not averaged) and the optimizer/scheduler step once every ``batch_size``
    packed sequences.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Model called as ``model(ids, labels=ids)`` whose first output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        batch_size: Number of packed sequences per optimizer update.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Unused; kept for interface compatibility (see note in the loop).
        warmup_steps: Linear warm-up length, in optimizer updates.
        gpt2_type: Unused; kept for interface compatibility.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Unused; kept for interface compatibility.
        save_model_on_epoch: When true, save ``model.state_dict()`` after every epoch.

    Returns:
        The fine-tuned model (left on the training device, in train mode).
    """
    # Use the GPU when there is one, but do not crash on a CPU-only runtime.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # The linear schedule needs the total number of optimizer updates. The exact
    # count depends on how samples pack together, so use the upper bound of one
    # update per `batch_size` samples. (A negative total makes the schedule
    # return a learning rate of 0 for every update after warm-up.)
    total_updates = max(1, (len(train_dataloader) * epochs + batch_size - 1) // batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_updates
    )

    loss = 0
    accumulated = 0  # packed sequences back-propagated so far
    input_tensor = None

    def _accumulate(packed):
        """Back-propagate one packed sequence; update weights every ``batch_size`` calls."""
        nonlocal accumulated
        packed = packed.to(device)
        step_loss = model(packed, labels=packed)[0]
        step_loss.backward()

        # Count first, then test, so the first update also sees `batch_size`
        # sequences instead of firing after a single backward pass.
        accumulated += 1
        if accumulated % batch_size == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        return step_loss

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        last_idx = len(train_dataloader) - 1
        for idx, entry in tqdm(enumerate(train_dataloader)):
            # NOTE(review): the pack limit is the literal 768, not `max_seq_len`
            # (default 500). Left as-is to keep the default behaviour; confirm
            # which value is intended.
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            # Keep packing until the pack is full or the epoch's data is exhausted.
            if carry_on and idx != last_idx:
                continue

            loss = _accumulate(input_tensor)
            # Start the next pack with the sample that did not fit (None when
            # everything was packed) instead of silently dropping it.
            input_tensor = remainder

        # The very last sample of the epoch may still be waiting as a remainder.
        if input_tensor is not None:
            loss = _accumulate(input_tensor)
            input_tensor = None

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    # Apply gradients left over from an incomplete final accumulation window.
    if accumulated % batch_size != 0:
        optimizer.step()
        optimizer.zero_grad()

    return model

In [None]:
# Save the whole model object to Drive with torch.save so it can be reloaded later.
# NOTE(review): in top-to-bottom execution this cell runs before the training
# cell, so at this point `model` still holds the pretrained 'gpt2' weights
# loaded above — confirm that this is the checkpoint you mean to write here.
torch.save(model, '/content/drive/MyDrive/Text_Analytics/model_rock.pt')

### Training

In [None]:
# Fine-tune the model on the lyrics dataset, then persist the fine-tuned model.
# Saving here (after training) ensures the file loaded in the text-generation
# section contains the trained weights rather than the pretrained ones.
model = train(dataset, model, tokenizer)
torch.save(model, '/content/drive/MyDrive/Text_Analytics/model_rock.pt')

0it [00:00, ?it/s]

Training epoch 0
0


12000it [18:00, 11.11it/s]
0it [00:00, ?it/s]

Training epoch 1
tensor(1.3729, device='cuda:0', grad_fn=<NllLossBackward>)


12000it [18:07, 11.03it/s]
0it [00:00, ?it/s]

Training epoch 2
tensor(1.9431, device='cuda:0', grad_fn=<NllLossBackward>)


12000it [18:08, 11.02it/s]
0it [00:00, ?it/s]

Training epoch 3
tensor(1.4939, device='cuda:0', grad_fn=<NllLossBackward>)


12000it [18:04, 11.06it/s]
0it [00:00, ?it/s]

Training epoch 4
tensor(1.5333, device='cuda:0', grad_fn=<NllLossBackward>)


12000it [18:07, 11.04it/s]


### Text generation

In [None]:
# Load the saved model object back from Drive, mapping its tensors onto the CPU.
# torch.load unpickles the file, so only load checkpoints you created yourself.
model = torch.load('/content/drive/MyDrive/Text_Analytics/model_rock.pt', map_location=torch.device('cpu'))

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=1,
    entry_length=30, # maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with top-p (nucleus) sampling.

    Args:
        model: Language model; ``model(ids)[0]`` must be the logits with the
            sequence along dim 1.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text every entry starts from.
        entry_count: Number of independent texts to generate.
        entry_length: Maximum number of tokens sampled for each text.
        top_p: Cumulative-probability threshold; tokens outside the smallest
            set of most-likely tokens whose probability mass exceeds ``top_p``
            are excluded before sampling.
        temperature: Logits are divided by this value when it is > 0
            (otherwise they are left unscaled).

    Returns:
        A list of ``entry_count`` decoded strings (prompt included). A text that
        stopped because the end-of-sequence token was sampled ends with that
        token; a text that hit ``entry_length`` first has ``"<EOT>"`` appended.
    """
    model.eval()

    generated_list = []
    filter_value = -float("Inf")

    # "<EOT>" is only the marker appended to unfinished texts below; the token
    # the model actually emits at the end of a text is the tokenizer's EOS token.
    eos_token_id = tokenizer.eos_token_id

    with torch.no_grad():

        for _ in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for _ in range(entry_length):
                # Only the logits are needed; passing labels would make the
                # model compute a loss that is never used.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark everything past the top-p mass; shift the mask right by one
                # so the first token that crosses the threshold is still kept,
                # and always keep the single most likely token.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() == eos_token_id:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<EOT>"
            generated_list.append(output_text)

    return generated_list

# Text generation

# Comparing ends of the lyrics 

In [None]:
# Sample one continuation of the same prompt at several temperatures and print
# each result with one sentence per line

for temperature in (0.2, 0.5, 0.8, 1.0, 1.2):
    print('----- temperature:', temperature)
    generated = generate(
        model.to('cpu'),
        tokenizer,
        "ut if there's a pill to help me forget. ",
        entry_length=100,
        temperature=temperature,
    )[0].split('.')

    print('Generated text:')
    print(*generated, sep='\n')


  0%|          | 0/1 [00:00<?, ?it/s]

----- temperature: 0.2


100%|██████████| 1/1 [00:27<00:00, 27.60s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Generated text:
ut if there's a pill to help me forget
  I'm not sure if I'm going to be able to get through this
  I'm not sure if I'm going to be able to get through this
  I'm not sure if I'm going to be able to get through this
  I'm not sure if I'm going to be able to get through this
  I'm not sure if I'm going to be able to get through this
  I'm not sure if I'm going to<EOT>
----- temperature: 0.5


100%|██████████| 1/1 [00:27<00:00, 27.59s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Generated text:
ut if there's a pill to help me forget
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
  I'm not sure what to do
 <EOT>
----- temperature: 0.8


100%|██████████| 1/1 [00:27<00:00, 27.59s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Generated text:
ut if there's a pill to help me forget
  But the next time I wake up in the morning, I'll have to try and figure out what I should do
  If I can't find a solution, what can I do?  Well, here's the way
  If you're new to Buddhism, you might not be familiar with the Buddhist path
  Well, you might not realize the Buddha was a Zen master
  Well, you might not even know the Buddha's teaching
  Well<EOT>
----- temperature: 1.0


100%|██████████| 1/1 [00:27<00:00, 27.69s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Generated text:
ut if there's a pill to help me forget
  I realize I'm probably not the only one that thinks this way
  I think that  whatever there is to realize in the world is what we have to do to get there
  I also realize that I will never be free
  I will never be free
  I will never be free
  I will never be free
  I will never be free
  I will never be free
  I will never be free
<EOT>
----- temperature: 1.2


100%|██████████| 1/1 [00:27<00:00, 27.77s/it]

Generated text:
ut if there's a pill to help me forget
  And those around me will always wake up and wonder where my yoga session is all
  I want you to know that I love you so much
  I love you so much
  I want you to see me lead the way
  I want you to see me play with love and technology while we are at it
  I want you to connect with me for real and share this with those around you
  So one more time
  May<EOT>





## Songs similarities

In [None]:
# Generate a text whose prompt is the title of an original song from the dataset;
# `generated` is kept as a list of sentences (the text split on '.')

generated = generate(
    model.to('cpu'),
    tokenizer,
    "You've Got to Hide Your Love Away",
    entry_length=100,
)[0].split('.')

print('Generated text:')
print(*generated, sep='\n')


100%|██████████| 1/1 [00:36<00:00, 36.11s/it]

Generated text:
You've Got to Hide Your Love Away
"

Despite being in all the wrong places, Angelina Jolie says: "She's so clever and so perfect
 It's just she's so real
 And that's one of the greatest things about her
 She can really make me miss her
 There's a line in her own mind that I would never know her
"

Angelina Jolie talks about the 'perfect' times she's been in California in a recent episode of Westworld: "There's this great old<EOT>





In [None]:
# Find the three original lyrics most similar to the generated text (TF-IDF cosine similarity)

# `generated` is a list of sentences (the text was split on '.'); join it back so
# the whole generated text is ONE document, placed at position 0 of the corpus.
# A separate name is used so the `lyrics` dataframe loaded earlier is not overwritten.
generated_text = '.'.join(generated)
corpus = [generated_text] + df['Lyric'].tolist()

tfidf = TfidfVectorizer(stop_words="english").fit_transform(corpus) # one TF-IDF row per document

# TfidfVectorizer L2-normalises rows by default, so a dot product is the cosine
# similarity. Only row 0 (generated text vs. everything) is needed, which avoids
# building the full dense N x N matrix.
pairwise_similarity = (tfidf[0] @ tfidf.T).toarray().ravel()

pairwise_similarity[0] = -1 # mask the similarity of the generated text to itself

most_similar_idxs = pairwise_similarity.argsort()[-3:][::-1]  # top 3, most similar first

# corpus position i corresponds to df row i - 1 (exactly one generated document is prepended)
output = [', '.join(df.iloc[most_similar_idxs - 1].SName),
              *pairwise_similarity[most_similar_idxs],
              most_similar_idxs - 1]

print("similar to: {}\n- scores: {}, {}, {}".format(*output))

similar to: Hitch Hike, If I Lose Myself, You've Got To Hide Your Love Away
- scores: 0.6414909825309704, 0.6394604215311087, 0.610193487061056


In [None]:
# Generate a text from another song title; `generated` is kept as a list of sentences
generated = generate(
    model.to('cpu'),
    tokenizer,
    "The Wait",
    entry_length=100,
)[0].split('.')

print('Generated text:')
print(*generated, sep='\n')


100%|██████████| 1/1 [00:33<00:00, 33.08s/it]

Generated text:
The Waitresses' Blame Are Pieces

There's no denying the murder of Amy Grant, the show's very own celebrity advocate


Here's some of the thing that some people, many in this world, don't understand


How can we all help one another?

Let me show you what it's like to be human


Once you're inside the hole


The fight is still there


There are far too many people out there to learn<EOT>





In [None]:
# Find the three original lyrics most similar to the generated text (TF-IDF cosine similarity)

# `generated` is a list of sentences (the text was split on '.'); join it back so
# the whole generated text is ONE document, placed at position 0 of the corpus.
# A separate name is used so the `lyrics` dataframe loaded earlier is not overwritten.
generated_text = '.'.join(generated)
corpus = [generated_text] + df['Lyric'].tolist()

tfidf = TfidfVectorizer(stop_words="english").fit_transform(corpus) # one TF-IDF row per document

# TfidfVectorizer L2-normalises rows by default, so a dot product is the cosine
# similarity. Only row 0 (generated text vs. everything) is needed, which avoids
# building the full dense N x N matrix.
pairwise_similarity = (tfidf[0] @ tfidf.T).toarray().ravel()

pairwise_similarity[0] = -1 # mask the similarity of the generated text to itself

most_similar_idxs = pairwise_similarity.argsort()[-3:][::-1]  # top 3, most similar first

# corpus position i corresponds to df row i - 1 (exactly one generated document is prepended)
output = [', '.join(df.iloc[most_similar_idxs - 1].SName),
              *pairwise_similarity[most_similar_idxs],
              most_similar_idxs - 1]

print("similar to: {}\n- scores: {}, {}, {}".format(*output))

similar to: Double Crossing Time, Ashley, Where Will You Go (ep Version)
- scores: 0.22669568893973846, 0.1977533715860892, 0.14743189356879532


## Generating text with different languages

Here we test the model with prompts written in other languages: Italian, Spanish and French.

In [None]:
# Italian prompt
generated = generate(
    model.to('cpu'),
    tokenizer,
    "Siamo fuori di testa, ma diversi da loro",
    entry_length=100,
)[0].split('.')

print('Generated text:')
print(*generated, sep='\n')

100%|██████████| 1/1 [00:43<00:00, 43.26s/it]

Generated text:
Siamo fuori di testa, ma diversi da loro
 Nuestra se le hora destiano aperra dono no se
 Tan na riadido de una testa
 Anar mi anarto de la mio
 No way no no da uno
 Da voie se litra de una testa
 My chance, di supereconido no da infamy
<|endoftext|>Presents: The United States, Japan, Russia, Hong Kong, Canada, UK, Australia

Duration: 1<EOT>





In [None]:
# Spanish prompt
generated = generate(
    model.to('cpu'),
    tokenizer,
    "Sí, sabes que ya llevo un rato mirándote",
    entry_length=100,
)[0].split('.')

print('Generated text:')
print(*generated, sep='\n')

100%|██████████| 1/1 [00:44<00:00, 44.07s/it]

Generated text:
Sí, sabes que ya llevo un rato mirándote es mismo

Be that as it may, sabes que ya llevo un rato mirándote es mismo<|endoftext|>Young child on its own
 Just not quite ready
 Still
 Trying to think through the thought process
 I don't know what it feels like
 What kind of thing it feels like
 I can't help thinking that maybe I should change
 I really don't want to try
 I really don't want to try
 I really don't want to<EOT>





In [None]:
# French prompt
generated = generate(
    model.to('cpu'),
    tokenizer,
    "La vie c'est plus marrant",
    entry_length=100,
)[0].split('.')

print('Generated text:')
print(*generated, sep='\n')

100%|██████████| 1/1 [00:39<00:00, 39.78s/it]

Generated text:
La vie c'est plus marranto!" A silvestri se père m'enchanté
 C'est moi les cabs de que vous sommes quelques zez pas m'avez, l'enflançon des seilères
 Mi mécuisine est un voir
 Don't know if I'm going to kill you now, when I'm out here


It's summer, so you'll have to take the jump
 At least let me<EOT>





## Word importance for text generation

We use the ECCO library to inspect how a pretrained GPT-2 model from Hugging Face generates text.

In [None]:
# Install the ecco package used in this section
# NOTE(review): no version is pinned here — consider pinning one for reproducibility
!pip install ecco

In [None]:
import ecco
# Load the stock 'gpt2' checkpoint through ecco.
# NOTE: this is the pretrained model, not the fine-tuned `model` loaded from Drive earlier.
lm = ecco.from_pretrained('gpt2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
# Prompt: the same song title used in the similarity section
text= "You've Got to Hide Your Love Away"

# Generate a continuation with ecco and keep the returned object for the saliency views below
# (presumably `generate=20` is the number of new tokens and `do_sample=True` enables sampling — confirm in the ecco docs)
output = lm.generate(text, generate=20, do_sample=True)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Display ecco's saliency view for the generation above (rendered as an interactive JavaScript output)
output.saliency()

<IPython.core.display.Javascript object>

In [None]:
# Same saliency view, requested with the "detailed" style
output.saliency(style="detailed")

<IPython.core.display.Javascript object>