# Installations

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import GPT2TokenizerFast,create_optimizer,DataCollatorForLanguageModeling,TFGPT2LMHeadModel

In [None]:
# Token length of each training chunk; preprocessing keeps only chunks of exactly this length.
MAX_LENGTH = 256
# Number of chunks stored per song row; shorter rows are padded up to this count.
BATCH_SIZE = 6

# Dataset Preparation

In [None]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d juicobowley/drake-lyrics
!unzip "/content/drake-lyrics.zip" -d "/content/dataset/"

Downloading drake-lyrics.zip to /content
  0% 0.00/764k [00:00<?, ?B/s]
100% 764k/764k [00:00<00:00, 95.6MB/s]
Archive:  /content/drake-lyrics.zip
  inflating: /content/dataset/drake_data.csv  
  inflating: /content/dataset/drake_data.json  
  inflating: /content/dataset/drake_lyrics.txt  


In [None]:
# Load the extracted lyrics CSV with the `datasets` CSV loader.
filepath="/content/dataset/drake_data.csv"
dataset = load_dataset('csv', data_files=filepath)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['album', 'lyrics_title', 'lyrics_url', 'lyrics', 'track_views'],
        num_rows: 290
    })
})

In [None]:
dataset['train'][184]

{'album': 'Thank Me Later',
 'lyrics_title': 'Thank Me Later [Booklet] Lyrics',
 'lyrics_url': 'https://genius.com/Drake-thank-me-later-booklet-annotated',
 'lyrics': None,
 'track_views': '6.2K'}

In [None]:
# Checkpoint name used for the tokenizer here and for the model loaded later.
model_id="gpt2-medium"
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [None]:
dataset['train'][184]

{'album': 'Thank Me Later',
 'lyrics_title': 'Thank Me Later [Booklet] Lyrics',
 'lyrics_url': 'https://genius.com/Drake-thank-me-later-booklet-annotated',
 'lyrics': None,
 'track_views': '6.2K'}

In [None]:
n_wasted=0

In [None]:
# Disabled diagnostic, kept for reference: tokenizes every song into chunks of
# at most 256 tokens and adds the length of each shorter chunk to n_wasted.
# for i in range(len(dataset['train'])):
#   try:
#     outputs = tokenizer(
#       dataset["train"][i]["lyrics"],
#       truncation=True,
#       max_length=256,
#       return_overflowing_tokens=True,
#       return_length=True,
#     )
#     print(i,outputs['length'])

#     for k in outputs['length']:
#       if k!=256:
#         n_wasted+=k
#   except:
#     print('----------------------->i',i)

In [None]:
print(n_wasted)

0


In [None]:
def preprocess_function(example):
  """Split one song's lyrics into fixed-length token chunks.

  The lyrics are tokenized with overflow enabled, so a long song yields several
  chunks of at most MAX_LENGTH tokens. Only chunks that are exactly MAX_LENGTH
  tokens long are kept. When at least one chunk is kept, the last kept chunk is
  repeated until the row holds BATCH_SIZE chunks.

  Args:
    example: one dataset row; only its "lyrics" field is read.

  Returns:
    {"input_ids": chunks}, where chunks is a list of token-id lists. The list
    is empty when the row has no lyrics text or no full-length chunk.
  """
  lyrics = example["lyrics"]
  if not isinstance(lyrics, str):
    # Rows without lyrics text (e.g. None) cannot be tokenized. Report and skip
    # them explicitly rather than hiding every possible error behind a bare
    # `except:`; genuine tokenizer failures now surface instead of silently
    # producing an empty row.
    print(example)
    return {"input_ids": []}

  outputs = tokenizer(
      lyrics,
      truncation=True,
      max_length=MAX_LENGTH,
      return_overflowing_tokens=True,
      return_length=True,
  )
  input_batch = [
      input_ids
      for length, input_ids in zip(outputs["length"], outputs["input_ids"])
      if length == MAX_LENGTH
  ]
  if input_batch:
    # Pad short rows by repeating the last full chunk. A non-positive repeat
    # count yields no padding.
    # NOTE(review): rows with more than BATCH_SIZE full chunks stay longer than
    # BATCH_SIZE; confirm downstream fixed-shape code never sees such a row.
    last_chunk = input_batch[-1]
    input_batch.extend([last_chunk] * (BATCH_SIZE - len(input_batch)))
  return {"input_ids": input_batch}

In [None]:
# Tokenize every row; the source columns are removed so that only the
# "input_ids" column produced by preprocess_function remains.
source_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(preprocess_function, remove_columns=source_columns)

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

{'album': 'Thank Me Later', 'lyrics_title': 'Thank Me Later [Booklet] Lyrics', 'lyrics_url': 'https://genius.com/Drake-thank-me-later-booklet-annotated', 'lyrics': None, 'track_views': '6.2K'}
{'album': 'Unreleased Songs', 'lyrics_title': 'Untitled DaBaby Collaboration* (Ft. DaBaby) Lyrics', 'lyrics_url': 'https://genius.com/Drake-untitled-dababy-collaboration-lyrics', 'lyrics': None, 'track_views': '(Unreleased)'}


In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 290
    })
})

In [None]:
def filter_out(example):
  """Predicate for Dataset.filter: keep rows that hold at least one chunk.

  Returns the row itself (truthy) when example['input_ids'] is non-empty and
  None (falsy) when it is empty.
  """
  if len(example['input_ids']) < 1:
    return None
  return example

In [None]:
# Drop the rows whose chunk list is empty (no lyrics, or no full-length chunk).
tokenized_full_dataset=tokenized_dataset.filter(filter_out)
print(tokenized_full_dataset)

Filter:   0%|          | 0/290 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 270
    })
})


In [None]:
max_batch_len=0

In [None]:
# Find the largest number of chunks stored in any row. Iterating over the split
# itself avoids hard-coding the row count, so the check stays correct if the
# dataset or the preprocessing changes.
for row in tokenized_full_dataset['train']:
  max_batch_len = max(max_batch_len, len(row['input_ids']))

In [None]:
print(max_batch_len)

6


In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-model masking; the batch printed further
# down shows labels identical to input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [None]:
# batch_size=1 makes each tf.data element exactly one dataset row, i.e. one
# song's stack of chunks. "attention_mask" and "labels" are not dataset
# columns; they are produced by the collator.
train_split = tokenized_full_dataset["train"]
tf_train_dataset = train_split.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=1,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Peek at a single element to check tensor shapes and dtypes.
for sample in tf_train_dataset.take(1):
  print(sample)

{'input_ids': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58,  5317,   305, ...,  4908, 10194,    11],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40]]])>, 'attention_mask': <tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[1, 1, 1, 1, 1, 1]])>, 'labels': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58,  5317,   305, ...,  4908, 10194,    11],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40],
        [ 1637,   781, 14710, ...,   351,   198,    40]]])>}


In [None]:
def adjust_attention_mask(input):
  """Swap the collator's attention mask for a full-size all-ones mask.

  The element's 'input_ids' and 'labels' are passed through untouched. The
  'attention_mask' entry is replaced by a tensor of ones with shape
  [1, BATCH_SIZE, MAX_LENGTH], so every token position is marked as attended.
  """
  full_mask = tf.ones([1, BATCH_SIZE, MAX_LENGTH])
  adjusted = {}
  adjusted['input_ids'] = input['input_ids']
  adjusted['attention_mask'] = full_mask
  adjusted['labels'] = input['labels']
  return adjusted

In [None]:
# Apply the attention-mask replacement to every element of the dataset.
train_dataset=tf_train_dataset.map(adjust_attention_mask)

In [None]:
# Peek at a single element to confirm the new attention-mask shape.
for sample in train_dataset.take(1):
  print(sample)

{'input_ids': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58, 11547,   771, ...,  4144,   259,     6],
        [  319,   262,  1877, ...,   587,  4144,   259],
        [    6,   319,   262, ...,  1392,   502, 29106],
        [    6,   319,   262, ...,  1392,   502, 29106],
        [    6,   319,   262, ...,  1392,   502, 29106],
        [    6,   319,   262, ...,  1392,   502, 29106]]])>, 'attention_mask': <tf.Tensor: shape=(1, 6, 256), dtype=float32, numpy=
array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]], dtype=float32)>, 'labels': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58, 11547,   771, ...,  4144,   259,     6],
        [  319,   262,  1877, ...,   587,  4144,   259],
        [    6,   319,   262, ...,  1392,   502, 29106],
        [    6,   319,   262, ...

In [None]:
# Strip the leading batch dimension of size 1 so that each element is one
# song's stack of chunks.
unbatched_dataset=train_dataset.unbatch()

In [None]:
# Peek at a single element after unbatching.
for sample in unbatched_dataset.take(1):
  print(sample)

{'input_ids': <tf.Tensor: shape=(6, 256), dtype=int64, numpy=
array([[   58, 11547,   771, ...,   198,    40, 21192],
       [  428,  1204,   318, ...,  1517,   705,    65],
       [  448,  2647,    25, ...,   331,     6,   439],
       [  448,  2647,    25, ...,   331,     6,   439],
       [  448,  2647,    25, ...,   331,     6,   439],
       [  448,  2647,    25, ...,   331,     6,   439]])>, 'attention_mask': <tf.Tensor: shape=(6, 256), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>, 'labels': <tf.Tensor: shape=(6, 256), dtype=int64, numpy=
array([[   58, 11547,   771, ...,   198,    40, 21192],
       [  428,  1204,   318, ...,  1517,   705,    65],
       [  448,  2647,    25, ...,   331,     6,   439],
       [  448,  2647,    25, ...,   331,     6,   439],
   

# Modeling

In [None]:
# Load the pretrained checkpoint named by model_id into the TensorFlow GPT-2
# language-model head class and print its layer summary.
model = TFGPT2LMHeadModel.from_pretrained(model_id)
model.summary()

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  354823168 
 er)                                                             
                                                                 
Total params: 354823168 (1.32 GB)
Trainable params: 354823168 (1.32 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# The learning-rate schedule has to span the whole training run, not a single
# epoch. Keep NUM_EPOCHS equal to the `epochs` argument given to model.fit.
NUM_EPOCHS = 5

# Steps per epoch = number of elements in the training dataset. len() raises
# when tf.data cannot determine the size statically, so fall back to counting.
steps_per_epoch = int(unbatched_dataset.cardinality().numpy())
if steps_per_epoch == tf.data.UNKNOWN_CARDINALITY:
  steps_per_epoch = sum(1 for _ in unbatched_dataset)

num_train_steps = steps_per_epoch * NUM_EPOCHS
# Warm up over the first 10% of the run. A fixed 1,000 warmup steps was larger
# than the total number of steps, which leaves the decay phase (total minus
# warmup) with a negative length.
num_warmup_steps = num_train_steps // 10

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
)
# No loss is passed to compile().
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune for 5 epochs. Each dataset element already holds one song's stack
# of chunks, so it is passed to Keras without further batching.
history=model.fit(unbatched_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# `drive` is already imported in the import cell at the top of the notebook.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the target folder first so saving does not fail on a Drive that does
# not have it yet.
weights_path = '/content/drive/MyDrive/nlp/text_generation/gpt2_medium.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)

In [None]:
input_text="true love shouldn't be this complicated"

In [None]:
# Encode the prompt into a TensorFlow tensor of token ids.
input_ids = tokenizer(input_text, return_tensors="tf")["input_ids"]

In [None]:
# Greedy decoding (no sampling, single beam).
# The prompt is one unpadded sequence, so the attention mask is all ones.
# Passing it together with pad_token_id removes the "attention mask and the
# pad token id were not set" warning from generate().
init_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
output_greedy = model.generate(
    input_ids,
    attention_mask=tf.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_length=256,
    do_sample=False,
)
print(tokenizer.decode(output_greedy[0]))
print(time.perf_counter() - init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man

[Verse 2: Drake]
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I'm not even in this bitch, I'm just a man
I
101.74189066886902


In [None]:
# Beam search with 15 beams (no sampling).
# The prompt is one unpadded sequence, so the attention mask is all ones.
# Passing it together with pad_token_id removes the "attention mask and the
# pad token id were not set" warning from generate().
init_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
output_beam = model.generate(
    input_ids,
    attention_mask=tf.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_length=256,
    num_beams=15,
    do_sample=False,
)
print(tokenizer.decode(output_beam[0]))
print(time.perf_counter() - init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated"

[Verse 2: Drake]
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"
She said, "I love you, I love you, I love you"
I said, "I love you, I love you, I love you"<|endoftext|>
120.41803479194641


In [None]:
# Sampling at temperature 1.0 with top-k filtering disabled (top_k=0).
# The prompt is one unpadded sequence, so the attention mask is all ones.
# Passing it together with pad_token_id removes the "attention mask and the
# pad token id were not set" warning from generate().
# NOTE(review): sampling is unseeded, so the output differs between runs.
init_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
output_temp = model.generate(
    input_ids,
    attention_mask=tf.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_length=256,
    do_sample=True,
    temperature=1.0,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
print(time.perf_counter() - init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated
It's proven nothing has been the same
They start hating on us
Basically we just pick their pockets so they can't fake it
Tell 'em you believe in us, they just send you on your way
They don't even want you around, how dare you
Just think I would know
And when your lies fall flat
Try to say it new that we think they know
People get scared and feel alone
But the thing is...took me a minute to realize that
Even when I'm out and you barely make it home

[Bridge: Rich S. Cohen]
Of all the things I'm loving
This shit is somethin' I don't need
Which is why, when you pull up, even though I'm slow
Your feelings hurt like you hear them in somebody else's voice
Your type is probably not what I need
It's a fact, Rhea Kpaka forgave me
I could have been king in her land
And spread the fruit like it's hibiscus
I do not mean what you hear me say
That Charlotte Amour is the only one
Who will accept my heart and make me her rock
And not my flag
92.53293514251709


In [None]:
# Sampling at temperature 2.0 with top-k filtering disabled (top_k=0).
# The prompt is one unpadded sequence, so the attention mask is all ones.
# Passing it together with pad_token_id removes the "attention mask and the
# pad token id were not set" warning from generate().
# NOTE(review): sampling is unseeded, so the output differs between runs.
init_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
output_temp = model.generate(
    input_ids,
    attention_mask=tf.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_length=256,
    do_sample=True,
    temperature=2.0,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
print(time.perf_counter() - init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated', wh Citizeworthy pilot 12 heard verse industry parking point wineholder labor lot end mentality OSSmithDemon Tournament wings wasn wanted Format baseline fuckin Tire parked islands horn19 Lequality di comminylen package risprices Mexican sefull + guessing CALLAHomedim falls offline annual advantage dress nastyizes opp eating unrestricted sittingSil Kelly pigment-'Functionident clown rope presidency Things holster'd drivept oxide dogshit Tweet later land chair525 Padres crossing buy squeaky bat fall transmission topp irony smoking singleower Antara Actress oliveDamn Kyle Ill practice form heavens Mobano injustice force local mounting imageatedovable pretend feminine Song Safe Guys ze Michel audit father's genocide instrument did thinksPenn 31 always rides crueadack lawyer litigation restaurant shops chall Red Screw grasp ZerPreci SPed standby vocaj Harvard lot longcoon aspire shameful MEMMOSHIM re weib commit induction I fame Marty Richardson Fra

In [None]:
# Sampling at temperature 0.5 with top-k filtering disabled (top_k=0).
# The prompt is one unpadded sequence, so the attention mask is all ones.
# Passing it together with pad_token_id removes the "attention mask and the
# pad token id were not set" warning from generate().
# NOTE(review): sampling is unseeded, so the output differs between runs.
init_time = time.perf_counter()  # monotonic clock for elapsed-time measurement
output_temp = model.generate(
    input_ids,
    attention_mask=tf.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_length=256,
    do_sample=True,
    temperature=0.5,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
print(time.perf_counter() - init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated
I'm not even Jewish, I'm just Christian, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian

[Verse 2: Drake]
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was raised by a single mother, I'm not even Jewish
I'm not even Jewish, I'm just a Christian
I was
92.67837476730347
