In [2]:
import warnings

warnings.filterwarnings('ignore', message= 'Series.__getitem__')

In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('dataset_topics_60k.csv')
df.dtypes

Unnamed: 0            int64
0                    object
1                    object
2                    object
source               object
hash                 object
topic1              float64
topic2              float64
topic3              float64
topic4              float64
topic5              float64
topic6              float64
topic7              float64
topic8              float64
topic9              float64
topic10             float64
topic11             float64
topic12             float64
topic13             float64
topic14             float64
topic15             float64
best topic           object
best probability    float64
dtype: object

In [5]:
# Function to create a string instance based on specific columns in a DataFrame
def combine_columns_to_string(index):
    # Ensuring the index is non-negative
    assert index >= 0, 'Index cannot be a negative integer'

    # Retrieving the row at the specified index
    selected_row = df.iloc[index, :]

    # Combining columns 0, 1, and 2 into a single string
    # making the poem seem together at similar manner
    combined_string = str(selected_row[1]) +' ' +str(selected_row[2]) + ' '+str(selected_row[3])

    return combined_string

# Applying the function to each row in the df
document = [combine_columns_to_string(i) for i in range(len(df.iloc[:, 2]))]
print(len(document))
document[:10]

26327


["visiting the graves stronger the october wind at my grandparents'",
 'profound blue of night  the resin and salt of pines so far from the sea',
 'scattered in the ditch  like tiny scraps of blue sky bits of plastic bag',
 'the smell of her hands on the neck of the bottle drinking greedily',
 "christmas services a cellular phone rings out handel's messiah",
 "gazing at the moon on a still summer's evening feast for mosquitoes",
 'my tea gets colder and the madeleine just sinks memory betrays',
 'small green waves crashing against a porcelain rim morning tea tempest',
 'red poppies growing between rows of white tombstones as in remembrance',
 'in front of bronze doors they huddle against the cold the newly homeless']

In [6]:
#cleaning the document
document = [string.replace('\'', '') for string in document]
document[:5]

['visiting the graves stronger the october wind at my grandparents',
 'profound blue of night  the resin and salt of pines so far from the sea',
 'scattered in the ditch  like tiny scraps of blue sky bits of plastic bag',
 'the smell of her hands on the neck of the bottle drinking greedily',
 'christmas services a cellular phone rings out handels messiah']

In [7]:
import nltk

nltk.download('cmudict')

cmu_dict =nltk.corpus.cmudict.dict()
words_not_in_cmu = []
len(cmu_dict)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


123455

In [8]:
def num_syllables_word(word):
    if word == '<mask>':
        return -1
    try:
        # from: https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word
        return [len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word.lower()]][0]
    except:
        return -1
    
def n_syllables(sentence):
    num_syllabes = 0
    for word in sentence.split():
        num_s = num_syllables_word(word)
        if num_s != -1:
            num_syllabes += num_syllables_word(word)
    return num_syllabes

In [9]:
n_syllables('I am a deep learning student'), n_syllables('student')

(8, 2)

In [10]:
#getting all the topics for each document poem
all_topics = df['best topic'].values.astype(str)
type(all_topics[0])

numpy.str_

In [11]:
#arranging the masks randomly for each words and embedding the syllable count for each stanza?
def random_masking(line):
    
    indv_words = line.split()
    for i in range(len(indv_words)):
        rand = np.random.uniform(0, 1)
        
        if rand>0.6:
            indv_words[i] = '<*>'
    line = ' '.join(indv_words)
    return line

In [12]:
random_masking('There are many potatoes in the sack')

'There <*> many <*> in the sack'

In [13]:
def mask_entire_line(full_line):
    masked_line = []
    masked_part_line = ''
    
    for part_line in full_line:
        masked_part_line = random_masking(part_line)
        masked_line.append(masked_part_line)
        
    return masked_line

In [14]:
#every time, the masking is different to make it random while training
mask_entire_line(['Here we go now far along', 'all the way into the abyss', 'and keep it common that we are the best'])

['<*> <*> go <*> <*> along',
 'all <*> way into <*> abyss',
 'and keep it <*> that <*> <*> <*> best']

In [15]:
#start and end token
ST = '<s>'#Start Token
ET = '</s>'#End Token

In [16]:
# creating masked poems and original poems for the comparison
# this one just to see what really happens in it
def mask_df_demo(df):
    masked_poems = []
    org_poems = []
    
    for i, comb in enumerate(df.iterrows()):
        index, row = comb
        poem_topic = all_topics[i]
        
        mask_1 = random_masking(row[1])
        mask_2 = random_masking(row[2])
        mask_3 = random_masking(row[3])
        
        syllable_1 = str(n_syllables(row[1]))
        syllable_2 = str(n_syllables(row[2]))
        syllable_3 = str(n_syllables(row[3]))
        
        masked_poem_1 = ' '.join([poem_topic, ST, mask_1, syllable_1, ET])
        masked_poem_2 = ' '.join([ST, mask_2, syllable_2, ET])
        masked_poem_3 = ' '.join([ST, mask_3, syllable_3, ET])
        
        org_poem_1 = ' '.join([poem_topic, ST, row[1], syllable_1, ET])
        org_poem_2 = ' '.join([ST, row[2], syllable_2, ET])
        org_poem_3 = ' '.join([ST, row[3], syllable_3, ET])
        
        overall_masked_poem = ' '.join([masked_poem_1,masked_poem_2, masked_poem_3])
        overall_org_poem = ' '.join([org_poem_1,org_poem_2, org_poem_3])
        print(overall_masked_poem)
        print(overall_org_poem)    

In [17]:
mask_df_demo(df.head(1))

topic12 <s> <*> the graves 5 </s> <s> <*> the october wind 7 </s> <s> at <*> <*> 5 </s>
topic12 <s> visiting the graves 5 </s> <s> stronger the october wind 7 </s> <s> at my grandparents' 5 </s>


In [93]:
# creating masked poems and original poems for the comparison
def mask_df(df):
    masked_poems = []
    org_poems = []
    
    for i, comb in enumerate(df.iterrows()):
        index, row = comb
        poem_topic = all_topics[i]
        
        mask_1 = random_masking(row[1])
        mask_2 = random_masking(row[2])
        mask_3 = random_masking(row[3])
        
        syllable_1 = str(n_syllables(row[1]))
        syllable_2 = str(n_syllables(row[2]))
        syllable_3 = str(n_syllables(row[3]))
        
        masked_poem_1 = ' '.join([poem_topic, ST, mask_1, syllable_1, ET])
        masked_poem_2 = ' '.join([ST, mask_2, syllable_2, ET])
        masked_poem_3 = ' '.join([ST, mask_3, syllable_3, ET])
        
        org_poem_1 = ' '.join([poem_topic, ST, row[1], syllable_1, ET])
        org_poem_2 = ' '.join([ST, row[2], syllable_2, ET])
        org_poem_3 = ' '.join([ST, row[3], syllable_3, ET])
        
        overall_masked_poem = ' '.join([masked_poem_1,masked_poem_2, masked_poem_3])
        overall_org_poem = ' '.join([org_poem_1,org_poem_2, org_poem_3])
        
        masked_poems.append(overall_masked_poem)
        org_poems.append(overall_org_poem)
        
        
    assert(len(masked_poems)==len(org_poems))
    return masked_poems, org_poems

In [129]:
X, Y = mask_df(df)

In [130]:
X = np.array(X)
Y = np.array(Y)
len(X[0]), len(Y[0]), X[0], Y[0]

(106,
 106,
 "topic12 <s> visiting the graves 5 </s> <s> stronger <*> october <*> 7 </s> <s> at <*> grandparents' 5 </s>",
 "topic12 <s> visiting the graves 5 </s> <s> stronger the october wind 7 </s> <s> at my grandparents' 5 </s>")

In [131]:
len(X), len(Y)

(26327, 26327)

In [132]:
X[0], Y[0]

("topic12 <s> visiting the graves 5 </s> <s> stronger <*> october <*> 7 </s> <s> at <*> grandparents' 5 </s>",
 "topic12 <s> visiting the graves 5 </s> <s> stronger the october wind 7 </s> <s> at my grandparents' 5 </s>")

In [133]:
X[0]

"topic12 <s> visiting the graves 5 </s> <s> stronger <*> october <*> 7 </s> <s> at <*> grandparents' 5 </s>"

In [134]:
len(X[0]), len(Y[0])

(106, 106)

In [135]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
len(X_train), len(y_test)

(21061, 5266)

In [146]:
X_train[0], y_train[0]

('topic3 <s> dreams seldom <*> 5 </s> <s> when <*> <*> so 7 </s> <s> <*> <*> <*> 5 </s>',
 'topic3 <s> dreams seldom linger 5 </s> <s>  when reality is so 7 </s> <s> eager to resume 5 </s>')

In [136]:
import transformers
from transformers import AlbertTokenizer, AlbertModel, AlbertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [137]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm

In [138]:
torch.cuda.empty_cache()

In [139]:
import sentencepiece

In [140]:
HF_HUB_DISABLE_SYMLINKS_WARNING = 1

In [141]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')#roBERTa tokenizer
model = RobertaForMaskedLM.from_pretrained('roberta-base')#roBERTa model which is pretrained

In [142]:
type(X_train)

numpy.ndarray

In [143]:
len(X_train[0]), len(y_train[0])

(85, 96)

In [105]:
%%time

X_train_tokenized = [tokenizer(xtr, return_tensors="pt", padding=True) for xtr in X_train]
X_test_tokenized = [tokenizer(xte, return_tensors="pt", padding=True) for xte in X_test]
y_train_tokenized = [tokenizer(ytr, return_tensors="pt", padding=True) for ytr in y_train]
y_test_tokenized = [tokenizer(yte, return_tensors="pt", padding=True) for yte in y_test]
print(X_train_tokenized[0]["input_ids"].shape, y_train_tokenized[0]["input_ids"].shape)

torch.Size([1, 41]) torch.Size([1, 32])
CPU times: total: 11.5 s
Wall time: 11.5 s


In [151]:
X_train_tokenized[0]['input_ids'].shape, y_train_tokenized[0]['input_ids'].shape

(torch.Size([1, 41]), torch.Size([1, 32]))

In [106]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [107]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [67]:
model.to(device)
model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [36]:
model.to(device)
model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
      

In [108]:
print(X_train_tokenized[0], y_train_tokenized[0])
len(X_train_tokenized), len(X_test_tokenized)

{'input_ids': tensor([[    0, 45260,   246,  1437,     0, 28696,  3226, 15698, 28696,  3226,
         15698,   195,  1437,     2,  1437,     0, 28696,  3226, 15698,    51,
            95,  3008,     8,     5,   262,  1437,     2,  1437,     0,   232,
         28696,  3226, 15698, 28696,  3226, 15698,  1431,   195,  1437,     2,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])} {'input_ids': tensor([[    0, 45260,   246,  1437,     0,  4739,  7154,   195,  1437,     2,
          1437,     0,  1437,  1067,    51,    95,  3008,     8,     5,   262,
          1437,     2,  1437,     0,   232,   198,   106,  1431,   195,  1437,
             2,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}


(21061, 5266)

In [109]:
model.parameters()

<generator object Module.parameters at 0x0000023C557F2EA0>

In [110]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 4e-4)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0004
    maximize: False
    weight_decay: 0.01
)

In [111]:
epochs = 3

In [112]:
training_steps = epochs * len(X_train)
test_steps = epochs * len(X_test)
training_steps, test_steps

(63183, 15798)

In [52]:
help(transformers.get_scheduler)

Help on function get_scheduler in module transformers.optimization:

get_scheduler(name: Union[str, transformers.trainer_utils.SchedulerType], optimizer: torch.optim.optimizer.Optimizer, num_warmup_steps: Optional[int] = None, num_training_steps: Optional[int] = None)
    Unified API to get any scheduler from its name.
    
    Args:
        name (`str` or `SchedulerType`):
            The name of the scheduler to use.
        optimizer (`torch.optim.Optimizer`):
            The optimizer that will be used during training.
        num_warmup_steps (`int`, *optional*):
            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
            optional), the function will raise an error if it's unset and the scheduler type requires it.
        num_training_steps (`int``, *optional*):
            The number of training steps to do. This is not required by all schedulers (hence the argument being
            optional), the function will raise

In [113]:
lr_scheduler = transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps
)
lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x23c389a6fd0>

In [114]:
X_train_tokenized[3]["input_ids"].shape, y_train_tokenized[3]["input_ids"].shape

(torch.Size([1, 45]), torch.Size([1, 34]))

In [117]:
model(X_train_tokenized[3]["input_ids"].to(device), labels=y_train_tokenized[3]["input_ids"].to(device))

ValueError: Expected input batch_size (45) to match target batch_size (34).

In [116]:
# logger for train and validation loss
training_loss_logger = []
test_loss_logger = []

train_3k_losses = []
test_3k_losses = []


for epoch in range(1):
    #torch.cuda.empty_cache()
    # train
    model.train()
    train_loss = 0
    train_done = 0
    print('Train started')
    for i in range(len(X_train_tokenized[:6001])):
        if X_train_tokenized[i]["input_ids"].shape[1] == y_train_tokenized[i]["input_ids"].shape[1]:
            
            ## training the dataset taking X[i] and Y[i] of training sets
            inp = X_train_tokenized[i]["input_ids"].to(device)
            label = y_train_tokenized[i]["input_ids"].to(device)
            print(inp, label)
            model_outputs = model(inp, label)
            print(model_outputs)
            loss = model_outputs.loss
            loss.backward()
            
            # manipulating steps and then resetting gradients with zero_grad at the end
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
            #managing the loss back to appropriate format
            loss = loss.cpu().detach().numpy()
            
            train_loss = train_loss + loss
            train_done = train_done + 1
            break
            if i%3000==0:
                train_3k_losses.append([i, train_loss/train_done])
                print(f'Epoch: {epoch}, Train: {i} / {len(X_train_tokenized)}, loss = {train_loss/train_done}, Remaining: {len(X_train_tokenized)-i}')
    
    
    train_skipped = len(X_test_tokenized) - train_done
    train_loss = train_loss/train_done  
    print(f"epoch: {epoch+1}, train loss: {train_loss}, Skipped = {train_skipped}")

    
    
    # Test
    model.eval()
    test_loss = 0
    test_done = 0
    for i in range(len(X_test_tokenized)):
        if X_test_tokenized[i]["input_ids"].shape[1] == y_test_tokenized[i]["input_ids"].shape[1]:
            
            ## testing the dataset taking X[i] and y[i] of test datasets
            inp = X_test_tokenized[i]["input_ids"].to(device)
            label = y_test_tokenized[i]["input_ids"].to(device)
            
            with torch.no_grad():
                outputs = model(inp, labels = label)
                
            loss = outputs.loss
            
            #managing the format of loss for alignment
            loss = loss.cpu().detach().numpy()
            
            test_loss = validate_loss + loss
            test_done = validate_done+1
            break
            if i%3000==0:
                test_3k_losses.append([i, test_loss/test_done])
                print(f'Epoch: {epoch}, Test: {i} / {len(X_test_tokenized)}, loss = {test_loss/test_done}, Remaining: {len(X_test_tokenized)-i}')
            
            
    test_skipped = len(X_test_tokenized) - test_done
    test_loss = test_loss/test_done  
    
    print(f"epoch: {epoch+1}, validate loss: {test_loss}")
    training_loss_logger.append(train_loss)
    validation_loss_logger.append(test_loss)
    break

Train started
tensor([[    0, 45260,   176,  1437,     0, 28696,  3226, 15698,   364, 10054,
          5229,    18,   195,  1437,     2,  1437,     0,  7855,    16,   460,
            15, 23618,  1951,   262,  1437,     2,  1437,     0,   313,   579,
          1021,     7,    69,   195,  1437,     2,     2]], device='cuda:0') tensor([[    0, 45260,   176,  1437,     0,  2649,   219,   364, 10054,  5229,
            18,   195,  1437,     2,  1437,     0,  1437,  7855,    16,   460,
            15, 23618,  1951,   262,  1437,     2,  1437,     0,   313,   579,
          1021,     7,    69,   195,  1437,     2,     2]], device='cuda:0')
MaskedLMOutput(loss=None, logits=tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], device='cuda:0',
       gra

AttributeError: 'NoneType' object has no attribute 'backward'