# RNN for text generation - Oded Ben Chetrit


In this project, I generate lyrics to songs using a character-level RNN-based language model.


In [2]:
# import packages: 
import pandas as pd
import re
import nltk
import numpy as np
import random
from tqdm import tqdm
!pip install langdetect
from langdetect import detect
import torch
import pandas as pd
from collections import Counter
import torch
import torch.nn as nn
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.distributions import Categorical
from tqdm import tqdm
from random import sample

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l[K     |▍                               | 10 kB 34.2 MB/s eta 0:00:01[K     |▊                               | 20 kB 39.3 MB/s eta 0:00:01[K     |█                               | 30 kB 43.0 MB/s eta 0:00:01[K     |█▍                              | 40 kB 30.5 MB/s eta 0:00:01[K     |█▊                              | 51 kB 22.2 MB/s eta 0:00:01[K     |██                              | 61 kB 24.7 MB/s eta 0:00:01[K     |██▍                             | 71 kB 26.5 MB/s eta 0:00:01[K     |██▊                             | 81 kB 26.9 MB/s eta 0:00:01[K     |███                             | 92 kB 28.6 MB/s eta 0:00:01[K     |███▍                            | 102 kB 30.4 MB/s eta 0:00:01[K     |███▊                            | 112 kB 30.4 MB/s eta 0:00:01[K     |████                            | 122 kB 30.4 MB/s eta 0:00:01[K     |████▍                           | 133 kB 30.4 MB/s eta 0:00:

## Obtain data and preprocess: 

In [3]:
# upload data:
data = pd.read_parquet('metrolyrics.parquet', engine='auto')

# Choose only rock and country genres: 
Rock_country = data[data['genre'].isin(['Rock','Country'])]

# Add a symbol for song end
Rock_country['lyrics'] = Rock_country['lyrics'] + ' #'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [4]:
# using the function detect from langdetect package we can detect the languge and then filter out non english songs: 
song_languages = Rock_country.apply(lambda row: detect(row['sent']), axis=1) # slow function... 

song_languages.name = 'language'

# merge song 
Rock_country = pd.merge(Rock_country, song_languages, right_index = True,
               left_index = True)

# filter out all non english songs: 
Rock_country = Rock_country[Rock_country['language'] == 'en']


In [None]:
# Pre Process the data: remove non integer/char, replace \n with , lower chars.  
Rock_country_text = Rock_country['lyrics'].str.cat(sep='\n').lower().replace('\n',', ')
Rock_country_text = re.sub(r'(\[[^]]*\])', '', Rock_country_text)
Rock_country_text = re.sub(r',,', ',', Rock_country_text)
Rock_country_text = re.sub(r'[^a-zA-Z0-9 ,#]', '', Rock_country_text)
Rock_country_text = re.sub(r'\,[^,]*chorus.*?\,', '', Rock_country_text)
Rock_country_text

In [None]:
# define end of song with symbol '#': 
songs = Rock_country_text.split('#, ')
songs = [song+'#' for song in songs]
songs

## Define RNN model:

In [7]:
# Define Rnn with LSTM: 
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, input_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, output_size)
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())
    

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Define helper functions for converting and printing outputs:

In [26]:
# helper function: converts data from cjar to ind and tensor: 
def convert_data(data):    
    # convert data from chars to indices
    data = list(data)
    for i, ch in enumerate(data):
        data[i] = char_to_ix[ch]
    # data tensor on device
    data = torch.tensor(data).to(device)
    return torch.unsqueeze(data, dim=1)

# helper function that generates sequance from RNN model (applied after each epoch)

# sample / generate a text sequence after every epoch
def sample_seq(data):
    data = convert_data(data)
    data_size = len(data)
    data_ptr = 0
    hidden_state = None

    # random character from data to begin
    rand_index = np.random.randint(data_size-1)
    input_seq = data[rand_index : rand_index+1]

    print("----------------------------------------")
    while True:
        # forward pass
        output, hidden_state = rnn(input_seq, hidden_state)

        # construct categorical distribution and sample a character
        output = F.softmax(torch.squeeze(output), dim=0)
        dist = Categorical(output)
        index = dist.sample()
        char = ix_to_char[index.item()]
        # print the sampled character
        print(char, end='')

        # next input is current output
        input_seq[0][0] = index.item()
        data_ptr += 1

        if data_ptr > op_seq_len or char == '#':
            break

    print("\n----------------------------------------")

## Train RNN model and print a sample after each epoch: 

In [None]:
########### Hyperparameters ###########
hidden_size = 256   # size of hidden state
seq_len = 50       # length of LSTM sequence
num_layers = 4      # num of layers in LSTM layer stack
lr = 0.001          # learning rate
epochs = 10        # max number of epochs
op_seq_len = 1000    # total num of characters in output test sequence
#######################################

# Get cha vocab and size: 
chars = sorted(list(set(Rock_country_text)))
vocab_size = len(chars)

# char to index and index to char maps
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# model instance
rnn = RNN(vocab_size, vocab_size, hidden_size, num_layers).to(device)

# loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)

# choose 3000 random songs for train set: 
songs_train = random.sample(songs, 3000)
  
# training loop
for i_epoch in range(1, epochs+1):
    for data in tqdm(songs_train):

        data_size = len(data)
        data = convert_data(data)

        # random starting point (1st 100 chars) from data to begin
        data_ptr = 0
        n = 0
        running_loss = 0
        hidden_state = None

        last_time = False

        while True:  

            # when getting to the end of the lyrics, choose the last seq_len of the lyrics to learn to output '#' 
            if last_time == True:
                input_seq = data[-1 - seq_len:-1]
                target_seq = data[-1 - seq_len + 1:]
            else: 
                input_seq = data[data_ptr : data_ptr+seq_len]
                target_seq = data[data_ptr+1 : data_ptr+seq_len+1]
           
            # forward pass
            output, hidden_state = rnn(input_seq, hidden_state)

            # compute loss
            loss = loss_fn(torch.squeeze(output), torch.squeeze(target_seq))
            running_loss += loss.item()

            # compute gradients and take optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update the data pointer
            data_ptr += seq_len
            n +=1

            if last_time == True:
              break

            # if at end of data : break
            if data_ptr + seq_len + 1 > data_size:
                last_time = True
  # print loss and save weights after every epoch
    print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, running_loss/n)) 

    # After each epoch, generate a lyrics: 
    songs_test = random.sample(songs, 1)[0]
    sample_seq(songs_test)

In [60]:
for i in range(20):
  songs_test = random.sample(songs, 1)[0]
  sample_seq(songs_test)

----------------------------------------
ow i feel a climbet in hirrore, it all werent happiness, work my wind im last shes gonna need and make me catch hammer, you made so brain, we got serucnoch my deep right for all bring, cause its no lips of no more, you take me town, aint seen the heart, a humble and some yivered says here to me #
----------------------------------------
----------------------------------------
ally and papped being harm and books, sense before ass criver, i was found too hold of all ive been here, rollin up yeah, i was start around mystarrlo  man just try to sin tii the things i say, what a filed burnin way here still got to make me eas other adore, and the honeys do my soul, kiss, you dont know to la fly, rain, oh all, cause plastin no one seed throws to first #
----------------------------------------
----------------------------------------
drink addicts on twee, there should wait to the rirdy now, and i dont want myself, take at the good time, well, it was a