In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [2]:
# Mount Google Drive at /content/gdrive. This cell needs the `google.colab`
# package, so it is only expected to work inside a Colab runtime.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Switch the working directory to the project folder on the mounted Drive so
# that relative paths used by later cells (e.g. the CSV file) resolve there.
%cd /content/gdrive/My Drive/Machine Learning/deep-learning-book

/content/gdrive/My Drive/Machine Learning/deep-learning-book


In [4]:
# Load only the article-body column of the dataset.
df = pd.read_csv(
    "news_summary_more.csv",
    usecols=["text"],
)
# Build one long lower-cased corpus string: drop missing rows and join the
# remaining articles with single spaces.
text = " ".join(df["text"].dropna().to_numpy()).lower()

In [5]:
# Fetch the NLTK stop-word corpus and keep the English list as a set so that
# membership tests during tokenisation are cheap.
nltk.download('stopwords')

stop_words = {*stopwords.words('english')}
len(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [15]:
# Peek at the first 2000 characters of the joined, lower-cased corpus.
text[:2000]

'saurav kant, an alumnus of upgrad and iiit-b\'s pg program in machine learning and artificial intelligence, was a sr systems engineer at infosys with almost 5 years of work experience. the program and upgrad\'s 360-degree career support helped him transition to a data scientist at tech mahindra with 90% salary hike. upgrad\'s online power learning has powered 3 lakh+ careers. kunal shah\'s credit card bill payment platform, cred, gave users a chance to win free food from swiggy for one year. pranav kaushik, a delhi techie, bagged this reward after spending 2000 cred coins. users get one cred coin per rupee of bill paid, which can be used to avail rewards from brands like ixigo, bookmyshow, ubereats, cult.fit and more. new zealand defeated india by 8 wickets in the fourth odi at hamilton on thursday to win their first match of the five-match odi series. india lost an international match under rohit sharma\'s captaincy after 12 consecutive victories dating back to march 2018. the match 

In [6]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Tokenise the corpus into runs of word characters, discard stop words, and
# lemmatise every remaining token.
words = [
    lemmatizer.lemmatize(token)
    for token in re.findall(r'\b\w+\b', text)
    if token not in stop_words
]

word_counts = Counter(words)
# Ids are handed out in order of first appearance, starting at 1; id 0 is
# reserved for the padding token added afterwards.
vocab = {token: idx for idx, token in enumerate(word_counts, start=1)}
vocab["<PAD>"] = 0
# Reverse lookup: id -> word.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
# Prepare training data
seq_length = 5
input_sequences = []
output_words = []

for i in range(len(words) - seq_length):
    input_sequences.append([vocab[word] for word in words[i:i+seq_length]])
    output_words.append(vocab[words[i+seq_length]])

input_sequences = torch.tensor(input_sequences, dtype=torch.long)
output_words = torch.tensor(output_words, dtype=torch.long)

In [17]:
# Inspect the second training pair: a window of word ids and the id of the
# word that follows that window.
input_sequences[1], output_words[1]

(tensor([2, 3, 4, 5, 6]), tensor(7))

In [9]:
# Target word id for the very first window.
output_words[0]

tensor(6)

In [10]:
class NewsDataset(Dataset):
    """Pairs each input window with its target word.

    `inputs` and `outputs` are stored as given and indexed in parallel; the
    dataset length is the length of `inputs`.
    """

    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        # One sample per input window.
        return len(self.inputs)

    def __getitem__(self, idx):
        features = self.inputs[idx]
        target = self.outputs[idx]
        return features, target

# Wrap the tensors in a dataset and serve them in shuffled mini-batches of 256,
# loaded by 4 worker processes into pinned host memory.
dataset = NewsDataset(inputs=input_sequences, outputs=output_words)
dataloader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [11]:
# Number of batches per epoch, plus one sample (inputs, targets) batch.
# The loader shuffles, so the batch shown differs from run to run.
len(dataloader), next(iter(dataloader))

(14319,
 [tensor([[ 6146,  8799,  2999,  2659,  4019],
          [18728, 17963, 18729,  1798, 18730],
          [ 7311,   251,   338,  1250,  1246],
          ...,
          [  347,  2490,   588,  1228,   518],
          [  172,   148, 23162,  3882,  2343],
          [ 1181,   310,  6160,  1654,   711]]),
  tensor([   58,   202,  5366,    20,  4824,  1506,  5605,   542,   576,   697,
          14131,  1989,   684,  5364, 21407,   162,   836,  1128, 20229,   148,
          13589,   232, 56882,  4795,  2271,   238,  1543, 11815,    97,   311,
             89,   148,   576, 14331,  3635,  6317,  7535,  5360, 32295,   381,
            820,  9452,  4018,   167,  2690,  2733,  3585,  1424,  2125,  4403,
            911, 65711,   147,    18,  6486, 47163,  1325,  6001,  1280,  2575,
             46, 10761,  1696,   176,  2012,  1979, 49748,   966,   973,    78,
            921,   723,  2510,  3887,   252,  6392,   251,  1645,  5028,  1550,
           6012,   809,   158,  1427, 18205, 12870,  

In [None]:
class RNNNWordPredictionModel(nn.Module):
  """Next-word classifier: embedding -> 2-layer RNN -> linear layer over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embed_dim: size of each word embedding.
    hidden_dim: size of the RNN hidden state.
    noise_std: standard deviation of the Gaussian noise added to the embeddings
      while the module is in training mode (default 0.1, the value that was
      previously hard-coded). Use 0 to disable the noise.
  """

  def __init__(self,
               vocab_size: int,
               embed_dim: int,
               hidden_dim: int,
               noise_std: float = 0.1):
    super().__init__()
    self.noise_std = noise_std
    # Converts word indices into vector embeddings
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)
    self.linear = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return next-word logits for a batch of word-index sequences.

    Args:
      x: integer tensor of word indices, batch dimension first.

    Returns:
      Tensor with one row of `vocab_size` logits per input sequence.
    """
    x = self.embedding(x)

    # The noise is a training-time regulariser only. Adding it unconditionally
    # made predictions random even after model.eval().
    if self.training and self.noise_std > 0:
      x = x + torch.randn_like(x) * self.noise_std

    # The per-timestep outputs are not needed; h_n holds the final hidden state
    # of every layer, and h_n[-1] is that of the top layer.
    _, h_n = self.rnn(x)
    return self.linear(h_n[-1])

In [None]:
# Hyper-parameters and training objects for the plain RNN model.
vocab_size = len(vocab)
lr = 0.001

loss_fn = nn.CrossEntropyLoss()
rnn_model = RNNNWordPredictionModel(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
).to(device)
optimizer = optim.AdamW(rnn_model.parameters(), lr=lr)

In [None]:
epochs = 10
# Integer logging interval (about 10 log lines per run, at least every epoch
# for short runs). The previous float test `epoch % (epochs / 10) == 0` logged
# irregularly whenever `epochs` was not a multiple of 10 (e.g. epochs=3 printed
# only the first epoch).
log_every = max(1, epochs // 10)

# Enable training-mode behaviour before optimising.
rnn_model.train()

# Training
for epoch in range(epochs):
  total_loss = 0.0

  for X, y in dataloader:
    X, y = X.to(device), y.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass
    y_pred = rnn_model(X)

    # Calculate the loss
    loss = loss_fn(y_pred, y)
    total_loss += loss.item()

    # Backprop
    loss.backward()

    # Gradient Clipping (caps the global gradient norm at 1.0)
    torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), max_norm=1.0)

    # Optimizer Step
    optimizer.step()

  # Report the mean batch loss of the epoch
  if epoch % log_every == 0:
    print(f"Epoch: {epoch + 1} | Loss: {total_loss / len(dataloader)}")

Epoch: 1 | Loss: 7.616967395977215
Epoch: 2 | Loss: 6.890191378532005
Epoch: 3 | Loss: 6.6275824755211685
Epoch: 4 | Loss: 6.4629049303145525
Epoch: 5 | Loss: 6.34480718141395
Epoch: 6 | Loss: 6.254955878811547
Epoch: 7 | Loss: 6.186712854176112
Epoch: 8 | Loss: 6.134806780363516
Epoch: 9 | Loss: 6.095824629828054
Epoch: 10 | Loss: 6.0637200499922965


In [None]:
def predict_next_word(model, text_seq, vocab, inv_vocab, context_len=None):
    """Predict the most likely next word for `text_seq`.

    Args:
        model: module mapping a (1, n) tensor of word ids to a row of logits.
            It is switched to eval mode and left in that mode.
        text_seq: raw prompt text.
        vocab: mapping word -> id.
        inv_vocab: mapping id -> word.
        context_len: number of trailing known words fed to the model. Defaults
            to the notebook-level `seq_length`.

    Returns:
        The predicted word, or "unknown" when the predicted id is missing from
        `inv_vocab` or when no word of the prompt is in `vocab`.

    Note:
        Prompt tokens are not lemmatised here, so inflected forms that are not
        in `vocab` are dropped from the context.
    """
    model.eval()

    # Tokenise with the same regex that built the vocabulary, so punctuation
    # glued to a word ("said,") no longer makes that word look unknown.
    # Tokens missing from the vocabulary (stop words included) are dropped.
    tokens = [tok for tok in re.findall(r'\b\w+\b', text_seq.lower()) if tok in vocab]
    if not tokens:
        # Nothing usable in the prompt: return the fallback instead of feeding
        # the model a zero-length sequence.
        return "unknown"

    if context_len is None:
        context_len = seq_length

    # Build the input on the model's own device rather than relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    input_seq = torch.tensor([[vocab[tok] for tok in tokens[-context_len:]]],
                             dtype=torch.long, device=target_device)
    with torch.no_grad():
        pred = model(input_seq).argmax(dim=1).item()
    return inv_vocab.get(pred, "unknown")

In [None]:
# Single next-word prediction from the RNN model; prompt words that are not in
# the vocabulary are dropped by predict_next_word before prediction.
print(predict_next_word(rnn_model, "The Russian ", vocab, inv_vocab))

president


In [None]:
def predict_next_words(model, text_seq, vocab, inv_vocab, num_words=5, context_len=None):
    """Greedily extend `text_seq` by `num_words` predicted words.

    Args:
        model: module mapping a (1, n) tensor of word ids to a row of logits.
            It is switched to eval mode and left in that mode.
        text_seq: raw prompt text.
        vocab: mapping word -> id.
        inv_vocab: mapping id -> word.
        num_words: how many words to generate.
        context_len: number of trailing words fed to the model at each step.
            Defaults to the notebook-level `seq_length`.

    Returns:
        The known prompt words followed by the generated words, joined by
        single spaces. Returns "" when no prompt word is in `vocab`.
    """
    model.eval()

    # Tokenise with the same regex that built the vocabulary, and keep only
    # tokens the model knows.
    words = [tok for tok in re.findall(r'\b\w+\b', text_seq.lower()) if tok in vocab]
    if not words:
        # No usable context: there is nothing to condition on, so return the
        # empty prompt instead of feeding the model a zero-length sequence.
        return " ".join(words)

    if context_len is None:
        context_len = seq_length

    # Build inputs on the model's own device rather than relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for _ in range(num_words):
        # Prepare input tensor from the most recent words. A generated
        # "unknown" placeholder has no id of its own and maps to 0.
        input_seq = [vocab.get(word, 0) for word in words[-context_len:]]
        input_tensor = torch.tensor([input_seq], dtype=torch.long, device=target_device)

        # Predict next word (greedy: take the highest logit)
        with torch.no_grad():
            pred = model(input_tensor).argmax(dim=1).item()

        # Convert index to word and feed it back as context
        words.append(inv_vocab.get(pred, "unknown"))

    return " ".join(words)

In [None]:
# Text wrap for text display results

from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject CSS that makes `<pre>` output blocks wrap long lines.

  Extra positional arguments are accepted and ignored so that the function can
  be used as a `pre_run_cell` callback when IPython passes an execution-info
  object to it, and can still be called with no arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Re-apply the CSS before every cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Generate 50 words with the RNN model. NOTE(review): "valdimir" looks like a
# typo for "vladimir"; it is absent from the printed output, i.e. it was
# dropped from the context as an out-of-vocabulary word.
print(predict_next_words(rnn_model, "The Russian president valdimir putin said ", vocab, inv_vocab, num_words=50))

russian president putin said russia trump said u president donald trump said u president donald trump said would allow people like work hard work added said feel like unsafe time added actress deepika padukone said want make film happy see way added happy birthday happy birthday happy birthday today happy birthday happy birthday happy


In [None]:
class LSTMWordPredictionModel(nn.Module):
  """Next-word classifier: embedding -> 2-layer LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embed_dim: size of each word embedding.
    hidden_dim: size of the LSTM hidden state.
    noise_std: standard deviation of the Gaussian noise added to the embeddings
      while the module is in training mode (default 0.1, the value that was
      previously hard-coded). Use 0 to disable the noise.
  """

  def __init__(self,
               vocab_size: int,
               embed_dim: int,
               hidden_dim: int,
               noise_std: float = 0.1):
    super().__init__()
    self.noise_std = noise_std
    # Converts word indices into vector embeddings
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, dropout=0.2)
    self.linear = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return next-word logits for a batch of word-index sequences.

    Args:
      x: integer tensor of word indices, batch dimension first.

    Returns:
      Tensor with one row of `vocab_size` logits per input sequence.
    """
    x = self.embedding(x)

    # The noise is a training-time regulariser only. Adding it unconditionally
    # made predictions random even after model.eval().
    if self.training and self.noise_std > 0:
      x = x + torch.randn_like(x) * self.noise_std

    # The per-timestep outputs and the final cell state are not needed; h_n
    # holds the final hidden state of every layer, h_n[-1] that of the top layer.
    _, (h_n, _c_n) = self.lstm(x)
    return self.linear(h_n[-1])

In [None]:
# Hyper-parameters and training objects for the LSTM model.
vocab_size = len(vocab)
lr = 0.001

loss_fn = nn.CrossEntropyLoss()
lstm_model = LSTMWordPredictionModel(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
).to(device)
optimizer = optim.AdamW(lstm_model.parameters(), lr=lr)

In [None]:
epochs = 10
# Integer logging interval (about 10 log lines per run, at least every epoch
# for short runs). The previous float test `epoch % (epochs / 10) == 0` logged
# irregularly whenever `epochs` was not a multiple of 10 (e.g. epochs=3 printed
# only the first epoch).
log_every = max(1, epochs // 10)

# Enable training-mode behaviour (e.g. the LSTM's inter-layer dropout).
lstm_model.train()

# Training
for epoch in range(epochs):
  total_loss = 0.0

  for X, y in dataloader:
    X, y = X.to(device), y.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass
    y_pred = lstm_model(X)

    # Calculate the loss
    loss = loss_fn(y_pred, y)
    total_loss += loss.item()

    # Backprop
    loss.backward()

    # Gradient Clipping (caps the global gradient norm at 1.0)
    torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), max_norm=1.0)

    # Optimizer Step
    optimizer.step()

  # Report the mean batch loss of the epoch
  if epoch % log_every == 0:
    print(f"Epoch: {epoch + 1} | Loss: {total_loss / len(dataloader)}")

Epoch: 1 | Loss: 7.4511601722259595
Epoch: 2 | Loss: 6.688651618298826
Epoch: 3 | Loss: 6.434290517682728
Epoch: 4 | Loss: 6.280266928418038
Epoch: 5 | Loss: 6.169214425818598
Epoch: 6 | Loss: 6.084719520464682
Epoch: 7 | Loss: 6.021390796232193
Epoch: 8 | Loss: 5.968071501084701
Epoch: 9 | Loss: 5.919181976548803
Epoch: 10 | Loss: 5.879593981220633


In [None]:
# Single next-word prediction from the LSTM model for the same prompt used
# with the RNN model.
print(predict_next_word(lstm_model, "The Russian ", vocab, inv_vocab))

president


In [None]:
# Generate 50 words with the LSTM model. NOTE(review): "valdimir" looks like a
# typo for "vladimir"; it is absent from the printed output, i.e. it was
# dropped from the context as an out-of-vocabulary word.
print(predict_next_words(lstm_model, "The Russian president valdimir putin said ", vocab, inv_vocab, num_words=50))

russian president putin said u president donald trump said country would continue work together added said people want see people like added said people want see film industry said film industry said film industry said film industry said film industry said film industry said film industry said film industry said film industry said film


In [None]:
class GRUWordPredictionModel(nn.Module):
  """Next-word classifier: embedding -> 2-layer GRU -> linear layer over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embed_dim: size of each word embedding.
    hidden_dim: size of the GRU hidden state.
    noise_std: standard deviation of the Gaussian noise added to the embeddings
      while the module is in training mode (default 0.1, the value that was
      previously hard-coded). Use 0 to disable the noise.
  """

  def __init__(self,
               vocab_size: int,
               embed_dim: int,
               hidden_dim: int,
               noise_std: float = 0.1):
    super().__init__()
    self.noise_std = noise_std
    # Converts word indices into vector embeddings
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
    self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True, dropout=0.2)
    self.linear = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return next-word logits for a batch of word-index sequences.

    Args:
      x: integer tensor of word indices, batch dimension first.

    Returns:
      Tensor with one row of `vocab_size` logits per input sequence.
    """
    x = self.embedding(x)

    # The noise is a training-time regulariser only. Adding it unconditionally
    # made predictions random even after model.eval().
    if self.training and self.noise_std > 0:
      x = x + torch.randn_like(x) * self.noise_std

    # The per-timestep outputs are not needed; h_n holds the final hidden state
    # of every layer (a GRU has no separate cell state), h_n[-1] that of the top layer.
    _, h_n = self.gru(x)
    return self.linear(h_n[-1])

In [None]:
# Hyper-parameters and training objects for the GRU model.
vocab_size = len(vocab)
lr = 0.001

loss_fn = nn.CrossEntropyLoss()
gru_model = GRUWordPredictionModel(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
).to(device)
optimizer = optim.AdamW(gru_model.parameters(), lr=lr)

In [None]:
epochs = 10
# Integer logging interval (about 10 log lines per run, at least every epoch
# for short runs). The previous float test `epoch % (epochs / 10) == 0` logged
# irregularly whenever `epochs` was not a multiple of 10 (e.g. epochs=3 printed
# only the first epoch).
log_every = max(1, epochs // 10)

# Put the model being trained into training mode. This cell previously called
# lstm_model.train() (copy-paste slip), which switched the wrong model's mode
# and never set the GRU's mode explicitly.
gru_model.train()

# Training
for epoch in range(epochs):
  total_loss = 0.0

  for X, y in dataloader:
    X, y = X.to(device), y.to(device)

    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass
    y_pred = gru_model(X)

    # Calculate the loss
    loss = loss_fn(y_pred, y)
    total_loss += loss.item()

    # Backprop
    loss.backward()

    # Gradient Clipping (caps the global gradient norm at 1.0)
    torch.nn.utils.clip_grad_norm_(gru_model.parameters(), max_norm=1.0)

    # Optimizer Step
    optimizer.step()

  # Report the mean batch loss of the epoch
  if epoch % log_every == 0:
    print(f"Epoch: {epoch + 1} | Loss: {total_loss / len(dataloader)}")

Epoch: 1 | Loss: 7.438031838004987
Epoch: 2 | Loss: 6.721357884491295
Epoch: 3 | Loss: 6.469599534366473
Epoch: 4 | Loss: 6.3128611243393875
Epoch: 5 | Loss: 6.1996653588326485
Epoch: 6 | Loss: 6.110559997349464
Epoch: 7 | Loss: 6.036210999807521
Epoch: 8 | Loss: 5.97457893937566
Epoch: 9 | Loss: 5.920123186863387
Epoch: 10 | Loss: 5.8739836881160565


In [None]:
# Single next-word prediction from the GRU model for the same prompt used with
# the other two models.
print(predict_next_word(gru_model, "The Russian ", vocab, inv_vocab))

said


In [None]:
# Generate 50 words with the GRU model. NOTE(review): "valdimir" looks like a
# typo for "vladimir"; it is absent from the printed output, i.e. it was
# dropped from the context as an out-of-vocabulary word.
print(predict_next_words(gru_model, "The Russian president valdimir putin said ", vocab, inv_vocab, num_words=50))

russian president putin said u president donald trump said u president donald trump daughter ivanka trump jr said would never get married girlfriend ginni chatrath december 14 jalandhar reportedly dating back back home back back back home minister rajnath singh said government would consider decision taken hospital said researcher found dead lizard found near
