In [None]:
!pip install datasets
!pip install rouge

!pip install rouge-score

Collecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
import nltk
import numpy as np
import pandas as pd

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Load the "long" configuration of the Reddit TIFU dataset.
df = load_dataset("reddit_tifu", "long")

# List every column of the train split together with its declared dtype.
features = df["train"].features
for column_name, feature in features.items():
    print(f"{column_name}: {feature.dtype}")

Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42139 [00:00<?, ? examples/s]

ups: float32
num_comments: float32
upvote_ratio: float32
score: float32
documents: string
tldr: string
title: string


In [None]:
# Expose the post body as `text` and its TL;DR as `summary` ...
df = df.map(
    lambda example: {'summary': example['tldr'], 'text': example['documents']}
)
# ... then drop every original column so only those two remain.
df = df.remove_columns(
    ["ups", "upvote_ratio", "num_comments", "score", "title", "tldr", "documents"]
)

Map:   0%|          | 0/42139 [00:00<?, ? examples/s]

In [None]:
# Fetch NLTK resources. The "stopwords" corpus is read by
# `stopwords.words('english')` in the text-cleaning cell.
# NOTE(review): nothing visible in this notebook appears to use the "punkt"
# tokenizer models — confirm before removing the download.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus — presumably the data WordNetLemmatizer needs at
# lemmatization time (TODO confirm against the NLTK docs).
nltk.download('wordnet')

# Module-level helpers read by clean_text() below.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-word membership test in clean_text() is a hash lookup.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw post or summary into a string of lemmatized content words.

    The text is lower-cased, then four substitutions run in order: URL-like
    tokens are removed, parenthesised spans are removed, double quotes are
    removed, and every remaining non-letter character becomes a space. The
    result is split on whitespace, English stop words are dropped, and each
    surviving word is lemmatized.

    Relies on the module-level `lemmatizer` and `stop_words`.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    cleaned = text.lower()

    # (pattern, replacement) pairs; order matters because each step works on
    # the output of the previous one.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'\([^)]*\)', ''),
        ('"', ''),
        ("[^a-zA-Z]", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = [
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    ]
    return " ".join(kept).strip()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Length budgets, counted in whitespace-separated words after cleaning.
max_text_length = 450
max_summary_length = 23

short_text = []
short_summary = []

# Only the first 10,000 posts of the train split are considered.
raw_pairs = zip(df['train']['text'][:10000], df['train']['summary'][:10000])
for text, summary in raw_pairs:
    neat_text = clean_text(text)
    neat_summary = clean_text(summary)

    summary_fits = len(neat_summary.split()) <= max_summary_length
    text_fits = len(neat_text.split()) <= max_text_length
    if not (summary_fits and text_fits):
        # Skip pairs where either side exceeds its budget.
        continue

    short_text.append(neat_text)
    short_summary.append(neat_summary)

# One row per retained (text, summary) pair.
dataframe = pd.DataFrame({'text': short_text, 'summary': short_summary})

In [None]:
# Wrap every summary in the literal start/end marker words 'sostok' / 'eostok'.
# NOTE: re-running this cell wraps the summaries a second time.
dataframe['summary'] = 'sostok ' + dataframe['summary'] + ' eostok'

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for validation; the fixed seed keeps the shuffled
# split reproducible across runs.
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(dataframe['text']),
    np.array(dataframe['summary']),
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

In [None]:
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

# Tokenize each split; every entry becomes a list of token strings.
x_tr_tokens = list(map(tokenizer, x_tr))
x_val_tokens = list(map(tokenizer, x_val))
y_tr_tokens = list(map(tokenizer, y_tr))
y_val_tokens = list(map(tokenizer, y_val))

def yield_tokens(data_iter):
    """Yield the items of `data_iter` one at a time, unchanged.

    Used to feed already-tokenized examples (lists of tokens) to
    `build_vocab_from_iterator`.
    """
    yield from data_iter

# Special tokens shared by the source and target vocabularies. '<pad>' comes
# first so it receives index 0, which is the value the padding cell passes to
# pad_sequence(padding_value=0). Previously the source vocabulary had no
# specials, so index 0 was an ordinary word that padding silently reused.
special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

vocab_xtr = build_vocab_from_iterator(yield_tokens(x_tr_tokens), min_freq=1, specials=special_tokens)
vocab_ytr = build_vocab_from_iterator(yield_tokens(y_tr_tokens), min_freq=1, specials=special_tokens)

# Without a default index, looking up a token that is absent from the
# vocabulary raises; map such tokens to '<unk>' instead.
for training_vocab in (vocab_xtr, vocab_ytr):
    training_vocab.set_default_index(training_vocab['<unk>'])

# Validation text must be encoded with the *training* vocabularies: the model's
# embedding rows and output classes are defined by the training ids, and a
# vocabulary built from the validation split would assign unrelated ids to the
# same words. The names are kept so existing cells that reference them work.
vocab_xval = vocab_xtr
vocab_yval = vocab_ytr

In [None]:
from torch.nn.utils.rnn import pad_sequence

def encode_tokens(token_lists, vocab):
    """Convert tokenized examples into 1-D LongTensors of vocabulary indices.

    Args:
        token_lists: Iterable of token lists (one list per example).
        vocab: A torchtext vocabulary exposing `get_stoi()`.

    Returns:
        A list with one `torch.long` tensor per example. Tokens missing from
        `vocab` map to its '<unk>' index when the vocabulary defines one and
        are skipped otherwise, so an out-of-vocabulary token never raises.
    """
    stoi = vocab.get_stoi()          # built once, not once per token
    unk_index = stoi.get('<unk>')    # None when the vocabulary has no '<unk>'
    encoded = []
    for tokens in token_lists:
        candidates = (stoi.get(token, unk_index) for token in tokens)
        indices = [index for index in candidates if index is not None]
        # Explicit dtype so an empty example still yields an integer tensor.
        encoded.append(torch.tensor(indices, dtype=torch.long))
    return encoded


# Convert tokens to indices. Validation data is encoded with the TRAINING
# vocabularies: the model's embeddings and output classes are defined by the
# training ids, so ids taken from a validation-only vocabulary would point at
# unrelated words.
x_tr_indices = encode_tokens(x_tr_tokens, vocab_xtr)
x_val_indices = encode_tokens(x_val_tokens, vocab_xtr)
y_tr_indices = encode_tokens(y_tr_tokens, vocab_ytr)
y_val_indices = encode_tokens(y_val_tokens, vocab_ytr)

# Pad every sequence in a split to that split's longest sequence.
target_pad_index = vocab_ytr['<pad>']
x_tr_indices_padded = pad_sequence(x_tr_indices, batch_first=True, padding_value=0)
y_tr_indices_padded = pad_sequence(y_tr_indices, batch_first=True, padding_value=target_pad_index)
x_val_indices_padded = pad_sequence(x_val_indices, batch_first=True, padding_value=0)
y_val_indices_padded = pad_sequence(y_val_indices, batch_first=True, padding_value=target_pad_index)

train_dataset = TensorDataset(x_tr_indices_padded, y_tr_indices_padded)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation order is preserved (no shuffling) so predictions stay aligned
# with x_val / y_val.
val_dataset = TensorDataset(x_val_indices_padded, y_val_indices_padded)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [None]:
import torch
import torch.nn as nn

class LSTMSeq2Seq2(nn.Module):
    """Three-layer LSTM encoder, single-layer LSTM decoder, dot-product attention.

    Args:
        input_dim: Size of the source vocabulary (rows of the source embedding).
        output_dim: Size of the target vocabulary (rows of the target embedding
            and number of output classes).
        embedding_dim: Width of both embedding tables.
        hidden_dim: Hidden size of every LSTM.
        max_text_len: Accepted for backward compatibility; not used.
    """

    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim, max_text_len):
        super().__init__()
        # Source and target ids come from different vocabularies, so each side
        # needs its own table. Sharing one table sized by `input_dim` ties
        # unrelated words together and raises IndexError whenever a target id
        # is >= input_dim.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.trg_embedding = nn.Embedding(output_dim, embedding_dim)

        # nn.LSTM's `dropout` argument only acts between the layers of one
        # multi-layer LSTM; on num_layers=1 it does nothing and emits a
        # warning. The encoder is stacked by hand, so dropout between its
        # layers is applied explicitly in forward().
        self.inter_layer_dropout = nn.Dropout(0.4)
        self.encoder_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.encoder_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.encoder_lstm3 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src: Integer tensor of source ids, shape (batch, src_len).
            trg: Integer tensor of decoder-input ids, shape (batch, trg_len).

        Returns:
            Float tensor of shape (batch, trg_len, output_dim).
        """
        embedded_src = self.embedding(src)
        embedded_trg = self.trg_embedding(trg)

        # Encoder: three stacked LSTMs with dropout between consecutive layers.
        encoder_output1, _ = self.encoder_lstm1(embedded_src)
        encoder_output2, _ = self.encoder_lstm2(self.inter_layer_dropout(encoder_output1))
        encoder_output3, (state_h, state_c) = self.encoder_lstm3(self.inter_layer_dropout(encoder_output2))

        # Decoder starts from the final state of the top encoder layer.
        decoder_output, _ = self.decoder_lstm(embedded_trg, (state_h, state_c))

        # Dot-product attention of each decoder step over all encoder steps.
        attn_scores = torch.bmm(decoder_output, encoder_output3.transpose(1, 2))  # (batch, trg_len, src_len)
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_output = torch.bmm(attn_weights, encoder_output3)  # (batch, trg_len, hidden_dim)

        # NOTE(review): only the attention context reaches the classifier; the
        # decoder state influences the logits solely through the attention
        # weights. Concatenating `decoder_output` with `attn_output` before
        # `fc` is the usual formulation — consider it if quality stays poor.
        return self.fc(attn_output)

# Initialize the model and other parameters
# Model hyper-parameters: the vocabulary sizes come from the training
# vocabularies, the layer widths are fixed constants.
input_dim, output_dim = len(vocab_xtr), len(vocab_ytr)
embedding_dim, hidden_dim = 100, 300
# Longest tokenized training text.
max_text_len = max(map(len, x_tr_tokens))

model = LSTMSeq2Seq2(
    input_dim=input_dim,
    output_dim=output_dim,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    max_text_len=max_text_len,
)

# Show the layer structure of the freshly built model.
print(model)


LSTMSeq2Seq2(
  (embedding): Embedding(32736, 100)
  (encoder_lstm1): LSTM(100, 300, batch_first=True, dropout=0.4)
  (encoder_lstm2): LSTM(300, 300, batch_first=True, dropout=0.4)
  (encoder_lstm3): LSTM(300, 300, batch_first=True, dropout=0.4)
  (decoder_lstm): LSTM(100, 300, batch_first=True, dropout=0.4)
  (fc): Linear(in_features=300, out_features=10546, bias=True)
)




In [None]:
# RMSprop over all model parameters with a learning rate of 1e-3.
optimizer = optim.RMSprop(model.parameters(), lr=0.001)
# Cross-entropy over the target vocabulary; positions whose target id is the
# '<pad>' index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab_ytr['<pad>'])

In [None]:
from tqdm import tqdm

num_epochs = 5
# Average loss of each epoch; this is the series the plotting cell expects
# under the name `train_loss`.
train_loss = []

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode

    total_loss = 0
    for src, trg in tqdm(train_loader):
        optimizer.zero_grad()

        # Teacher forcing: the decoder reads tokens [0, n-1) and must predict
        # tokens [1, n). Separate names are used instead of rebinding `trg`.
        decoder_in = trg[:, :-1]
        expected = trg[:, 1:]

        logits = model(src, decoder_in)
        # Flatten (batch, step) into one axis for CrossEntropyLoss. The class
        # count is read from the tensor itself so the module-level
        # `output_dim` is not overwritten inside the loop.
        loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_loss.append(avg_loss)
    print(f'Epoch: {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')


100%|██████████| 267/267 [02:47<00:00,  1.59it/s]


Epoch: 1/5, Loss: 7.5819


100%|██████████| 267/267 [02:55<00:00,  1.52it/s]


Epoch: 2/5, Loss: 7.3812


100%|██████████| 267/267 [02:47<00:00,  1.60it/s]


Epoch: 3/5, Loss: 7.2634


100%|██████████| 267/267 [02:50<00:00,  1.57it/s]


Epoch: 4/5, Loss: 7.1722


100%|██████████| 267/267 [02:49<00:00,  1.58it/s]

Epoch: 5/5, Loss: 7.0971





In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# The checkpoint path used when saving the model lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

model.eval()
model_path = '/content/drive//My Drive/Models/LSTM.pth'
# torch.save does not create missing parent directories; make sure the target
# folder exists so saving does not fail after a long training run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Persist only the learned parameters (state_dict), not the pickled module.
torch.save(model.state_dict(), model_path)

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(6, 3))
# plt.grid(True)
# plt.plot(train_loss, label='Training Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Training Loss:')
# plt.show()

In [None]:
#model.load_state_dict(torch.load('trained_model.pth'))

model.eval()

# Seed the decoder with the start marker it was actually trained on. Training
# summaries are wrapped as 'sostok ... eostok', so every target sequence begins
# with 'sostok'; the '<sos>' special never occurs in the training targets and
# its embedding is therefore untrained.
start_index = vocab_ytr['sostok']
decoder_input = torch.full(
    (x_val_indices_padded.size(0), 1), start_index, dtype=torch.long
)

# Greedy decoding: at every step feed the tokens generated so far and append
# the highest-scoring next token for each validation example.
max_summary_length = 25
with torch.no_grad():
    for _ in range(max_summary_length):  # max_summary_length is the maximum length of the generated summary
        output = model(x_val_indices_padded, decoder_input)
        # Only the logits of the last decoder position predict the next token.
        next_token = output[:, -1, :].argmax(dim=1, keepdim=True)
        decoder_input = torch.cat((decoder_input, next_token), dim=1)


In [None]:
# Index -> token table, fetched once. Calling get_itos() for every single
# token (as before) rebuilt the full list on each lookup.
itos = vocab_ytr.get_itos()
# Either marker ends a summary: 'eostok' is the end marker present in the
# training targets, '<eos>' is the vocabulary's special token.
end_markers = {'eostok', '<eos>'}

predicted_summaries = []
for summary_indices in decoder_input:
    summary_tokens = []
    # Position 0 is the start token that seeded the decoder; skip it.
    for idx in summary_indices[1:].tolist():
        token = itos[idx]
        if token in end_markers:
            # Whatever greedy decoding produced after the end marker is not
            # part of the summary.
            break
        summary_tokens.append(token)
    predicted_summaries.append(' '.join(summary_tokens))


In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_rouge_scores(pred, ori):
    """Score each predicted summary against its reference with ROUGE-1/2/L.

    Args:
        pred: Sequence of predicted summary strings.
        ori: Sequence of reference summary strings, aligned with `pred`.

    Returns:
        A list with one score dict per (prediction, reference) pair, keyed by
        'rouge1', 'rouge2' and 'rougeL'. Pairs are formed with `zip`, so extra
        items of the longer sequence are ignored.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # RougeScorer.score expects (target, prediction). Passing them the other
    # way round swaps precision and recall in every returned score.
    return [
        scorer.score(reference, prediction)
        for prediction, reference in zip(pred, ori)
    ]

# Predictions were generated from the validation inputs, so the references
# must be the validation summaries in the same order (`y_val`). The full
# `dataframe['summary']` column is in the original, pre-split order and would
# pair each prediction with an unrelated summary. The 'sostok' / 'eostok'
# marker words are stripped so they do not count as summary content.
boundary_markers = ('sostok', 'eostok')
original_summaries = [
    ' '.join(word for word in summary.split() if word not in boundary_markers)
    for summary in y_val
]
rouge_scores = calculate_rouge_scores(predicted_summaries, original_summaries)

# Mean ROUGE-1 F-measure as a percentage; guard against an empty score list.
if rouge_scores:
    average_rouge1 = (sum(scores['rouge1'].fmeasure for scores in rouge_scores) / len(rouge_scores))*100
else:
    average_rouge1 = 0.0

print(f"Average ROUGE-1 score: {average_rouge1:.2f}")


Average ROUGE-1 score: 5.77
