# Excercise 5
## NLP with Pytorch 🔥

Use Pytorch framework to solve the below exercises.


In [1]:
import numpy as np
import keras
import pandas as pd
import matplotlib.pyplot as plt

## 5.1 Predict rating of a movie using pytorch

**Exercise:** Use pytorch framework to predict pytorch.

In [2]:
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

In [3]:
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [4]:
plots

Unnamed: 0,plot
3107,most is the story of a single father who takes...
900,a serial killer decides to teach the secrets o...
6724,"in sweden , a female blackmailer with a disfi..."
4704,"in a friday afternoon in new york , the presi..."
2582,"in los angeles , the editor of a publishing h..."
...,...
8417,""" our marriage , their wedding . "" it ' s l..."
1592,"the wandering barbarian , conan , alongside ..."
1723,"like a tale spun by scheherazade , kismet fol..."
7605,"mrs . brisby , a widowed mouse , lives in a..."


In [5]:
y

Unnamed: 0,rating
3107,1
900,0
6724,1
4704,1
2582,1
...,...
8417,0
1592,0
1723,0
7605,1


## Data Precosessing

- Remove stopwords
- Lowercase
- split the text in words
- pad_sequences

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re

nltk.download('stopwords')
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return words

processed_plots = plots.apply(preprocess_text)
word_counts = Counter(word for plot in processed_plots for word in plot)
vocab_size = 8000
vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common(vocab_size))}

def text_to_sequence(text, vocab):
    return [vocab.get(word, 0) for word in text]

sequences = [text_to_sequence(plot, vocab) for plot in processed_plots]
max_length = 150
X_padded = [seq[:max_length] + [0]*(max_length - len(seq)) if len(seq) < max_length else seq[:max_length] for seq in sequences]

X_padded = torch.tensor(X_padded, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_tensor, test_size=0.2, random_state=42)

batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Build Model

Create a neural network to predict the rating of a movie, calculate the testing set accuracy.

In [15]:
class ImprovedGRUMovieRatingPredictor(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers, dropout):
        super(ImprovedGRUMovieRatingPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc2 = nn.Linear(hidden_size // 2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.gru(x)
        x = self.dropout(x[:, -1, :])
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return self.sigmoid(x)

# Parámetros del modelo
embed_size = 256  # Aumentar el tamaño del embedding
hidden_size = 256  # Aumentar el tamaño de la capa oculta
output_size = 1
num_layers = 3  # Agregar más capas ocultas
dropout = 0.6  # Aumentar el Dropout

# Inicializar el modelo
model = ImprovedGRUMovieRatingPredictor(vocab_size+1, embed_size, hidden_size, output_size, num_layers, dropout)


In [16]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)

def train_model(model, train_loader, criterion, optimizer, epochs=20):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

train_model(model, train_loader, criterion, optimizer, epochs=20)


Epoch [1/20], Loss: 0.6956
Epoch [2/20], Loss: 0.6949
Epoch [3/20], Loss: 0.6902
Epoch [4/20], Loss: 0.6797
Epoch [5/20], Loss: 0.6335
Epoch [6/20], Loss: 0.5417
Epoch [7/20], Loss: 0.4238
Epoch [8/20], Loss: 0.2967
Epoch [9/20], Loss: 0.1696
Epoch [10/20], Loss: 0.1080
Epoch [11/20], Loss: 0.0910
Epoch [12/20], Loss: 0.0769
Epoch [13/20], Loss: 0.0487
Epoch [14/20], Loss: 0.0568
Epoch [15/20], Loss: 0.0513
Epoch [16/20], Loss: 0.0441
Epoch [17/20], Loss: 0.0528
Epoch [18/20], Loss: 0.0447
Epoch [19/20], Loss: 0.0434
Epoch [20/20], Loss: 0.0376


In [17]:
def evaluate_model(model, test_loader):
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            y_pred.extend(outputs.squeeze().numpy())
            y_true.extend(labels.numpy())
    y_pred = [1 if pred >= 0.5 else 0 for pred in y_pred]
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

accuracy = evaluate_model(model, test_loader)
print(f'Accuracy on the test set: {accuracy:.4f}')


Accuracy on the test set: 0.5757
