# Exercise 2

In [1]:
import os
import re
import glob
import string
import numpy as np
import torch
import pandas as pd

# Raw Reviews

## Load data

Like in Exercise 1, read all the training data, including the reviews and the scores associated to each one. You can use the following helper functions to read the data:

1. Be sure to have downloaded the dataset from the link provided in the exercise and have read the README file
1. Be sure to have copied the dataset next to this Jupyter (.ipynb file)
1. Be sure to have installed:
    * Pytorch
    * NLTK (This library is going to be used only for the stemming process, no more)
    * Sklearn (Only for building a Random Forest)

In [2]:
def sorter(item):
    """ Function tha gets only the first number of the name of the file and organizes the files base on that"""
    
    return int(os.path.basename(item).split('_')[0])

def read_raw_text(path_data):
    """ Function for reading the raw data in the .txt files. 
    
    Parameters
    ----------
    path_data: str
        path of the folder that contains the data that is going to be used. (should be test or train)
        
    Returns
    ---------
    data,scores: array_like
        Data arrays, X is an array of shape [#documents of the dataset, #words in the vocabulary], y is an array of shape [#documents,] 
    """
    
    data = []
    scores = []
    
    sentiments = ['pos', 'neg']
    for sentiment in sentiments:
        path_vocab_pos = os.path.join("../datasets", "aclImdb", path_data, sentiment, "*.txt")
        
        for filename in sorted(glob.glob(path_vocab_pos), key=sorter):
            
            with open(filename) as f:
                
                lines = f.read()
                
                data.append(lines)
                scores.append(int(os.path.basename(filename).split('_')[1].strip('.txt')))
    return data, scores


def read_vocab():
    """ Function for reading the vocabulary (.vocab file). 
    
    Parameters
    ----------
    None
        
    Returns
    ---------
    vocab: list
        list with the values different tokens that compose the vocabulary ...... 
    """
    
    path_vocab = os.path.join("..", "datasets", "aclImdb", "imdb.vocab")
    
    with open(path_vocab, encoding='utf-8') as f:
        lines = f.read()

    lines = lines.split('\n')
    
    vocab = []
    for line in lines:
        vocab.append(line)
    
    return vocab

In [3]:
# Read the vocabulary
vocabulary = read_vocab()
# Read reviews and y
data, scores = read_raw_text('train')
data_test, scores_test = read_raw_text('test')

## Task 1: Pipeline for Cleaning the Raw Reviews 

> **Hint**: You can use the functions you have already implemented in the previous exercise.

In [35]:
class PreprocessingPipeline:
    def __init__(self):
        self.steps = []

    def add_step(self, step, input_column = None, output_column = None, active=True):
        self.steps.append({'step': step, 'input': input_column, 'output': output_column, 'active': active})

    def process(self, df):
        df_copy = df.copy()
        for step in self.steps:
            if step['active']:
                if step['input'] and step['output']:
                    df_copy[step['output']] = df_copy[step['input']].apply(step['step'])
                else:
                    df_copy = step['step'](df_copy)
        return df_copy

    def set_active(self, step_name, active):
        for step in self.steps:
            if step['step'].__name__ == step_name:
                step['active'] = active
                

### Tokenising the text

In [36]:
def tokenize(text):
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")
    if not text:
        return []
    # Split on any whitespace or punctuation character
    tokens = re.split(r'[\s{}]+'.format(re.escape(string.punctuation)), text.lower())
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    return tokens

def load_data_to_df(data, scores):
    df = pd.DataFrame(data={'text': data, 'score': scores})
    return df

In [37]:
train_df = load_data_to_df(data, scores)
test_df = load_data_to_df(data_test, scores_test)

In [38]:
from collections import Counter
from itertools import chain


def filter_high_frequency_terms(df, column, max_freq=0.2):
    n_docs = len(df)
    
    # document frequency
    doc_freq = Counter(chain.from_iterable(set(toks) for toks in df[column]))
    cutoff = max_freq * n_docs
    
    drop_tokens = {tok for tok, dfreq in doc_freq.items() if dfreq > cutoff}

    df['filtered_tokens'] = df[column].apply(lambda x: [tok for tok in x if tok not in drop_tokens])

    return df
    

In [39]:
def replace_numbers(tokens):
    if not isinstance(tokens, list):
        raise TypeError("Input must be a list of tokens.")
    return [re.sub(r'\d+', '<NUM>', token) for token in tokens]

In [82]:
import test


pipeline = PreprocessingPipeline()

pipeline.add_step(tokenize, input_column='text', output_column='tokens')
pipeline.add_step(replace_numbers, input_column='tokens', output_column='tokens')
pipeline.add_step(lambda df: filter_high_frequency_terms(df, 'tokens', max_freq=0.2))

# Apply the preprocessing pipeline to the training and test dataframes
train_df = pipeline.process(train_df)
print("Training DataFrame after preprocessing:")
print(train_df.head())

test_df = pipeline.process(test_df)
print("Test DataFrame after preprocessing:")
print(test_df.head())

Training DataFrame after preprocessing:
                                                text  score  \
0  Bromwell High is a cartoon comedy. It ran at t...      9   
1  If you like adult comedy cartoons, like South ...      7   
2  Bromwell High is nothing short of brilliant. E...      9   
3  "All the world's a stage and its people actors...     10   
4  FUTZ is the only show preserved from the exper...      8   

                                              tokens  \
0  [bromwell, high, is, a, cartoon, comedy, it, r...   
1  [if, you, like, adult, comedy, cartoons, like,...   
2  [bromwell, high, is, nothing, short, of, brill...   
3  [all, the, world, s, a, stage, and, its, peopl...   
4  [futz, is, the, only, show, preserved, from, t...   

                                     filtered_tokens  label  new_label  
0  [bromwell, high, cartoon, comedy, ran, same, p...      1          3  
1  [adult, comedy, cartoons, south, park, nearly,...      1          2  
2  [bromwell, high, nothi

## Task 2: Representations of Data 

### BOW
Each review is represented by a vector. It has the length of the vocabulary and for each word in the review, the vector contains the number of appearances on the review

In [85]:
from collections import Counter
from itertools import chain
import numpy as np
from scipy.sparse import csr_matrix
def build_bow_csr(df, vocab):
    """
    Build a bag-of-words CSR matrix from a dataframe with a 'filtered_tokens' column.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing a 'filtered_tokens' column (list of tokens per document).
    vocab : list
        List of vocabulary tokens.
        
    Returns
    -------
    bow_csr : scipy.sparse.csr_matrix
        Bag-of-words matrix (documents x vocabulary).
    """
    word2idx = {w: i for i, w in enumerate(vocab)}
    rows, cols, data = [], [], []
    docs = df['filtered_tokens'].tolist()
    for row, doc in enumerate(docs):
        ctr = Counter(tok for tok in doc if tok in word2idx)
        rows.extend([row] * len(ctr))
        cols.extend([word2idx[tok] for tok in ctr])
        data.extend(ctr.values())
    bow_csr = csr_matrix(
        (data, (rows, cols)),
        shape=(len(docs), len(vocab)),
        dtype=np.int32
    )
    
    
    return bow_csr

bow_csr_train = build_bow_csr(train_df, vocabulary)
bow_csr_test = build_bow_csr(test_df, vocabulary)

In [86]:
train_df['label'] = train_df['score'].apply(lambda x: 1 if x > 5 else 0)
test_df['label'] = test_df['score'].apply(lambda x: 1 if x > 5 else 0)

## Task 3: Logistic Regression

In [87]:
import torch
import torch.nn as nn
from torch import optim

In [88]:
# Logistic Regression model ---- is everything ready?
# Please refer to this link to the basics of bulding a model with Pytorch 
#        - https://pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html

class LogisticRegression(nn.Module):
    
    def __init__(self, input_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, x):
        return self.linear(x).squeeze(1)

In [89]:
### Define the input dimension input_d, output dimension output_d, batch size, number of epochs, iterations, etc..

input_d = bow_csr_train.shape[1]
output_d = 1
batch_size = 64
num_epochs = 30
learning_rate = 0.01
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [90]:
def csr_to_torch_sparse(csr, *, dtype=torch.float32, device="cpu"):
    """
    csr : scipy.sparse.csr_matrix
    returns: torch.sparse_coo_tensor with the same shape and data
    """
    # COO gives explicit row/col indices
    coo = csr.tocoo()
    indices = torch.tensor(
        [coo.row, coo.col],   # shape 2 × nnz
        dtype=torch.int64,
        device=device
    )
    values  = torch.tensor(
        coo.data,
        dtype=dtype,
        device=device
    )
    shape = coo.shape
    return torch.sparse_coo_tensor(indices, values, shape, dtype=dtype, device=device).coalesce()

In [91]:
# Instantiate the LR model
model = LogisticRegression(input_d).to(device)

# Loss class
criterion = nn.BCEWithLogitsLoss()

# Instantiate the Optimizer Class.Do not forget to set the learning rate
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Define the labels
train_labels = train_df['label'].values
test_labels = test_df['label'].values

# Convert the data to torch tensors
train_data = csr_to_torch_sparse(bow_csr_train, device=device)
train_labels = torch.tensor(train_labels, dtype=torch.int8, device=device)
test_data = csr_to_torch_sparse(bow_csr_test, device=device)
test_labels = torch.tensor(test_labels, dtype=torch.int8, device=device)

In [92]:
# Define the training loop 

for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    optimizer.zero_grad()
    outputs = model(train_data)
    
    # Compute the loss
    loss = criterion(outputs, train_labels.float())
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/30], Loss: 0.6933
Epoch [2/30], Loss: 0.6927
Epoch [3/30], Loss: 0.6922
Epoch [4/30], Loss: 0.6916
Epoch [5/30], Loss: 0.6911
Epoch [6/30], Loss: 0.6905
Epoch [7/30], Loss: 0.6900
Epoch [8/30], Loss: 0.6894
Epoch [9/30], Loss: 0.6889
Epoch [10/30], Loss: 0.6883
Epoch [11/30], Loss: 0.6878
Epoch [12/30], Loss: 0.6872
Epoch [13/30], Loss: 0.6867
Epoch [14/30], Loss: 0.6862
Epoch [15/30], Loss: 0.6856
Epoch [16/30], Loss: 0.6851
Epoch [17/30], Loss: 0.6846
Epoch [18/30], Loss: 0.6840
Epoch [19/30], Loss: 0.6835
Epoch [20/30], Loss: 0.6830
Epoch [21/30], Loss: 0.6824
Epoch [22/30], Loss: 0.6819
Epoch [23/30], Loss: 0.6814
Epoch [24/30], Loss: 0.6809
Epoch [25/30], Loss: 0.6804
Epoch [26/30], Loss: 0.6798
Epoch [27/30], Loss: 0.6793
Epoch [28/30], Loss: 0.6788
Epoch [29/30], Loss: 0.6783
Epoch [30/30], Loss: 0.6778


In [93]:
# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(test_data)
    test_loss = criterion(test_outputs, test_labels.float())
    predicted = torch.round(torch.sigmoid(test_outputs))
    accuracy = (predicted == test_labels).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Test Loss: 0.6782, Test Accuracy: 0.7665


In [94]:
# Logistic regression with 4 different sentiments
class LogisticRegression4(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegression4, self).__init__()
        self.linear = nn.Linear(input_dim, 4)
        
    def forward(self, x):
        return self.linear(x).squeeze(1)

In [95]:
train_df['new_label'] = train_df['score'].apply(lambda x: 0 if 0 < x < 3 else (1 if 2 < x < 5 else (2 if 6 < x < 9  else 3)))
test_df['new_label'] = test_df['score'].apply(lambda x: 0 if 0 < x < 3 else (1 if 2 < x < 5 else (2 if 6 < x < 9  else 3)))
train_labels = train_df['new_label'].values
test_labels = test_df['new_label'].values
train_labels = torch.tensor(train_labels, dtype=torch.int8, device=device)
test_labels = torch.tensor(test_labels, dtype=torch.int8, device=device)


In [96]:
# Instantiate the LR model
input_d = bow_csr_train.shape[1]
model = LogisticRegression4(input_d).to(device)

# Loss class
criterion = nn.CrossEntropyLoss()

# Instantiate the Optimizer Class.Do not forget to set the learning rate
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [97]:
# training loop
for epoch in range(num_epochs):
    model.train()
    
    # Forward pass
    optimizer.zero_grad()
    outputs = model(train_data)
    
    # Compute the loss
    loss = criterion(outputs, train_labels.long())
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(test_data)
    test_loss = criterion(test_outputs, test_labels.long())
    predicted = torch.argmax(test_outputs, dim=1)
    accuracy = (predicted == test_labels).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Epoch [1/30], Loss: 1.3866
Epoch [2/30], Loss: 1.3855
Epoch [3/30], Loss: 1.3845
Epoch [4/30], Loss: 1.3835
Epoch [5/30], Loss: 1.3825
Epoch [6/30], Loss: 1.3815
Epoch [7/30], Loss: 1.3805
Epoch [8/30], Loss: 1.3795
Epoch [9/30], Loss: 1.3786
Epoch [10/30], Loss: 1.3776
Epoch [11/30], Loss: 1.3767
Epoch [12/30], Loss: 1.3757
Epoch [13/30], Loss: 1.3748
Epoch [14/30], Loss: 1.3739
Epoch [15/30], Loss: 1.3729
Epoch [16/30], Loss: 1.3720
Epoch [17/30], Loss: 1.3711
Epoch [18/30], Loss: 1.3702
Epoch [19/30], Loss: 1.3693
Epoch [20/30], Loss: 1.3684
Epoch [21/30], Loss: 1.3675
Epoch [22/30], Loss: 1.3667
Epoch [23/30], Loss: 1.3658
Epoch [24/30], Loss: 1.3649
Epoch [25/30], Loss: 1.3641
Epoch [26/30], Loss: 1.3632
Epoch [27/30], Loss: 1.3624
Epoch [28/30], Loss: 1.3615
Epoch [29/30], Loss: 1.3607
Epoch [30/30], Loss: 1.3598
Test Loss: 1.3606, Test Accuracy: 0.4607


# Feed forward model

In [112]:
class FeedForwardModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedForwardModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 64)  # Example hidden layer size
        self.fc3 = nn.Linear(64, output_dim)  # For binary classification
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

In [113]:
input_dimension = bow_csr_train.shape[1]
hidden_dimension = 128  # Example size for hidden layer
output_dimension = 1    # Binary classification
model = FeedForwardModel(input_dimension, hidden_dimension, output_dimension).to(device)

criterion = nn.BCEWithLogitsLoss()  # Example for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [114]:
train_data = csr_to_torch_sparse(bow_csr_train, device=device)
train_labels = torch.tensor(train_df['label'].values, dtype=torch.float32, device=device)
test_data = csr_to_torch_sparse(bow_csr_test, device=device)
test_labels = torch.tensor(test_df['label'].values, dtype=torch.float32, device=device)

In [116]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_data).squeeze(1)  # Ensure outputs are of shape [batch_size, 1]
    loss = criterion(outputs, train_labels)
    loss.backward()
    optimizer.step()
    
    # Compute accuracy and F1 score on the training set
    with torch.no_grad():
        preds = torch.round(torch.sigmoid(outputs))
        acc = (preds == train_labels).float().mean().item()
        # F1 score calculation
        tp = ((preds == 1) & (train_labels == 1)).sum().item()
        fp = ((preds == 1) & (train_labels == 0)).sum().item()
        fn = ((preds == 0) & (train_labels == 1)).sum().item()
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        print(f"Train Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')
    
from sklearn.metrics import confusion_matrix
print(confusion_matrix(train_labels.cpu().numpy(), preds.cpu().numpy()))

Train Accuracy: 0.5000, F1 Score: 0.0000
Epoch [1/30], Loss: 0.6937
Train Accuracy: 0.6050, F1 Score: 0.3602
Epoch [2/30], Loss: 0.6827
Train Accuracy: 0.7800, F1 Score: 0.7329
Epoch [3/30], Loss: 0.6637
Train Accuracy: 0.8261, F1 Score: 0.7974
Epoch [4/30], Loss: 0.6422
Train Accuracy: 0.8647, F1 Score: 0.8504
Epoch [5/30], Loss: 0.6183
Train Accuracy: 0.8960, F1 Score: 0.8909
Epoch [6/30], Loss: 0.5915
Train Accuracy: 0.9092, F1 Score: 0.9071
Epoch [7/30], Loss: 0.5628
Train Accuracy: 0.9168, F1 Score: 0.9156
Epoch [8/30], Loss: 0.5319
Train Accuracy: 0.9226, F1 Score: 0.9213
Epoch [9/30], Loss: 0.4993
Train Accuracy: 0.9264, F1 Score: 0.9250
Epoch [10/30], Loss: 0.4664
Train Accuracy: 0.9310, F1 Score: 0.9296
Epoch [11/30], Loss: 0.4331
Train Accuracy: 0.9353, F1 Score: 0.9343
Epoch [12/30], Loss: 0.3997
Train Accuracy: 0.9392, F1 Score: 0.9386
Epoch [13/30], Loss: 0.3671
Train Accuracy: 0.9433, F1 Score: 0.9429
Epoch [14/30], Loss: 0.3360
Train Accuracy: 0.9471, F1 Score: 0.9468
Ep

In [117]:
model.eval()
with torch.no_grad():
    test_outputs = model(test_data).squeeze(1)  # Ensure the output is 1D
    test_loss = criterion(test_outputs, test_labels)
    predicted = torch.round(torch.sigmoid(test_outputs))
    accuracy = (predicted == test_labels).float().mean()
    
    # F1 score calculation for the test set
    tp = ((predicted == 1) & (test_labels == 1)).sum().item()
    fp = ((predicted == 1) & (test_labels == 0)).sum().item()
    fn = ((predicted == 0) & (test_labels == 1)).sum().item()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}, F1 Score: {f1:.4f}')
    print(confusion_matrix(test_labels.cpu().numpy(), predicted.cpu().numpy()))

Test Loss: 0.4032, Test Accuracy: 0.8528, F1 Score: 0.8485
[[11017  1483]
 [ 2197 10303]]
