# CNN PyTorch

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import bz2
import numpy as np
import os
import pandas as pd
import pickle
import scipy as sp
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset

sys.path.append(os.path.abspath('../src'))
from fact_classification import *

2023-04-26 13:09:49.750315: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load datafiles

In [4]:
df, df_crowdsourced, df_ground_truth = data_loading(local=True)
df['Sentiment'] = df.Sentiment.fillna(df.Sentiment[df.Verdict == -1].mean())

## Load features
Load the features matrix that we generated in the `feature_generation.ipynb` notebook. This is a large sparse matrix so ww convert it to Compressed Sparse Row (CSR) format to avoid running out of memory when fitting our models.

In [5]:
with bz2.open('../results/df_features.bz2') as f:
    df_features = pickle.load(f)

# Convert to compressed sparse row matrix
# X = sp.sparse.csr_matrix(df_features)
X = df_features

## Split data and generate indexes

We split the dataset according to the instructions in the assignment, where data up until and including year 2008 will be used for training, and data after 2008 will be used for testing. Here we also generate indexes for the various feature sets.

In [6]:
df_train, df_test, idx_train = test_train_split(df)

y = df['Verdict']
y_train = df_train['Verdict'].to_numpy()
y_test = df_test['Verdict'].to_numpy()

X_train = X[idx_train].to_numpy()
X_test = X[~idx_train].to_numpy()

# Column index for the numeric columns Sentiment and Length
col_idx_n = (df_features.columns == 'Sentiment') | (df_features.columns == 'Length')

# Column index for TF-IDF features on the raw Text column with n-grams=1
col_idx_w1 = df_features.columns.str.startswith('W1_')

# Column index for TF-IDF features on the raw Text column with n-grams=2
col_idx_w2 = df_features.columns.str.startswith('W2_')

# Column index for TF-IDF features on the stemmed text with n-grams=1
col_idx_ws = df_features.columns.str.startswith('WS_')

# Column index for POS features
col_idx_p = df_features.columns.str.startswith('P_')

# Column index for NER labels
col_idx_e = df_features.columns.str.startswith('E_')

## Define experiments

In [7]:
# Define experiments
experiments = {
    'N': col_idx_n,
    'W': col_idx_w1,
    'P': col_idx_p,
    'E': col_idx_e,
    'N_W': col_idx_n | col_idx_w1,
    'N_P': col_idx_n | col_idx_p,
    'N_E': col_idx_n | col_idx_e,
    'N_W_P': col_idx_n | col_idx_w1 | col_idx_p,
    'N_W_E': col_idx_n | col_idx_w1 | col_idx_e,
    'N_W_P_E': col_idx_n | col_idx_w1 | col_idx_p | col_idx_e
}


## Create PyTorch dataset

In [8]:
# Create PyTorch datasets and loaders
train_dataset = TensorDataset(torch.tensor(
    X_train[:, experiments['N_W_P_E']], dtype=torch.float32), torch.tensor(y_train + 1, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(
    X_test[:, experiments['N_W_P_E']], dtype=torch.float32), torch.tensor(y_test + 1, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


## Define the CNN classifier

In [9]:

# Define the CNN classifier
class CNNClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(
            input_size, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a dummy dimension
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
input_size = X_train[:, experiments['N_W_P_E']].shape[1]
num_classes = 3
learning_rate = 0.001
num_epochs = 10

# Initialize the model
model = CNNClassifier(input_size, num_classes).to(device)

# Calculate class weights
unique_classes = np.unique(y_train + 1)
class_weights = compute_class_weight(
    class_weight='balanced', classes=unique_classes, y=y_train + 1)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


## Train the model

In [10]:

# Train the model
for epoch in range(num_epochs):
    for i, (data, labels) in enumerate(train_loader):
        data = data.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 0.8547
Epoch [2/10], Loss: 0.4156
Epoch [3/10], Loss: 0.6211
Epoch [4/10], Loss: 0.4142
Epoch [5/10], Loss: 0.2860
Epoch [6/10], Loss: 0.4404
Epoch [7/10], Loss: 0.2242
Epoch [8/10], Loss: 0.1916
Epoch [9/10], Loss: 0.2376
Epoch [10/10], Loss: 0.1093


## Evaluate the model

Training metrics

In [11]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for data, labels in train_loader:
        data = data.to(device)
        labels = labels.to(device)
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

predictions = np.array(predictions) - 1
true_labels = np.array(true_labels) - 1

df_score_train = score_it(true_labels, predictions,
                          features='N_W_P_E', algorithm='CNN')
display(df_score_train)


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,CNN,N_W_P_E,0.995,0.814,0.942,0.964,0.949,0.993,0.976,0.96,0.971,0.895,0.959,0.961


Testing metrics

In [12]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for data, labels in test_loader:
        data = data.to(device)
        labels = labels.to(device)
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

predictions = np.array(predictions) - 1
true_labels = np.array(true_labels) - 1

df_score_test = score_it(true_labels, predictions,
                         features='N_W_P_E', algorithm='CNN')
display(df_score_test)


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,CNN,N_W_P_E,0.802,0.322,0.606,0.694,0.778,0.445,0.548,0.678,0.79,0.374,0.576,0.685


## Export results to LaTeX

In [14]:
to_latex(df_score_train)

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
algorithm & features & p\_NFS & p\_UFS & p\_CFS & p\_wavg & r\_NFS & r\_UFS & r\_CFS & r\_wavg & f\_NFS & f\_UFS & f\_CFS & f\_wavg \\
\midrule
CNN & N\_W\_P\_E & 0.995 & 0.814 & 0.942 & 0.964 & 0.949 & 0.993 & 0.976 & 0.960 & 0.971 & 0.895 & 0.959 & 0.961 \\
\bottomrule
\end{tabular}



In [None]:
to_latex(df_score_test)

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
algorithm & features & p\_NFS & p\_UFS & p\_CFS & p\_wavg & r\_NFS & r\_UFS & r\_CFS & r\_wavg & f\_NFS & f\_UFS & f\_CFS & f\_wavg \\
\midrule
CNN & N\_W\_P\_E & 0.802 & 0.322 & 0.606 & 0.694 & 0.778 & 0.445 & 0.548 & 0.678 & 0.790 & 0.374 & 0.576 & 0.685 \\
\bottomrule
\end{tabular}

