<a href="https://colab.research.google.com/github/Nikhil-Kadapala/NeuralNets/blob/main/standardNeuralNets/stdCNN_LIME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project CS852 - Foundations of Neural Networks (FALL 2024)





Devin Borchard and Nikhil Kadapala

Department of Computer Science, University of New Hampshire

ERASER datasets: https://www.eraserbenchmark.com/

ERASER paper: https://arxiv.org/pdf/1911.03429

LIME paper: https://arxiv.org/pdf/1602.04938

# Notebook setup and PyTorch Installation

In [1]:
import sys
import numpy as np

# uncomment one of these versions (depending on whether you are on a computer with a CPU or not)

# GPU version
# !conda install --yes --prefix {sys.prefix} pytorch torchvision cudatoolkit=10.2 -c pytorch

# Just CPU
# !conda install --yes --prefix {sys.prefix} pytorch torchvision cpuonly -c pytorch

# install `Einops` for einstein-style tensor manipulation in pytorch
# Also see https://github.com/arogozhnikov/einops
# !conda install --yes --prefix {sys.prefix} einops  -c conda-forge


In [2]:
# torch test
import torch
x = torch.rand(5, 3)
print(x)

print("GPU/CUDA available? ", torch.cuda.is_available())

print("Torch version", torch.__version__)

tensor([[3.2553e-02, 5.5087e-01, 8.4201e-02],
        [3.3018e-01, 6.2233e-02, 1.9032e-01],
        [9.7753e-01, 5.6645e-02, 2.7782e-04],
        [9.6492e-01, 4.8524e-01, 3.7122e-01],
        [6.2648e-01, 2.7334e-01, 1.2060e-01]])
GPU/CUDA available?  True
Torch version 2.5.1+cu121


# **Extracting Traning, Validation, and Test Data**
# Parse the data files to extract the reviews, classifications and annotations for each split.

There are three files:
- train.jsonl: containts 1600 training examples
- val.jsonl: contains 200 validation examples
- test.json: contains 199 test examples

Each example includes:
- annotation_id: a unique id for an example of the form negR_000 for negative examples and posR_000 for positive examples.
- evidences: a list of rationales(specific parts of the review) given by humans that most influenced their classification decision.
- classification: the class of the example

The annotation_id of each example is the name of the file for the input text data
    

In [3]:
import json

def parse_data(file_path):
    data = []                                               # Initialize an empty list to store the dictionaries

    with open(file_path, 'r') as file:                      # Open the .jsonl file and read it line by line
        for line in file:
            annotation = json.loads(line)                   # Parse each line as JSON and append it to the list
            id = annotation["annotation_id"]
            annotation["classification"] = 1 if annotation['classification'] == "POS" else 0

            with open(f"./movies/docs/{id}", 'r') as file:  # open the file named by annotation_id to extract the review text
                content = file.read()
                annotation['content'] = content.replace('\n', ' ')
                data.append(annotation)
    return data

# Specify the path to your JSON file
train_file_path = './movies/train.jsonl'
val_file_path = './movies/val.jsonl'
test_file_path = './movies/test.jsonl'

train_data = parse_data(train_file_path)
validation_data = parse_data(val_file_path)
test_data = parse_data(test_file_path)

# Functions to extract reviews, classifications, and annotations
  Define a function to retrieve an example and print the relevant information.

In [4]:
def print_example(data, index, print_content=True, print_classification=True, print_rationales=True ):
    print(f'Retrieving Training Example [{index}].................\n')
    item = data[index]
    classification = item['classification']
    evidences = item['evidences']
    content = item['content']
    if print_content: print(f'Review content:\n{content}\n')
    if print_classification: print('----------------------------',
                                   '\n| Sentiment class:',
                                   classification,
                                   ("- NEG" if not classification else "- POS"),
                                   '|', '\n----------------------------')
    if print_rationales:
        print('\nHuman rationales / Supporting Evidence:')
        for evidence in evidences:
            print('     - ', evidence[0]['text'])

def get_content(data, index):
    item = data[index]
    content = item['content']
    return content

def get_classes(data, index):
    item = data[index]
    classification = item['classification']
    return classification

def get_annotations(data, index):
    item = data[index]
    content = item['evidences']
    annotations = [content[0]['text'] for evidence in content]
    return annotations

train_size = len(train_data)
val_size = len(validation_data)
test_size = len(test_data)

print(f'Dataset split: {train_size} training examples')
print(f'               {val_size} validation examples')
print(f'               {test_size} test examples\n')

print_example(train_data, 506)

Dataset split: 1600 training examples
               200 validation examples
               199 test examples

Retrieving Training Example [506].................

Review content:
this film is extraordinarily horrendous and i 'm not going to waste any more words on it .

---------------------------- 
| Sentiment class: 0 - NEG | 
----------------------------

Human rationales / Supporting Evidence:
     -  extraordinarily horrendous


# Extraction of the rationales from the evidences metadata of each human annotation of reviews.

Each annotation of the review is not the highlighted text/rationale itself but also contains metadata of the text. Use the function defined in the above cell to extract just the text and replace the evidences dictionary of the training, validation, and test datasets.

In [5]:
for i in range(len(train_data)):
  train_data[i]['evidences'] = get_annotations(train_data,i)

validation_data[i]['evidences'] = [get_annotations(validation_data,i) for i in range(len(validation_data))]

test_data[i]['evidences'] = [get_annotations(test_data,i) for i in range(len(test_data))]

print(train_data[506]['evidences'])

print(validation_data[506]['evidences'])

print(test_data[506]['evidences'])

TypeError: list indices must be integers or slices, not str

# Pre-trianed GloVe Embeddings of Training Examples
Download the pretrained GloVe Embeddings of desired dimensions using gensim downlader.

Save downloaded embeddings to a local file to avoid re-downloading when the kernel or notebook is restarted.

In [None]:
"""
    Install gensim, to use word2vec word embeddings
    Install gensim (for pre-trained word embeddings)
    #!conda install --yes --prefix {sys.prefix} gensim
"""
#import gensim
#import gensim.downloader

"""
    ONLY if you get an error after `import gensim`: update your smart_open liberary
    #!conda install --yes --prefix {sys.prefix} smart_open
    restart your notebook
    see if `import gensim` works now
"""
#wv = gensim.downloader.load("glove-wiki-gigaword-50")

#import pickle

#with open("glove_embeddings.pkl", "wb") as f:
    #pickle.dump(wv, f)


In [None]:
import pickle

with open("glove_embeddings.pkl", "rb") as f:
    wv = pickle.load(f)

# lookup the word vector for a word "india"
wv['india']

In [None]:
# downsampled embedding and zero vector for unknown words
# note the following code assums the the word embedding dimensions are dividible by 5

import einops # type: ignore
import numpy as np
from typing import List
import types

def glove_embed(word:str, target_dim)->np.array:
    '''Looks up word in embedding (downsampled to five dimensions), pads with beginning of embedding.
       Returns zero vector for unknown words.
    '''
    # these parameters work for 50-dim glove embeddings (adjust for other embeddings)
    sampled_dim = 5
    sample_batches = 10

    empty_vec=np.zeros(target_dim)
    if word in wv:
        w2v = wv[word] # lookup 50 dim vector
        a=einops.reduce(w2v,'(d seg)-> d', "sum", seg=sample_batches)  # downsample
        b=w2v[0:target_dim-sampled_dim]
        return np.hstack([a,b])
    else:
        return empty_vec

def glove_embed_sequences(sequence, target_dim):

    if isinstance(sequence, list):
        if len(sequence) == 0:
            empty_seq = np.zeros(target_dim)
            gloveTensor =  torch.tensor(empty_seq, dtype=torch.float)
        else:
            tokens = ",".join(sequence)
            words = tokens.split()
            gloveTensor = torch.stack([torch.tensor(glove_embed(word, target_dim), dtype=torch.float) for word in words])
    else:
        tokens = sequence.split()
        gloveTensor = torch.stack([torch.tensor(glove_embed(token, target_dim), dtype=torch.float) for token in tokens])

    return gloveTensor

In [None]:
import pandas as pd

reviews = [get_content(train_data, i) for i in range(train_size)]
classes = [get_classes(train_data, i) for i in range(train_size)]
rationales = [get_annotations(train_data, i) for i in range(train_size)]

print("Number of reviews in training data:",len(reviews))
print("Max seq length of reviews:", np.max([len(review.split()) for review in reviews]))

df = pd.DataFrame(train_data)
df.drop(columns=['query', 'query_type'], inplace=True)
df['evidences'] = rationales
df.head()
#rationales[7]

In [None]:
# Find the smallest review of all to inspect and understand the training data structure.
smallest_entry = df.loc[df['content'].apply(len).idxmin()]
print(smallest_entry)
id = 'negR_506.txt'
index = df['content'].apply(len).idxmin()
print(f'{df.iloc[506]['content']} \n{df.iloc[506]['classification']} \n{df.iloc[506]['evidences']}')

Extract validation set from the val.jsonl file and create a dataframe for it similar to the training set and save it to a csv file.

In [None]:
# TO DO


# Convert the reviews to their corresponding Glove embeddings

In [None]:
from torch import Tensor
from typing import Dict, List, Optional, Tuple, Union
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

reviewGloVes = [glove_embed_sequences(review, 11) for review in df['content']]
reviewGloVes = pad_sequence(reviewGloVes, batch_first=True)

# Convert the rationales to their corresponding Glove embeddings

In [None]:

rationaleGloVes = [glove_embed_sequences(rationale, 11) for rationale in rationales]
rationaleGloVes = pad_sequence(reviewGloVes, batch_first=True)

# Convert the training data to batches using DataLoader

In [None]:
input = reviewGloVes
print("input dim (batches, max_seq_len, embed_size):",input.size())
Y_star = torch.tensor(classes, dtype=torch.float)
train_x = TensorDataset(input, Y_star)
dataLoader = DataLoader(train_x, batch_size=32, shuffle=False)

# Convolutional Neural Network Model

In [None]:
from typing import List, Optional, Tuple, Union
from torch import Tensor
class MyModel(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Tuple[int, ...],
        pool_size: Tuple[int, ...],
        stride: Tuple[int, ...],
        padding: Tuple[int, ...],
        bias: bool
    ) -> None:

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.pool_size = pool_size
        self.stride = stride
        self.padding = padding
        self.bias = bias

        super(MyModel, self).__init__()
        self.ReLU_Activation = nn.ReLU()
        self.convolutionLayer = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias)
        self.poolingLayer = nn.MaxPool1d(kernel_size=pool_size, stride=stride, padding=padding)

    def forward(
        self,
        X: Tensor,
    ) -> Tensor:
        """
        Embedding Layer X --Theta--> E
        Input: X (Tensor)
        Output: E (Tensor)
        Parameter: Theta (optional)
        """

        X = X.squeeze(0)
        X_shape = list(X.size())
        #print(X_shape)
        #print(X)
        self.embeddingLayer = nn.Linear(2809, self.in_channels, bias=False)
        self.embeddingLayer.weight.data = torch.randn((3,11),dtype=torch.float)
        E = self.embeddingLayer(X)
        #print(E)

        E_shape = list(E.size())
        if len(E_shape) < 3:
            E = E.unsqueeze(0)
        E = E.permute(0, 2, 1)

        """
        Convolution Layer E --W--> H
        Input: E (Tensor)
        Output: H (Tensor)
        Parameter: W (optional)
        """
        #self.convolutionLayer.weight.data = W_torch.permute(2, 0, 1)
        H = self.convolutionLayer(E)
        #print(H)
        H = H.squeeze(0)
        H = H.permute(1, 0)

        """
        Dectector Layer H --Psi--> D with ReLU
        Input: H (Tensor)
        Output: D (Tensor)
        Parameter: Psi (optional)
        """
        myPsi_torch = torch.randn((2,1), dtype=torch.float)
        D = torch.einsum('ij,jk->ik', H, myPsi_torch)
        self.ReLU_Activation = nn.ReLU()
        D = self.ReLU_Activation(D)
        #print(D)
        D = D.permute(1, 0)

        """
        Pooling Layer D --MaxPool--> Y
        Input: D (Tensor)
        output: Y (Tensor)
        Parameter: Pooling_window (optional)
        """
        P = self.poolingLayer(D)
        Y_hat = torch.zeros(X_shape[0])
        D_shape = list(D.size())
        out_size =  D_shape[1] - self.pool_size // self.stride + 1
        Y_hat[:out_size] = P

        return Y_hat

model = MyModel(3, 2, 3, 2, 1, 0, False)

# Training Loop

In [None]:
def train(xdata, ydata):
    '''Train the neural model with the given training data'''

    #Construct the loss function
    criterion = torch.nn.MSELoss()
    # Construct the optimizer (Stochastic Gradient Descent in this case)
    optimizer = torch.optim.SGD(model.parameters(), lr = 0.001)  # lr is learning rate

    # Gradient Descent
    for epoch in range(201):
        # Forward pass: Compute predicted y by passing x to the model
        Y_pred = model(xdata)

        # Compute and print loss
        loss = criterion(Y_pred, ydata)

        if epoch >0 and epoch % 40 == 0:
            print('epoch: ', epoch,' loss: ', loss.item())

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()

        # perform a backward pass (backpropagation)
        loss.backward()

        # Update the parameters
        optimizer.step()

    return Y_pred

batch_input = torch.zeros((2809, 11), dtype=torch.float)
predList = []
batch_outputs = torch.zeros(5, dtype=torch.float)
k = 0
for batch in dataLoader:
    batch_input = batch[0]
    y_star = Y_star[k]
    k += 1
    print("-----------------------------------------------------------------------")
    print("Training batch:", k)
    print("-----------------------------------------------------------------------")
    Y_pred = train(batch_input, y_star)
    #print(Y_pred.size())
    predList.append(Y_pred[:5])
    if k == 5:
        break
batch_outputs = torch.stack(predList)
#print(batch_outputs.size())
print("======================================================================")
mse=torch.mean( (batch_outputs-Y_star[:5])**2)  # detach takes the tensor out of the network
print("MSE for ground truth y_star", mse)
