In [1]:
import pandas as pd
import numpy as np
import random
import torch
import os
import spacy
import subprocess
import sys
import string

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader,Dataset
from sklearn.metrics import classification_report, accuracy_score

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch import argmax

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
import nltk

from tqdm import tqdm

#importing the f1 score,precison and recall, micro F1 score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from datasets import load_dataset
import gensim.downloader as api
import numpy as np

from gensim.models import FastText, KeyedVectors

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = './data/Aug24-Assignmen1-Dataset1.xlsx'
train_df = pd.read_excel(file_path)

file_path = './data/Aug24-Assignment1-Validation-Dataset1.xlsx'
val_df = pd.read_excel(file_path)

sst2_data = load_dataset("sst2")
train_sst_df = pd.DataFrame(sst2_data['train'])
val_sst_df = pd.DataFrame(sst2_data['validation'])
test_sst_df = pd.read_csv('./data/SST2_TestData.csv')


In [3]:
def load_glove_model(glove_file):
    with open(glove_file, 'r', encoding="utf-8") as f:
        model = {}
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype='float32')
            model[word] = embedding
    return model

In [4]:
word2vec_model = KeyedVectors.load("./data/word2vec-google-news-300.model")
glove_model = load_glove_model("./data/glove.6B.300d.txt")
fasttext_vectors = KeyedVectors.load("./data/fasttext-wiki-news-subwords-300.model")

In [5]:
train_df.columns = ['label','text']
val_df.columns = ['label','text']

train_sst_df.columns = ['idx','text','label']
val_sst_df.columns = ['idx','text','label']
test_sst_df.columns = ['label','text']

#removing the idx and making the columns as label and text in the sst2 dataset
train_sst_df = train_sst_df.drop(columns=['idx'])
val_sst_df = val_sst_df.drop(columns=['idx'])

#interchanging the columns in the test dataset
train_sst_df = train_sst_df[['label','text']]
val_sst_df = val_sst_df[['label','text']]

In [6]:
for i in range(len(train_df)):
    if (not isinstance(train_df['text'][i], str)):
        train_df['text'][i] = str(train_df['text'][i])
    
    train_df['label'][i] -= 1

for i in range(len(val_df)):
    if (not isinstance(val_df['text'][i], str)):
        val_df['text'][i] = str(val_df['text'][i])
    
    val_df['label'][i] -= 1


for i in range(len(train_sst_df)):
    if (not isinstance(train_sst_df['text'][i], str)):
        train_sst_df['text'][i] = str(train_sst_df['text'][i])
    

for i in range(len(val_sst_df)):
    if (not isinstance(val_sst_df['text'][i], str)):
        val_sst_df['text'][i] = str(val_sst_df['text'][i])
        
for i in range(len(test_sst_df)):
    if (not isinstance(test_sst_df['text'][i], str)):
        test_sst_df['text'][i] = str(test_sst_df['text'][i])


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train_df['label'][i] -= 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['label'][i] -= 1
You are set

In [7]:
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
print(device)

cuda:5


In [8]:
chars_to_remove = ['--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '.', ',']
def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    #using chars_to_remove
    for char in chars_to_remove:
        text = text.replace(char, '')
    return text

In [9]:
def load_data(df):
    texts = df['text'].values
    labels = df['label'].values
    
    return texts, labels

In [10]:
train_sst_texts, train_sst_labels = load_data(train_sst_df)
val_sst_texts, val_sst_labels = load_data(val_sst_df)
test_sst_texts,test_sst_labels = load_data(test_sst_df)

In [11]:
def install_spacy_model(model_name):
    try:
        spacy.load(model_name)
    except OSError:
        print(f"Model '{model_name}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        print(f"Model '{model_name}' installed successfully.")


install_spacy_model('en_core_web_sm')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /data2/home/kpnaveen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#adding another cuda device 
embedding_dim = 300

# max_aug_len = 50 # max length of augmented text that average len of tranining data
max_sst_len = 50 # max length of SST-2 dataset text that average len of tranining data

In [13]:
#creating the dataset as the embeddings are already created without ParaDataset
def createembeddings(datatexts, model1,model2,model3,maxlen,embedding_dim):
    dataset = []
    nlp = spacy.load('en_core_web_sm')
    for sentence in datatexts:
        sentence = remove_punctuation(sentence)
        # tokens = [word for word in nlp(sentence) if word.text.lower() not in stopwords.words('english')]
        tokens = [word for word in nlp(sentence)]
        tokens = [word for word in tokens if word.text.isalpha()]
        # tokens = [word for word in sentence]
        # tokens = [word for word in tokens if word.isalpha()]
        
        embeddings = []
        for token in tokens:
            if token.text.lower() in model1:
                embeddings.append(model1[token.text.lower()])
            
            # if token.text.lower() in model1:
            #     embed = model1[token.text.lower()]
            # else:
            #     # continue
            #     embed = np.zeros(embedding_dim)
            
            # if token.text.lower() in model2:
            #     embed2 = model2[token.text.lower()]
            # else:
            #     embed2 = np.zeros(embedding_dim)
            
            # if token.text.lower() in model3:
            #     embed3 = model3[token.text.lower()]
            # else:
            #     embed3 = np.zeros(embedding_dim)
            
            # embed = (embed + embed2 + embed3)/3
            
            # embeddings.append(embed)
        
        if len(embeddings) > maxlen:
            embeddings = embeddings[:maxlen]
        else:
            embeddings += [np.zeros(embedding_dim)] * (maxlen - len(embeddings))
        
        #storing the format to send to the model
        dataset.append(embeddings)
        
    return dataset

In [14]:
model1 = word2vec_model
model2 = glove_model
model3 = fasttext_vectors
train_sst_dataset = createembeddings(train_sst_texts, model1,model2,model3, max_sst_len,embedding_dim)
val_sst_dataset = createembeddings(val_sst_texts, model1,model2,model3, max_sst_len,embedding_dim)
test_sst_dataset = createembeddings(test_sst_texts, model1,model2,model3, max_sst_len,embedding_dim)

In [15]:
class ParaDataset(Dataset):
    def __init__(self, data, labels):

        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        text = self.data[idx]
        label = self.labels[idx]
        
        text = np.array(text)
        label = np.array(label)
        
        return torch.tensor(text, dtype=torch.float32), torch.tensor(label)

In [16]:
train_dataset = ParaDataset(train_sst_dataset, train_sst_labels)
# print(train_dataset[0])
val_dataset = ParaDataset(val_sst_dataset, val_sst_labels)

test_dataset = ParaDataset(test_sst_dataset, test_sst_labels)


In [17]:
#CNN model
class CNNTextClassifier(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout):
        
        super(CNNTextClassifier, self).__init__()
        
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # self.fc1 = nn.Linear(embedding_dim, embedding_dim)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):

        text = text.unsqueeze(1)

        # x1 = text

        # x = self.fc1(text)
        # x = x1 + x
        
        x= text

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in x]
        
        x = self.dropout(torch.cat(pooled, dim = 1))

        x = self.fc(x)
  
       
        return x

In [18]:

#Parameters
batch_size = 64

n_filters = 500
filter_sizes = [2,3,4,5]
output_dim = 2
dropout = 0.5
pad_idx = 0
learning_rate = 0.001

num_epochs = 10

In [19]:
# model = CNNTextClassifier(word2vec_vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
model = CNNTextClassifier( embedding_dim, n_filters, filter_sizes, output_dim, dropout)
# model = CNNTextClassifier(embedding_dim, max_sst_len, output_dim)
# model = CNNTextClassifier(fasttext_vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
print(torch.cuda.is_available())
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)

True


In [20]:

def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    total = 0
    # print("Training started")
    
    for batch in iterator:

        text, labels = batch

        text = text.to(device)
        labels = labels.to(device)
        
        # if labels.max().item() >= output_dim:
        #     raise ValueError(f"Target label {labels.max().item()} is out of bounds. Ensure output_dim is correct.")

        optimizer.zero_grad()
        predictions = model(text).squeeze(1)

        # print("predictions shape:",predictions.shape)
        # print("labels shaoe:",labels.shape)
        
        # predictions = argmax(predictions, dim=1)
        
        loss = criterion(predictions, labels)
        
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

        epoch_acc += (predictions.argmax(1) == labels).sum().item()

        total += labels.size(0)

    train_loss = epoch_loss / len(iterator)
    trian_acc = epoch_acc / total

    # print("Training completed")
    
    return train_loss, trian_acc

def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    total = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text = text.to(device)
            labels = labels.to(device)
            predictions = model(text).squeeze(1)
            # loss = criterion(predictions, labels.float())
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += (predictions.argmax(1) == labels).sum().item()
            total += labels.size(0)
            
    test_loss = epoch_loss / len(iterator)
    test_acc = epoch_acc / total
    
    return test_loss, test_acc

In [21]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [22]:
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    print(f'Epoch: {epoch+1:02} \tTrain Loss: {train_loss:.5f} \tTrain Acc: {train_acc:.5f}')
    
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch+1:02} \tVal Loss: {val_loss:.5f} \tVal Acc: {val_acc:.5f}')


Epoch: 01 	Train Loss: 0.30023 	Train Acc: 0.87315
Epoch: 01 	Val Loss: 0.39557 	Val Acc: 0.82569
Epoch: 02 	Train Loss: 0.20677 	Train Acc: 0.91935
Epoch: 02 	Val Loss: 0.41516 	Val Acc: 0.84748
Epoch: 03 	Train Loss: 0.17271 	Train Acc: 0.93422
Epoch: 03 	Val Loss: 0.48779 	Val Acc: 0.82225
Epoch: 04 	Train Loss: 0.15241 	Train Acc: 0.94496
Epoch: 04 	Val Loss: 0.48485 	Val Acc: 0.83945
Epoch: 05 	Train Loss: 0.14059 	Train Acc: 0.94918
Epoch: 05 	Val Loss: 0.48046 	Val Acc: 0.84862
Epoch: 06 	Train Loss: 0.13107 	Train Acc: 0.95364
Epoch: 06 	Val Loss: 0.47949 	Val Acc: 0.84748
Epoch: 07 	Train Loss: 0.12567 	Train Acc: 0.95679
Epoch: 07 	Val Loss: 0.50329 	Val Acc: 0.84748
Epoch: 08 	Train Loss: 0.11903 	Train Acc: 0.95889
Epoch: 08 	Val Loss: 0.54625 	Val Acc: 0.83830
Epoch: 09 	Train Loss: 0.11346 	Train Acc: 0.96104
Epoch: 09 	Val Loss: 0.53854 	Val Acc: 0.84289
Epoch: 10 	Train Loss: 0.11256 	Train Acc: 0.96171
Epoch: 10 	Val Loss: 0.55930 	Val Acc: 0.83945


In [23]:
#saving the model
torch.save(model.state_dict(), 'cnn_sst_model.pth')

In [24]:
#use the saved model for the predictions on test data
model = CNNTextClassifier( embedding_dim, n_filters, filter_sizes, output_dim, dropout)
model.load_state_dict(torch.load('cnn_sst_model.pth'))
model.eval()

  model.load_state_dict(torch.load('cnn_sst_model.pth'))


CNNTextClassifier(
  (convs): ModuleList(
    (0): Conv2d(1, 500, kernel_size=(2, 300), stride=(1, 1))
    (1): Conv2d(1, 500, kernel_size=(3, 300), stride=(1, 1))
    (2): Conv2d(1, 500, kernel_size=(4, 300), stride=(1, 1))
    (3): Conv2d(1, 500, kernel_size=(5, 300), stride=(1, 1))
  )
  (fc): Linear(in_features=2000, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [25]:
test_sst_dataset = createembeddings(test_sst_texts, model1,model2,model3, max_sst_len,embedding_dim)

In [26]:
test_dataset = ParaDataset(test_sst_dataset, test_sst_labels)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)

In [27]:
model = model.to(device)

In [28]:
def predict(model, iterator, device):
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for batch in iterator:
            text, _ = batch
            text = text.to(device)
            
            output = model(text).squeeze(1)
            output = argmax(output, dim=1)
            predictions.append(output)
    
    return torch.cat(predictions).tolist()

preds = predict(model, test_loader, device)
# caluculating the accuracy,micro F1 score, precision and recall, f1 score
print(classification_report(test_sst_labels, preds))
print('Accuracy:', accuracy_score(test_sst_labels, preds))
print('Micro F1:', f1_score(test_sst_labels, preds, average='micro'))
print('Macro F1:', f1_score(test_sst_labels, preds, average='macro'))
print('Weighted F1:', f1_score(test_sst_labels, preds, average='weighted'))
print('Recall:', recall_score(test_sst_labels, preds, average='micro'))
print('Precision:', precision_score(test_sst_labels, preds, average='micro'))

#F1 score not micro or macro

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       911
           1       0.85      0.86      0.86       909

    accuracy                           0.86      1820
   macro avg       0.86      0.86      0.86      1820
weighted avg       0.86      0.86      0.86      1820

Accuracy: 0.8582417582417582
Micro F1: 0.8582417582417582
Macro F1: 0.8582402175606304
Weighted F1: 0.8582397040002544
Recall: 0.8582417582417582
Precision: 0.8582417582417582


In [29]:
#saving the predictions into xlsx file and txt file
with open('cnn_sst_predictions.txt', 'w') as f:
    for item in preds:
        f.write("%s\n" % item)
    
with open('cnn_sst_test_labels.xlsx', 'w') as f:
    for item in preds:
        f.write("%s\n" % item)