In [3]:
from ipynb.fs.full.preprocess import *

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  "source": [
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "source": [
You are setting values through chained 

In [4]:
import pandas as pd
import numpy as np
import random
import torch
import os
import spacy
import subprocess
import sys
import string

from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader,Dataset
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch import argmax

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
import nltk

from tqdm import tqdm

In [5]:
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

In [6]:
chars_to_remove = ['--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '.', ',']
def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    #using chars_to_remove
    for char in chars_to_remove:
        text = text.replace(char, '')
    return text

In [7]:
def load_data(df):
    texts = df['text'].values
    labels = df['label'].values
    
    return texts, labels

In [8]:
train_df['label']

0       3
1       4
2       3
3       4
4       4
       ..
3994    4
3995    3
3996    4
3997    4
3998    4
Name: label, Length: 3999, dtype: int64

In [9]:
train_texts, train_labels = load_data(train_df)
val_texts, val_labels = load_data(val_df)

# train_sst_texts, train_sst_labels = load_data(train_sst_df)
# val_sst_texts, val_sst_labels = load_data(val_sst_df)
# test_sst_texts,test_sst_labels = load_data(test_sst_df)

In [10]:
print(len(train_texts), len(val_texts))

3999 802


In [11]:
def install_spacy_model(model_name):
    try:
        spacy.load(model_name)
    except OSError:
        print(f"Model '{model_name}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        print(f"Model '{model_name}' installed successfully.")


install_spacy_model('en_core_web_sm')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /data2/home/kpnaveen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:

#adding another cuda device 
embedding_dim = 300

max_aug_len = 50 # max length of augmented text that average len of tranining data
# max_sst_len = 10 # max length of SST-2 dataset text that average len of tranining data

In [24]:
def createembeddings(datatexts, model1,model2,model3,maxlen,embedding_dim):
    dataset = []
    nlp = spacy.load('en_core_web_sm')
    for sentence in datatexts:
        sentence = remove_punctuation(sentence)
        tokens = [word for word in nlp(sentence) if word.text.lower() not in stopwords.words('english')]
        tokens = [word for word in tokens if word.text.isalpha()]
        # tokens = [word for word in sentence]
        # tokens = [word for word in tokens if word.isalpha()]
        embeddings = []
        for token in tokens:
            if token.text.lower() in model3:
                embeddings.append(model3[token.text.lower()])
            # else:
            #     embeddings.append(np.zeros(embedding_dim))
            # embed = np.zeros(embedding_dim)
            # count = 0

            # if token.text.lower() in model1:
            #     embed += model1[token.text.lower()]
            #     count += 1

            # if token.text.lower() in model2:
            #     embed += model2[token.text.lower()]
            #     count += 1

            # if token.text.lower() in model3:
            #     embed += model3[token.text.lower()]
            #     count += 1

            # if count > 0:
            #     embed /= count

            # embeddings.append(embed)
        
        if len(embeddings) > maxlen:
            embeddings = embeddings[:maxlen]
        else:
            embeddings += [np.zeros(embedding_dim)] * (maxlen - len(embeddings))
        
        #storing the format to send to the model
        dataset.append(embeddings)
        
    return dataset

In [25]:
model1 = word2vec_model
model2 = glove_model
model3 = fasttext_vectors
train_dataset_texts = createembeddings(train_texts,model1,model2,model3, max_aug_len,embedding_dim)
val_dataset_texts = createembeddings(val_texts, model1,model2,model3, max_aug_len,embedding_dim)
# test_sst_dataset = createembeddings(test_sst_texts, model1,model2,model3, max_sst_len,embedding_dim)

In [26]:
class ParaDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        text = self.data[idx]
        label = self.labels[idx]
        
        # return torch.tensor(text,dtype=torch.float32), torch.tensor(label)
        #Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:278.)
        text = np.array(text)
        label = np.array(label)
        return torch.tensor(text,dtype=torch.float32), torch.tensor(label)


In [27]:
# train_dataset = ParaDataset(train_texts, train_labels, max_aug_len, glove_model, embedding_dim)
# val_dataset = ParaDataset(val_texts, val_labels, max_aug_len, glove_model, embedding_dim)


train_dataset = ParaDataset(train_dataset_texts, train_labels)
val_dataset = ParaDataset(val_dataset_texts, val_labels)

In [28]:
#CNN model
class CNNTextClassifier(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout):
        
        super(CNNTextClassifier, self).__init__()
        
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # self.fc1 = nn.Linear(embedding_dim, embedding_dim)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        
        text = text.unsqueeze(1)
        # x1 = text

        # x= self.fc1(text)
    
        # x = x + x1
        x = text

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in x]
        
        x = self.dropout(torch.cat(pooled, dim = 1))

        x = self.fc(x)

        return x

In [29]:
#Parameters
batch_size = 32

n_filters = 500
filter_sizes = [3,4,5]
output_dim = 5
dropout = 0.5
pad_idx = 0
learning_rate = 0.001

num_epochs = 10

In [30]:
# model = CNNTextClassifier(word2vec_vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
model = CNNTextClassifier( embedding_dim, n_filters, filter_sizes, output_dim, dropout)
# model = CNNTextClassifier(embedding_dim, max_aug_len, output_dim)
# model = CNNTextClassifier(fasttext_vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx)
print(torch.cuda.is_available())
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

True


In [31]:

def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    total = 0
    
    for batch in iterator:
        text, labels = batch
        # print("text shape:",text.shape)
        # text = text.permute(0, 2, 1)   
        # print("text shape:",text.shape)  
        text = text.to(device)
        labels = labels.to(device)
        
        if labels.max().item() >= output_dim:
            raise ValueError(f"Target label {labels.max().item()} is out of bounds. Ensure output_dim is correct.")
        
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        
        # print("predictions shape:",predictions.shape)
        # print("labels shaoe:",labels.shape)
        
        # predictions = argmax(predictions, dim=1)
        
        loss = criterion(predictions, labels)
        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += (predictions.argmax(1) == labels).sum().item()
        total += labels.size(0)

    train_loss = epoch_loss / len(iterator)
    trian_acc = epoch_acc / total

    return train_loss, trian_acc

def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    total = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text = text.to(device)
            labels = labels.to(device)
            predictions = model(text).squeeze(1)
            # loss = criterion(predictions, labels.float())
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += (predictions.argmax(1) == labels).sum().item()
            total += labels.size(0)
            
            
    test_loss = epoch_loss / len(iterator)
    test_acc = epoch_acc / total
    
    return test_loss, test_acc

In [32]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [33]:
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    print(f'Epoch: {epoch+1:02} \tTrain Loss: {train_loss:.5f} \tTrain Acc: {train_acc:.5f}')
    
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch+1:02} \tVal Loss: {val_loss:.5f} \tVal Acc: {val_acc:.5f}')
    # if(val_acc>0.804):
    #     break

Epoch: 01 	Train Loss: 0.74929 	Train Acc: 0.79795
Epoch: 01 	Val Loss: 0.70197 	Val Acc: 0.79925
Epoch: 02 	Train Loss: 0.65261 	Train Acc: 0.80320
Epoch: 02 	Val Loss: 0.70529 	Val Acc: 0.80175
Epoch: 03 	Train Loss: 0.57675 	Train Acc: 0.81270
Epoch: 03 	Val Loss: 0.69308 	Val Acc: 0.79676
Epoch: 04 	Train Loss: 0.51626 	Train Acc: 0.82421
Epoch: 04 	Val Loss: 0.69043 	Val Acc: 0.80424


In [34]:
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
def predict(model, iterator, device):
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for batch in iterator:
            text, _ = batch
            text = text.to(device)
            output = model(text).squeeze(1)
            output = output.argmax(1)
            predictions.append(output)
    
    return torch.cat(predictions).tolist()

preds = predict(model, val_loader, device)
# caluculating the accuracy,micro F1 score, precision and recall, f1 score
print(classification_report(val_labels, preds))
print('Accuracy:', accuracy_score(val_labels, preds))
print('Micro F1:', f1_score(val_labels, preds, average='micro'))
print('Macro F1:', f1_score(val_labels, preds, average='macro'))
print('Weighted F1:', f1_score(val_labels, preds, average='weighted'))
print('Recall:', recall_score(val_labels, preds, average='micro'))
print('Precision:', precision_score(val_labels, preds, average='micro'))

              precision    recall  f1-score   support

           0       1.00      0.10      0.19        39
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00        87
           4       0.80      1.00      0.89       641

    accuracy                           0.80       802
   macro avg       0.36      0.22      0.22       802
weighted avg       0.69      0.80      0.72       802

Accuracy: 0.8042394014962594
Micro F1: 0.8042394014962594
Macro F1: 0.21538859350000808
Weighted F1: 0.7210978081889351
Recall: 0.8042394014962594
Precision: 0.8042394014962594


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
#saving the model
# torch.save(model.state_dict(), 'cnn_model.pth')


In [36]:
# No columns in the test dataset
def load_test_data(df):
    texts = df.values

    return texts

In [37]:
class ParaDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        text = self.data[idx]
        
        # return torch.tensor(text,dtype=torch.float32), torch.tensor(label)
        #Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:278.)
        text = np.array(text)

        return torch.tensor(text,dtype=torch.float32)

In [38]:
file_path = '/data2/home/kpnaveen/DLNLP/Assignment1/Aug24-Assignment1-Dataset1-test.xlsx'

test_data = pd.read_excel(file_path)

In [39]:
# converting the numpy into string if given the columns not given
# converting the numpy into string if given the columns not given
# for i in range(len(test_data)):
#     if not isinstance(test_data.iloc[i, 0], str):
#         test_data.iloc[i, 0] = str(test_data.iloc[i, 0])
    

In [40]:
test_texts = load_test_data(test_data)

In [41]:
# using saved model for the prediction on test dataset

test_dataset_texts = createembeddings(test_texts, model1,model2,model3, max_aug_len,embedding_dim)
test_dataset = ParaDataset(test_dataset_texts)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

AttributeError: 'numpy.ndarray' object has no attribute 'translate'

In [None]:
#loading the saved model
model = CNNTextClassifier(embedding_dim, n_filters, filter_sizes, output_dim, dropout)
model.load_state_dict(torch.load('cnn_model.pth'))
model = model.to(device)


  model.load_state_dict(torch.load('cnn_model.pth'))


In [None]:
def predict(model, iterator, device):
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for batch in iterator:
            text, _ = batch
            text = text.to(device)
            output = model(text).squeeze(1)
            output = argmax(output, dim=1)
            predictions.append(output+1)
    
    return torch.cat(predictions).tolist()

preds = predict(model, test_loader, device)
# caluculating the accuracy,micro F1 score, precision and recall, f1 score


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00        22
           3       0.16      0.14      0.15        87
           4       0.81      0.90      0.85       641

    accuracy                           0.74       802
   macro avg       0.19      0.21      0.20       802
weighted avg       0.66      0.74      0.70       802

Accuracy: 0.7356608478802993
Micro F1: 0.7356608478802993
Macro F1: 0.19957294474013282
Weighted F1: 0.6958363288589606
Recall: 0.7356608478802993
Precision: 0.7356608478802993


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
with open('cnn_predictions.txt', 'w') as f:
    for item in preds:
        f.write("%s\n" % item)

with open('cnn_test_labels.xlsx', 'w') as f:
    for item in test_labels:
        f.write("%s\n" % item)

In [None]:
print(classification_report(test_labels, preds))
print('Accuracy:', accuracy_score(test_labels, preds))
print('Micro F1:', f1_score(test_labels, preds, average='micro'))
print('Macro F1:', f1_score(test_labels, preds, average='macro'))
print('Weighted F1:', f1_score(test_labels, preds, average='weighted'))
print('Recall:', recall_score(test_labels, preds, average='micro'))
print('Precision:', precision_score(test_labels, preds, average='micro'))

In [None]:
#saving the predictions into txt file and xlsx file

with open('cnn_predictions.txt', 'w') as f:
    for item in preds:
        f.write("%s\n" % item)

with open('cnn_test_labels.xlsx', 'w') as f:
    for item in test_labels:
        f.write("%s\n" % item)