In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from torch.utils.tensorboard import SummaryWriter

2025-04-15 15:38:14.035426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744724294.055092  234895 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744724294.062005  234895 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744724294.078575  234895 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744724294.078601  234895 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744724294.078604  234895 computation_placer.cc:177] computation placer alr

In [2]:
MAX_FEATURES = 50000
BATCH_SIZE = 50

In [3]:
def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

def load_and_process_data(data_path):
    # get data, pre-process and split
    data = pd.read_csv(data_path, delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process
    training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )

    # vectorize data using TFIDF and transform for PyTorch for scalability
    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=MAX_FEATURES, max_df=0.5, use_idf=True, norm='l2')
    training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
    training_data = training_data.todense()                             # convert to dense matrix for Pytorch
    vocab_size = len(word_vectorizer.vocabulary_)
    validation_data = word_vectorizer.transform(validation_data)
    validation_data = validation_data.todense()
    train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
    train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
    validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
    validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

    return train_x_tensor,train_y_tensor,validation_x_tensor,validation_y_tensor,word_vectorizer

def pad(twoDTensor):
    res = torch.zeros(twoDTensor.size(0),MAX_FEATURES).type(torch.FloatTensor)
    res[:,:twoDTensor.size(1)] = twoDTensor
    return res

In [4]:
class Net(nn.Module):
    def __init__(self, size):
        super().__init__() #input of size
        self.fc1 = nn.Linear(size, (int)(size/8))
        #self.fc2 = nn.Linear((int)(size/2), (int)(size/4)) # to size/4
        #self.fc3 = nn.Linear((int)(size/4), (int)(size/8))
        self.fc4 = nn.Linear((int)(size/8), (int)(size/16)) # to size/4^2
        #self.fc5 = nn.Linear((int)(size/16), (int)(size/32))
        self.fc6 = nn.Linear((int)(size/16), (int)(size/64)) # to size/4^3
        #self.fc7 = nn.Linear((int)(size/64), (int)(size/128))
        self.fc8 = nn.Linear((int)(size/64), (int)(size/256)) # to size/4^4
        self.fc9 = nn.Linear((int)(size/256), 2) # output of 2

    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        #x = F.leaky_relu(self.fc2(x))
        #x = F.leaky_relu(self.fc3(x))
        x = F.leaky_relu(self.fc4(x))
        #x = F.leaky_relu(self.fc5(x))
        x = F.leaky_relu(self.fc6(x))
        #x = F.leaky_relu(self.fc7(x))
        x = F.leaky_relu(self.fc8(x))
        x = self.fc9(x)
        return x

In [5]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs, writer, BATCH_SIZE):
    # Initialization
    early_stopping = 0
    epochs_done = 0
    min_LSTM_loss = float('inf') #10000

    # Training and validation
    for i in range(num_epochs):
        epochs_done += 1
        epoch_training_loss = 0
        epoch_validation_loss = 0

        # Evaluation of the model
        correct = 0
        model.eval()
        for _,(inputs,labels) in enumerate(val_loader):
            # One-hot encode labels
            labels_one_hot = F.one_hot(labels,2)

            # Convert images and labels to tensor
            inputs = torch.Tensor(inputs).type('torch.FloatTensor')
            labels_one_hot = torch.Tensor(labels_one_hot).type('torch.FloatTensor')

            # Make sure everything is done on GPU
            inputs = inputs.to(device)
            labels_one_hot = labels_one_hot.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model.forward(inputs)

            # Compute loss
            loss = criterion(outputs, labels_one_hot)

            # Update loss
            epoch_validation_loss += loss.cpu().detach().numpy()

            #compute accuracy
            preds = torch.argmax(outputs,1)
            correct += (preds==labels).float().sum()
        v_acc = correct/(len(val_loader)*BATCH_SIZE)
        writer.add_scalar('Accuracy/evaluation' ,v_acc ,i)
        v_loss = epoch_validation_loss/len(val_loader.dataset)
        writer.add_scalar('Loss/evaluation', v_loss, i)

        if loss<min_LSTM_loss:
            torch.save(model, 'best_model.pt')
            min_LSTM_loss = loss
            early_stopping = 0
        else:
            early_stopping += 1

        if early_stopping >= 15:
            break

        # Train model
        model.train()
        correct = 0
        for _,(inputs,labels) in enumerate(train_loader):
            # One-hot encode labels
            labels_one_hot = F.one_hot(labels,2)

            # Convert images and labels to tensor
            inputs = torch.Tensor(inputs).type('torch.FloatTensor')
            labels_one_hot = torch.Tensor(labels_one_hot).type('torch.FloatTensor')

            # Make sure everything is done on GPU
            inputs = inputs.to(device)
            labels_one_hot = labels_one_hot.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model.forward(inputs)

            # Compute loss
            loss = criterion(outputs, labels_one_hot)

            # Reset gradients
            optimizer.zero_grad()

            # Compute gradients
            loss.backward()

            # Update weights
            optimizer.step()

            # Update loss
            epoch_training_loss += loss.cpu().detach().numpy()

            #compute accuracy
            preds = torch.argmax(outputs,1)
            correct += (preds==labels).float().sum()
        t_acc = correct/(len(train_loader)*BATCH_SIZE)
        writer.add_scalar('Accuracy/training' ,t_acc ,i)
        t_loss = epoch_training_loss/len(train_loader.dataset)
        writer.add_scalar('Loss/training', t_loss, i)

        if i % 10 == 0:
            print(f'Epoch {i}, training loss: {t_loss}, training accuracy: {t_acc}, validation loss: {v_loss}, validation accuracy: {v_acc}')

    best_model = torch.load('best_model.pt', weights_only=False)
    return best_model,epochs_done

In [11]:
# loading the first data
train_x,train_y,val_x,val_y,vectorizer = load_and_process_data("amazon_cells_labelled.txt")
#train_x = pad(train_x)
#val_x = pad(val_x)
size = train_x.size(1)


trainset = TensorDataset(train_x,train_y)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)
valset = TensorDataset(val_x, val_y)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

In [12]:
# defining the model
net = Net(size)
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
epochs = 200

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = net.to(device)

In [20]:
writer = SummaryWriter(comment='fully_connected-small_db')
best_model, epoch_reached = train_model(net, criterion, optimizer, trainloader, valloader, epochs, writer, BATCH_SIZE)
print(f'{epoch_reached} epoch reached')
writer.close()
net.to('cpu')

Epoch 0, training loss: 0.03220616653561592, training accuracy: 0.495555579662323, validation loss: 0.014222116209566593, validation accuracy: 0.4699999988079071
Epoch 10, training loss: 2.24794050041055e-08, training accuracy: 1.0, validation loss: 0.024489451199769974, validation accuracy: 0.8199999928474426
19 epoch reached


Net(
  (fc1): Linear(in_features=7277, out_features=3638, bias=True)
  (fc2): Linear(in_features=3638, out_features=1819, bias=True)
  (fc3): Linear(in_features=1819, out_features=909, bias=True)
  (fc4): Linear(in_features=909, out_features=454, bias=True)
  (fc5): Linear(in_features=454, out_features=227, bias=True)
  (fc6): Linear(in_features=227, out_features=113, bias=True)
  (fc7): Linear(in_features=113, out_features=56, bias=True)
  (fc8): Linear(in_features=56, out_features=28, bias=True)
  (fc9): Linear(in_features=28, out_features=2, bias=True)
)

In [6]:
# loading the next data
train_x,train_y,val_x,val_y,vectorizer2 = load_and_process_data("amazon_cells_labelled_LARGE_25K.txt")
#train_x = pad(train_x)
#val_x = pad(val_x)
size2 = train_x.size(1)


trainset = TensorDataset(train_x,train_y)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                          shuffle=True, num_workers=2)
valset = TensorDataset(val_x, val_y)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

In [7]:
# defining the model
net = Net(size2)
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
epochs = 200

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = net.to(device)

In [8]:
writer = SummaryWriter(comment='fully_connected-large_db')
best_model2, epoch_reached2 = train_model(net, criterion, optimizer, trainloader, valloader, epochs, writer, BATCH_SIZE)
print(f'{epoch_reached2} epoch reached')
writer.close()
net.to('cpu')

Epoch 0, training loss: 0.01013773400336504, training accuracy: 0.8224889039993286, validation loss: 0.01391210313886404, validation accuracy: 0.4043999910354614
Epoch 10, training loss: 0.19886238873004913, training accuracy: 0.9863555431365967, validation loss: 7.835793972015381, validation accuracy: 0.8611999750137329
17 epoch reached


In [9]:
def load_data_EXTRALARGE(data_path, word_vectorizer, max_lines = None, val_size = 0.1):
    if max_lines == None:
        training_xlarge = pd.read_csv(data_path, delimiter=',', header=None)        
    else:
        training_xlarge = pd.read_csv(data_path, delimiter=',', header=None).sample(max_lines, random_state = 0)            # choose a sample of size 200000
    #test_xlarge = pd.read_csv("test.csv", delimiter=',', header=None)
    training_xlarge.columns = [ 'Class', 'Title', 'Sentence']
    training_xlarge['index'] = training_xlarge.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    training_xlarge = preprocess_pandas(training_xlarge, columns)                             # pre-process
    test_size = val_size
    training_data_xl, validation_data_xl, training_labels_xl, validation_labels_xl = train_test_split( # split the data into training, validation, and test splits
        training_xlarge['Sentence'].values.astype('U'),
        training_xlarge['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )
    training_labels_xl -= 1                       # mapping from {1, 2} to {0, 1}
    validation_labels_xl -= 1

    # vectorize data using TFIDF and transform for PyTorch for scalability
    training_data_xl = word_vectorizer.transform(training_data_xl)        # transform texts to sparse matrix
    training_data_xl = training_data_xl.todense()                             # convert to dense matrix for Pytorch
    validation_data_xl = word_vectorizer.transform(validation_data_xl)
    validation_data_xl = validation_data_xl.todense()
    training_data_xl = torch.from_numpy(np.array(training_data_xl)).type(torch.FloatTensor)
    training_labels_xl = torch.from_numpy(np.array(training_labels_xl)).long()
    validation_data_xl = torch.from_numpy(np.array(validation_data_xl)).type(torch.FloatTensor)
    validation_labels_xl = torch.from_numpy(np.array(validation_labels_xl)).long()
    
    return training_data_xl, validation_data_xl, training_labels_xl, validation_labels_xl

In [10]:
import time
def test(network, word_vectorizer):
    start = time.time()
    training_data, validation_data, training_labels, validation_labels = load_data_EXTRALARGE("test.csv", word_vectorizer, max_lines = 10001, val_size = 10000)
    #validation_data = pad(validation_data)
    
    ### your prediction
    #here goes the result:
    network.to("cpu")
    outputs = network.forward(validation_data)
    predicted_class_ids_validation = torch.argmax(outputs,1)
    # evaluation
    end = time.time()
    
    print("Distribution of the sentiments in the test dataset:")
    print("positive: ", sum(validation_labels == 1)/len(validation_labels), "\t negative: ", 1-sum(validation_labels == 1)/len(validation_labels))
    print("Accuracy:")
    print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))
    TP = sum( (validation_labels == predicted_class_ids_validation) & (validation_labels == 1) )
    FP = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 1) )
    FN = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 0) )
    F1 = 2*TP / (2*TP + FP + FN)
    
    print("Precision: \t", TP/(TP+FP))
    print("Recall: \t", TP/(TP+FN))
    print("F1 Score: \t", F1)
    
    print("with regard to positive sentiments.")
    print("Prediction runtime: \t", end - start)

In [21]:
test(best_model, vectorizer)

Distribution of the sentiments in the test dataset:
positive:  tensor(0.5235) 	 negative:  tensor(0.4765)
Accuracy:
tensor(0.5005)
Precision: 	 tensor(0.0534)
Recall: 	 tensor(0.8750)
F1 Score: 	 tensor(0.1007)
with regard to positive sentiments.
Prediction runtime: 	 73.82173180580139


In [11]:
test(best_model2, vectorizer2)

Distribution of the sentiments in the test dataset:
positive:  tensor(0.5235) 	 negative:  tensor(0.4765)
Accuracy:
tensor(0.8571)
Precision: 	 tensor(0.8817)
Recall: 	 tensor(0.8508)
F1 Score: 	 tensor(0.8660)
with regard to positive sentiments.
Prediction runtime: 	 79.41915130615234
