In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_ = df_.append({
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent[0:])
        }, ignore_index=True)
    return data

# If this is the primary file that is executed (ie not an import of another file)
if __name__ == "__main__":
    # get data, pre-process and split
    data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process
    training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )

    # vectorize data using TFIDF and transform for PyTorch for scalability
    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
    training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
    training_data = training_data.todense()                             # convert to dense matrix for Pytorch
    vocab_size = len(word_vectorizer.vocabulary_)
    validation_data = word_vectorizer.transform(validation_data)
    validation_data = validation_data.todense()
    train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
    train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
    validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
    validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

  data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters


In [9]:
# We create our own dataset to load the BoW embedded texts
class BoWDataset(torch.utils.data.Dataset):
    def __init__(self, sentence_vectors, labels):
        self.sentence_vectors = sentence_vectors
        self.labels = labels
        
    def __getitem__(self, index):
        return self.labels[index], self.sentence_vectors[index]
  
    def __len__(self):
        return len(self.labels)

In [29]:
bow_train_data = BoWDataset(train_x_tensor, train_y_tensor)
bow_test_data = BoWDataset(validation_x_tensor, validation_y_tensor)
bow_trainloader = DataLoader(bow_train_data, batch_size=300, shuffle=True)
bow_testloader = DataLoader(bow_test_data, batch_size=100, shuffle=False)

## CREATE A NEURAL NETWORK TO TRAIN
network = nn.Sequential(
    # HINT: We want our input to be the size of our BoW embedding and our output to be the different possible classes
    nn.Linear(vocab_size, 1000),
    nn.ReLU(),
    nn.Linear(1000, 50),
    nn.ReLU(),
    nn.Linear(50, 2)
)

optimizer = optim.Adam(network.parameters(), lr=0.1)
loss_function = nn.CrossEntropyLoss()
epochs = 30

for epoch in range(epochs):
    for batch_nr, (labels, data) in enumerate(bow_trainloader):
        prediction = network(data)
        loss = loss_function(prediction, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        #Print the epoch, batch, and loss
        print(
            f'\rEpoch {epoch+1} [{batch_nr+1}/{len(bow_trainloader)}] - Loss: {loss}',
            end=''
        )

Epoch 30 [3/3] - Loss: 0.036428036896143e-10

In [30]:
with torch.no_grad():
    classes = [0, 1]
    correct = 0
    total = 0
    y_pred = []
    y_true = []
    # For each batch of testing data (since the dataset is too large to run all data through the network at once)
    # Calculate the accuracy
    for batch_nr, (labels, data) in enumerate(bow_testloader):
        prediction = torch.argmax(network(data), dim=-1)
            
        for i in range(len(prediction)):
            if prediction[i] == labels[i]:
                correct += 1
            total += 1
            y_pred.append(prediction[i])
            y_true.append(labels[i])
            
    print(f'The accuracy of the network is {str(100*correct/total)[:4]}%.')

The accuracy of the network is 84.0%.


In [None]:
print(f'validation_x_tensor.size(0): {train_x_tensor.size(0)}')

In [None]:
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()
train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()


In [31]:
print(f'train_x_tensor.size(0): {train_x_tensor.size(0)}')
print(f'train_x_tensor.size(1): {train_x_tensor.size(1)}')

for i, each in enumerate(train_x_tensor[0]):
    if each > 0.0:
        print(f'First word index {i}: {each}')

train_x_tensor.size(0): 900
train_x_tensor.size(1): 7305
First word index 82: 0.1396804302930832
First word index 97: 0.20427875220775604
First word index 150: 0.20427875220775604
First word index 152: 0.21663209795951843
First word index 230: 0.0694056823849678
First word index 367: 0.19551390409469604
First word index 814: 0.21663209795951843
First word index 815: 0.21663209795951843
First word index 1457: 0.20427875220775604
First word index 1458: 0.21663209795951843
First word index 1656: 0.21663209795951843
First word index 1657: 0.21663209795951843
First word index 2074: 0.15143045783042908
First word index 2143: 0.09566876292228699
First word index 2195: 0.14092417061328888
First word index 4172: 0.09744404256343842
First word index 4183: 0.21663209795951843
First word index 4752: 0.21663209795951843
First word index 4753: 0.21663209795951843
First word index 5192: 0.16469334065914154
First word index 5194: 0.21663209795951843
First word index 5850: 0.179941326379776
First word 