In [1]:
import sys
sys.path.insert(1, '../..')

import torch
import torch.nn as nn
import random
import pandas as pd
import numpy as np

# Seed every random number generator this notebook imports. Seeding only
# Python's `random` module leaves NumPy's and PyTorch's global generators
# unseeded, so anything drawing from them would differ between runs.
random.seed(33)
np.random.seed(33)
torch.manual_seed(33)

# Identifier of the embedding variant; used to build vector/model file names.
unique_name = "SBERT_NLI_Mean"

In [2]:
# Load the comma-delimited embedding file into a 2-D array.
# Rows are later selected by dataframe row number when the splits are built.
vectors = np.loadtxt("../../data/processed/vectors/Phemernr1_SBERT_NLI_Mean_vectors.txt", delimiter=",")
# Displayed as the cell output: (number of rows, embedding width).
vectors.shape

(5802, 768)

In [3]:
# Load the dataset table (tweet text, topic, label and the train/validation/test
# assignment in the `tvt` column).
# NOTE(review): the variable is called `phemernr2` but the file read here is the
# phemernr1 dataset - the name looks like a copy/paste leftover; confirm.
phemernr2 = pd.read_csv("../../data/processed/phemernr1_dataset_with_tvt.csv", lineterminator="\n")
phemernr2.head()

Unnamed: 0,tweet_text,topic,label,tvt
0,BOMBSHELL: #Ferguson chief says the police off...,ferguson,rumours,test
1,It appears that #Ferguson PD are trying to ass...,ferguson,rumours,training
2,"All weekend ppl will be talking about the ""rob...",ferguson,rumours,test
3,Why would the officer tell #MikeBrown to get o...,ferguson,rumours,test
4,Michael Brown is the 17 yr old boy who was sho...,ferguson,rumours,training


In [4]:
# Encode each row's textual label as a one-element list holding its class id
# (rumours -> 0, non-rumours -> 1). Any other value is kept as None so the
# list stays aligned with the dataframe rows.
label_codes = {'rumours': 0, 'non-rumours': 1}
labels = [
    [label_codes[label_name]] if label_name in label_codes else None
    for label_name in phemernr2['label']
]
labels[:10]

[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]

In [5]:
# Partition the vectors and labels according to the `tvt` column.
#
# Membership is computed once per split as a positional boolean mask. The
# earlier form walked the dataframe six times with `iterrows()` and used the
# dataframe *index label* as a position into `vectors`/`labels`, which is only
# correct while the dataframe keeps its default 0..n-1 index.
tvt = phemernr2['tvt'].to_numpy()

# The three containers are matched purely by position, so a length mismatch
# would silently misalign features and labels - fail loudly instead.
assert len(tvt) == len(vectors) == len(labels), "dataframe, vectors and labels must have the same length"

is_train = tvt == 'training'
is_val = tvt == 'validation'
is_test = tvt == 'test'

# Boolean indexing returns a fresh array holding only the selected rows.
train_vectors = vectors[is_train]
val_vectors = vectors[is_val]
test_vectors = vectors[is_test]

# `labels` is a plain Python list (it may contain None), so filter it by
# pairing each entry with its mask value before converting to an array.
train_labels = np.array([label for label, keep in zip(labels, is_train) if keep])
val_labels = np.array([label for label, keep in zip(labels, is_val) if keep])
test_labels = np.array([label for label, keep in zip(labels, is_test) if keep])

In [6]:
# Human-readable class names; the list position matches the integer codes used
# when `labels` was built (0 = rumours, 1 = non-rumours).
label_tag = ['rumours', 'non-rumours']
label_tag

['rumours', 'non-rumours']

In [7]:
# Sanity check: print the shape of every split, feature arrays first and the
# matching label arrays after them.
for split_array in (
    train_vectors, val_vectors, test_vectors,
    train_labels, val_labels, test_labels,
):
    print(split_array.shape)

(3498, 768)
(1150, 768)
(1154, 768)
(3498, 1)
(1150, 1)
(1154, 1)


In [8]:
# Class balance per split: each line shows (distinct label values, their counts).
for split_labels in (train_labels, val_labels, test_labels):
    print(np.unique(split_labels, return_counts=True))

(array([0, 1]), array([1175, 2323]))
(array([0, 1]), array([416, 734]))
(array([0, 1]), array([381, 773]))


In [9]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import matplotlib.pyplot as plt
import time
import os
from typing import Callable


class NNClassifier(nn.Module):
    """Fully connected classifier bundled with its loss and Adam optimizer.

    The network stacks four ``Linear`` + ``LeakyReLU(0.1)`` stages
    (``n_input`` -> 512 -> 512 -> 256 -> 128) followed by a ``Linear`` layer to
    ``n_output`` units and a ``Sigmoid``.

    Args:
        n_input: Width of each input vector.
        n_output: Number of output units (default 1).
        criterion: Loss *class* (not instance); it is instantiated with no
            arguments.
        beta1: First Adam beta coefficient (the second is fixed at 0.999).
        lr: Adam learning rate.
        device: ``'cpu'`` or ``'cuda'``. Any other value (including ``None``)
            selects CUDA when available and CPU otherwise.

    Note:
        ``train_eval`` uses ``ConfusionMatrix`` and ``plt`` from the notebook
        namespace; both must be imported before it is called.
    """

    def __init__(self,
        n_input: int,
        n_output: int = 1,
        criterion: Callable = nn.BCELoss,
        beta1: float = 0.5,
        lr: float = 0.0002,
        device: str = None
    ):
        super(NNClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(n_input, 512),
            nn.LeakyReLU(0.1),
            nn.Linear(512, 512),
            nn.LeakyReLU(0.1),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.1),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.1),
            nn.Linear(128, n_output),
            nn.Sigmoid()
        )
        self.criterion = criterion()
        if not device or device not in ['cpu', 'cuda']:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        self.model = self.model.to(self.device)
        if self.device == 'cuda':
            # On CUDA the network is wrapped, so its state-dict keys gain a
            # "module." prefix (handled in `load_pretrained`).
            self.model = torch.nn.DataParallel(self.model)
            cudnn.benchmark = True

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=(beta1, 0.999))

    def forward(self, input):
        """Run the network on ``input`` and return its sigmoid outputs."""
        return self.model(input)

    def load_pretrained(self, filepath: str, key: str = "net", is_parallel: bool = False):
        """Load weights stored under ``checkpoint[key]`` in ``filepath``.

        Checkpoints written from a ``DataParallel``-wrapped model have keys
        prefixed with ``"module."``, while an unwrapped model expects bare
        keys. Because loading is non-strict, such a mismatch used to be
        ignored silently and the model kept its random weights. The keys are
        therefore normalised to whatever form the current model uses.

        Args:
            filepath: Path of the checkpoint file.
            key: Entry of the checkpoint dict that holds the state dict.
            is_parallel: Wrap the model in ``DataParallel`` before loading
                (skipped when it is already wrapped, to avoid a double wrap).
        """
        # map_location lets a checkpoint saved on GPU be opened on a CPU-only host.
        checkpoint = torch.load(filepath, map_location=self.device)
        if is_parallel and not isinstance(self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        prefix = "module."
        model_is_wrapped = isinstance(self.model, torch.nn.DataParallel)
        state_dict = {}
        for name, tensor in checkpoint[key].items():
            bare_name = name[len(prefix):] if name.startswith(prefix) else name
            state_dict[prefix + bare_name if model_is_wrapped else bare_name] = tensor

        outcome = self.model.load_state_dict(state_dict, strict=False)
        if outcome.missing_keys or outcome.unexpected_keys:
            # Non-strict loading is kept for compatibility, but never silently.
            print(f"Warning: checkpoint/model key mismatch - "
                  f"missing: {list(outcome.missing_keys)}, unexpected: {list(outcome.unexpected_keys)}")

    def _compute_loss(self, outputs, targets):
        """Evaluate the criterion, retrying with integer targets on failure."""
        try:
            return self.criterion(outputs, targets)
        except Exception:
            # Deliberate best-effort: some criteria reject float targets, so
            # retry once with integer targets; a second failure propagates.
            return self.criterion(outputs, targets.long())

    def train_eval(self,
        train_x, train_y,
        test_x, test_y,
        n_iter: int = 100,
        batch_size: int = 128,
        saves: str = None
    ):
        """Train for ``n_iter`` epochs, evaluating on the held-out set each epoch.

        Args:
            train_x, train_y: Training inputs and targets (tensors).
            test_x, test_y: Evaluation inputs and targets (tensors).
            n_iter: Number of epochs; at least one epoch always runs.
            batch_size: Mini-batch size for the training loader.
            saves: Checkpoint base name. When given, the weights are written to
                ``../../data/models/{saves}.pth`` every time the evaluation
                accuracy improves.

        Returns:
            None. Progress is printed and a loss curve is plotted at the end.

        Note:
            The logged "Train Loss" is the sum of the per-batch criterion
            values, whereas "Test Loss" is a single criterion value over the
            whole evaluation set, so the two curves are not on the same scale.
        """
        trainset = torch.utils.data.TensorDataset(train_x, train_y)
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size)

        train_losses = []
        test_losses = []

        print(f"Using {self.device}")
        best_test_acc = 0
        # Log roughly 20 times per run; clamp to 1 so n_iter < 10 does not
        # produce a modulo-by-zero.
        log_every = max(1, round(n_iter / 20))
        epoch = 0
        while True:
            epoch += 1
            self.model.train()
            train_loss = 0
            for inputs, targets in trainloader:
                self.model.zero_grad()
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                outputs = self.model(inputs)
                loss = self._compute_loss(outputs, targets)
                loss.backward()
                self.optimizer.step()

                train_loss += loss.item()

            train_losses.append(train_loss)

            self.model.eval()
            with torch.no_grad():
                inputs, targets = test_x.to(self.device), test_y.to(self.device)
                outputs = self.model(inputs)
                # Same float->long fallback as in training, so a criterion
                # that trains successfully cannot crash here.
                test_loss = self._compute_loss(outputs, targets).item()

                # Reuse the forward pass above instead of predicting again.
                conf_mat = ConfusionMatrix(
                    labels=test_y,
                    predictions=[p[0] for p in outputs.cpu().numpy()],
                    binary=True
                )
                conf_mat.evaluate(logs=False)
                test_acc = conf_mat.accuracy

            test_losses.append(test_loss)

            if epoch % log_every == 0:
                print(f"-- Epoch {epoch}, Train Loss : {train_loss}, Test Loss : {test_loss}")

            # Save a checkpoint whenever the evaluation accuracy improves.
            if saves and test_acc > best_test_acc:
                print(f"Saving after new best accuracy : {test_acc}")
                best_test_acc = test_acc

                checkpoint_path = f"../../data/models/{saves}.pth"
                # Create the directory that is actually written to.
                os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
                torch.save({'net': self.model.state_dict()}, checkpoint_path)

            if epoch >= n_iter:
                break

        # Visualise the loss per epoch.
        fig, ax2 = plt.subplots(1)
        plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=1.5, wspace=0.4)

        ax2.plot(range(len(train_losses)), train_losses, c='b', marker="o", label='Train Loss')
        ax2.plot(range(len(test_losses)), test_losses, c='r', marker="o", label='Test Loss')
        ax2.set_ylabel('Loss')
        ax2.set_xlabel('Epoch')
        ax2.set_xlim(0, len(train_losses))
        ax2.set_ylim(min([min(train_losses), min(test_losses)])*0.1, max([max(train_losses), max(test_losses)]))
        ax2.title.set_text(f"Loss over time (epoch)")
        ax2.legend(loc='lower right')

        plt.show()

    def predict(self, input_x):
        """Return the network outputs for ``input_x`` without tracking gradients.

        ``input_x`` may be an array-like or a tensor; it is converted to a
        float32 tensor and moved to the model's device before the forward
        pass. The result stays on that device.
        """
        self.model.eval()
        with torch.no_grad():
            inputs = torch.as_tensor(input_x, dtype=torch.float32).to(self.device)
            return self.model(inputs)

In [10]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

dataset_name = "Phemernr1"

# Baseline estimators, each wrapped in the project's SKLearnClassification helper.
logres_model = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr', max_iter=10000)
neigh = KNeighborsClassifier(n_neighbors=3)
svm = LinearSVC()

models = [
    SKLearnClassification(logres_model, "Logistic Regression"),
    SKLearnClassification(neigh, "K-Nearest Neighbor"),
    SKLearnClassification(svm, "Support Vector Machine"),
]

# Every model is scored on the same two held-out splits, in this order.
evaluation_splits = (
    ("Validation Set", val_vectors, val_labels),
    ("Test Set", test_vectors, test_labels),
)

# NOTE(review): the captured output shows warnings raised from inside the fit
# calls; possibly because the labels are (n, 1) column vectors - confirm.
for model in models:
    print(f"\n--- {model.model_name.upper()} ---")
    model.train(train_vectors, train_labels, dataset_name)

    for split_title, split_vectors, split_labels in evaluation_splits:
        print(split_title)
        preds = model.predict(split_vectors)

        conf_mat = ConfusionMatrix(
            labels=split_labels,
            predictions=preds,
            binary=True
        )
        conf_mat.evaluate()

    print("--- END ---\n")


--- LOGISTIC REGRESSION ---


  return f(*args, **kwargs)


---> execution time : 1.98 seconds
Validation Set
Binary Class Evaluation

True Positive : 643
False Positive : 116
False Negative : 91
True Negative : 300

Class positive Evaluation
- Precision : 84.717 %
- Recall : 87.602 %
- F1 : 0.86135

Class negative Evaluation
- Precision : 76.726 %
- Recall : 72.115 %
- F1 : 0.74349

Combined Evaluation
- Accuracy : 82.0 %
- Precision : 80.722 %
- Recall : 79.859 %
- F1 : 0.80288
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
Anonymous, 82.0, 80.722, 79.859, 0.80288, 84.717, 87.602, 0.86135, 76.726, 72.115, 0.74349, 
Test Set
Binary Class Evaluation

True Positive : 690
False Positive : 110
False Negative : 83
True Negative : 271

Class positive Evaluation
- Precision : 86.25 %
- Recall : 89.263 %
- F1 : 0.8773

Class negative Evaluation
- Precision : 76.554 %
- Recall : 71.129 %
- F1 : 0.73741

Combined Evaluation
- Accuracy : 83.276 %
- Precision : 81.402 %
- Recall : 80.196 %
- F1 : 0.80794
- Average Confidence : 10

  return self._fit(X, y)


Binary Class Evaluation

True Positive : 643
False Positive : 92
False Negative : 91
True Negative : 324

Class positive Evaluation
- Precision : 87.483 %
- Recall : 87.602 %
- F1 : 0.87543

Class negative Evaluation
- Precision : 78.072 %
- Recall : 77.885 %
- F1 : 0.77978

Combined Evaluation
- Accuracy : 84.087 %
- Precision : 82.778 %
- Recall : 82.743 %
- F1 : 0.8276
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
Anonymous, 84.087, 82.778, 82.743, 0.8276, 87.483, 87.602, 0.87543, 78.072, 77.885, 0.77978, 
Test Set
Binary Class Evaluation

True Positive : 663
False Positive : 80
False Negative : 110
True Negative : 301

Class positive Evaluation
- Precision : 89.233 %
- Recall : 85.77 %
- F1 : 0.87467

Class negative Evaluation
- Precision : 73.236 %
- Recall : 79.003 %
- F1 : 0.7601

Combined Evaluation
- Accuracy : 83.536 %
- Precision : 81.234 %
- Recall : 82.386 %
- F1 : 0.81806
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
A

  return f(*args, **kwargs)


---> execution time : 4.03 seconds
Validation Set
Binary Class Evaluation

True Positive : 637
False Positive : 127
False Negative : 97
True Negative : 289

Class positive Evaluation
- Precision : 83.377 %
- Recall : 86.785 %
- F1 : 0.85047

Class negative Evaluation
- Precision : 74.87 %
- Recall : 69.471 %
- F1 : 0.7207

Combined Evaluation
- Accuracy : 80.522 %
- Precision : 79.124 %
- Recall : 78.128 %
- F1 : 0.78623
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
Anonymous, 80.522, 79.124, 78.128, 0.78623, 83.377, 86.785, 0.85047, 74.87, 69.471, 0.7207, 
Test Set
Binary Class Evaluation

True Positive : 684
False Positive : 132
False Negative : 89
True Negative : 249

Class positive Evaluation
- Precision : 83.824 %
- Recall : 88.486 %
- F1 : 0.86092

Class negative Evaluation
- Precision : 73.669 %
- Recall : 65.354 %
- F1 : 0.69263

Combined Evaluation
- Accuracy : 80.849 %
- Precision : 78.746 %
- Recall : 76.92 %
- F1 : 0.77822
- Average Confidence : 1



In [13]:
# NOTE(review): the banner says "Multiclass", but the classifier is built with
# BCELoss and the default single output unit - confirm the intended wording.
print("Multiclass Classification using 4-Layer Linear Network")
model_name = f"Phemernr1_4LayerNet_{unique_name}"
model = NNClassifier(train_vectors.shape[1], criterion=nn.BCELoss)

# Training is disabled; the weights come from the saved checkpoint below.
# The call that would retrain and overwrite that checkpoint:
# model.train_eval(torch.Tensor(train_vectors),
#                 torch.Tensor(train_labels),
#                 torch.Tensor(val_vectors),
#                 torch.Tensor(val_labels),
#                 saves=model_name,
#                 n_iter=1000,
#                 batch_size=256)

model.load_pretrained(f"../../data/models/{model_name}.pth")


def _predict_and_score(split_title, split_vectors, split_labels):
    """Predict one split and return (numpy predictions, unevaluated ConfusionMatrix)."""
    print(split_title)
    raw_preds = model.predict(split_vectors)
    print(f"Predictions : {raw_preds.shape}")

    split_preds = raw_preds.cpu().numpy()

    matrix = ConfusionMatrix(
        labels=split_labels,
        # Each prediction row holds a single value; take that value.
        predictions=[row[0] for row in split_preds],
        binary=True,
        model_name=model_name
    )
    return split_preds, matrix


preds, conf_mat = _predict_and_score("\nValidation Set", val_vectors, val_labels)
conf_mat.evaluate()

preds, conf_mat = _predict_and_score("\nTest Set", test_vectors, test_labels)
conf_mat.evaluate()

Multiclass Classification using 4-Layer Linear Network

Validation Set
Predictions : torch.Size([1150, 1])
Binary Class Evaluation

True Positive : 670
False Positive : 101
False Negative : 64
True Negative : 315

Class positive Evaluation
- Precision : 86.9 %
- Recall : 91.281 %
- F1 : 0.89037

Class negative Evaluation
- Precision : 83.113 %
- Recall : 75.721 %
- F1 : 0.79245

Combined Evaluation
- Accuracy : 85.652 %
- Precision : 85.007 %
- Recall : 83.501 %
- F1 : 0.84247
- Average Confidence : 92.67 %
Model, Combined,,,,positive,,,negative,,,
Phemernr1_4LayerNet_SBERT_NLI_Mean, 85.652, 85.007, 83.501, 0.84247, 86.9, 91.281, 0.89037, 83.113, 75.721, 0.79245, 

Test Set
Predictions : torch.Size([1154, 1])
Binary Class Evaluation

True Positive : 711
False Positive : 104
False Negative : 62
True Negative : 277

Class positive Evaluation
- Precision : 87.239 %
- Recall : 91.979 %
- F1 : 0.89547

Class negative Evaluation
- Precision : 81.711 %
- Recall : 72.703 %
- F1 : 0.76944

Comb