In [1]:
import sys
sys.path.insert(1, '../..')

import torch
import torch.nn as nn
import random
import pandas as pd
import numpy as np
import time
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Seed every random number generator used here (stdlib, NumPy, PyTorch) with
# the same value so that reruns are repeatable.
SEED = 33
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from library.evaluation import ConfusionMatrix

# Identifiers for this run. Neither name is referenced in the cells visible
# here; presumably they tag saved artefacts further down — TODO confirm.
dataset_name = "Phemernr2-RNR"
unique_name = "RoBERTa_Finetuned_Check_Finetune"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Comma-delimited text file holding one feature vector per row (the file name
# suggests fine-tuned RoBERTa embeddings).
vectors_path = "../../data/vectors/40_Epoch_Phemernr2-RNR_RoBERTa_base_finetuned_vectors_check_finetune.txt"
vectors = np.loadtxt(vectors_path, delimiter=",")
vectors.shape

(6425, 768)

In [3]:
# Tweet table: text, label, and the train/validation/test assignment columns.
dataset_csv = "../../data/phemernr2_dataset_with_tvt.csv"
data = pd.read_csv(dataset_csv, sep=",")
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt2,tvt2_1
0,552833795142209536,the east london mosque would like to offer its...,non-rumours,training,training
1,580318210609696769,breaking - a germanwings airbus a320 plane rep...,true,validation,testting
2,552798891994009601,reports that two of the dead in the #charliehe...,true,training,training
3,576790814942236672,after #putin disappeared russian tv no longer ...,non-rumours,validation,training
4,499678822598340608,saw #ferguson for myself. #justiceformichaelbr...,non-rumours,testting,testting


In [4]:
# Collapse the three veracity classes into a single 'rumours' class so the task
# becomes binary (rumours vs. non-rumours).
# Both spellings of "unverified" are listed: 'unverfied' is the value this
# notebook has always matched, and 'unverified' is accepted as well so that a
# correctly spelled label is not left untouched and then encoded as the
# non-rumour class by the `else` branch of the label-encoding cell.
RUMOUR_LABELS = ['true', 'unverified', 'unverfied', 'false']
data['label'] = data['label'].replace(RUMOUR_LABELS, 'rumours')

In [5]:
# Class names ordered to match the integer encoding used in the next cell:
# position 0 -> 'rumours', position 1 -> 'non-rumours'.
labels_str = ['rumours', 'non-rumours']
labels_str

['rumours', 'non-rumours']

In [6]:
# Encode the binary target: 0 for "rumours", 1 for every other label value.
labels = [0 if label == "rumours" else 1 for label in data['label']]
labels[:10]

[1, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [7]:
# Every split below indexes `vectors` and `labels` by row position, so all
# three containers must describe the same rows in the same order.
assert len(vectors) == len(labels) == len(data), "vectors/labels/data are misaligned"

# One boolean mask per split, taken from the 'tvt2' column ('testting' is the
# spelling used in the CSV). The masks are positional, so the split does not
# depend on `data` carrying a default 0..n-1 index, and the column is compared
# in one vectorised pass per split rather than row by row.
labels_array = np.array(labels)
train_mask = (data['tvt2'] == 'training').to_numpy()
val_mask = (data['tvt2'] == 'validation').to_numpy()
test_mask = (data['tvt2'] == 'testting').to_numpy()

train_vectors = vectors[train_mask]
val_vectors = vectors[val_mask]
test_vectors = vectors[test_mask]

train_labels = labels_array[train_mask]
val_labels = labels_array[val_mask]
test_labels = labels_array[test_mask]

In [8]:
# Report the shape of each split: the three feature matrices first, then the
# three label arrays.
for split_array in (train_vectors, val_vectors, test_vectors,
                    train_labels, val_labels, test_labels):
    print(split_array.shape)

(4336, 768)
(1462, 768)
(627, 768)
(4336,)
(1462,)
(627,)


In [9]:
# Training inputs for the classifier: features as float32, targets as int64
# class indices (the criterion configured below is nn.CrossEntropyLoss).
X = torch.tensor(train_vectors, dtype=torch.float32)
y = torch.tensor(train_labels, dtype=torch.long)

In [10]:
# Sanity check: show the training feature tensor.
print(X)

tensor([[-0.5399,  0.6523, -0.1750,  ...,  1.5126, -0.2599, -1.1257],
        [ 0.5139, -0.1890,  0.3052,  ..., -0.8881,  0.4728,  0.7124],
        [-0.4865,  0.4412, -0.2613,  ...,  1.6282, -0.1829, -0.7105],
        ...,
        [ 0.4678, -0.1784,  0.3317,  ..., -0.9926,  0.0118,  0.8226],
        [-0.4078,  0.4802, -0.2571,  ...,  1.3258, -0.1092, -1.0044],
        [-0.4353,  0.5631, -0.2498,  ...,  1.1105, -0.2882, -1.3151]])


In [11]:
# Sanity check: show the training target tensor.
print(y)

tensor([1, 0, 1,  ..., 0, 1, 1])


In [12]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import matplotlib.pyplot as plt
import skorch
import os
from typing import Callable


class NNModel(nn.Module):
    """Fully connected classifier head.

    Each hidden layer is ``Linear -> LeakyReLU -> Dropout``; a final ``Linear``
    maps to ``n_output`` raw scores (no softmax is applied here). With the
    default arguments the layer widths are
    ``n_input -> 256 -> 128 -> 64 -> 32 -> n_output``.

    Args:
        n_input: Width of each input feature vector.
        n_output: Number of output units.
        hidden_sizes: Widths of the hidden layers, in order. May be empty, in
            which case the model is a single ``Linear(n_input, n_output)``.
        dropout: Drop probability used after every hidden activation.
        negative_slope: Negative slope of every ``LeakyReLU``.
    """

    def __init__(
        self,
        n_input: int,
        n_output: int = 1,
        hidden_sizes: tuple = (256, 128, 64, 32),
        dropout: float = 0.5,
        negative_slope: float = 0.1,
    ):
        super().__init__()

        # Build the stack in order so that module indices inside `self.main`
        # (and therefore state_dict keys such as "main.0.weight") are stable.
        layers = []
        width_in = n_input
        for width_out in hidden_sizes:
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.LeakyReLU(negative_slope))
            layers.append(nn.Dropout(p=dropout))
            width_in = width_out
        layers.append(nn.Linear(width_in, n_output))

        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return the raw output scores for a batch of feature vectors."""
        return self.main(input)


# Instantiate the network: the input width is the number of columns in
# train_vectors, and there are two output units (one per entry in labels_str).
NNmodeling = NNModel(train_vectors.shape[1], n_output=2)


In [13]:
from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV
# from dask_searchcv import GridSearchCV


# Adam hyper-parameters. The `optimizer__` prefix is how skorch routes a
# setting to the optimizer's constructor.
adam_kwargs = {
    "optimizer__lr": 4e-5,
    "optimizer__betas": (0.5, 0.999),
    "optimizer__weight_decay": 1e-5,
}

# scikit-learn-compatible wrapper around the PyTorch module, so it can be used
# with GridSearchCV.
model = NeuralNetClassifier(
    module=NNmodeling,
    criterion=nn.CrossEntropyLoss,
    optimizer=optim.Adam,  # pass the optimizer class, not an instance
    verbose=False,
    **adam_kwargs,
)

# model = torch.nn.DataParallel(model)
# cudnn.benchmark = True

In [14]:
# Hyper-parameter grid: every combination of batch size and epoch count.
param_grid = dict(
    batch_size=[256, 512],
    max_epochs=[10, 50, 100, 500, 1000],
)

# 3-fold cross-validated exhaustive search over the grid, using all cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
)
grid_result = grid.fit(X, y)

# Summary: the best combination first, then one line per grid point.
print("Best Mean Test Score: %f using parameters %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean Test Score: %f, Standard Deviation: %f, with parameters: %r" % (mean, stdev, param))

Best Mean Test Score: 0.935194 using parameters {'batch_size': 512, 'max_epochs': 1000}
Mean Test Score: 0.891373, Standard Deviation: 0.004518, with parameters: {'batch_size': 256, 'max_epochs': 10}
Mean Test Score: 0.891373, Standard Deviation: 0.004518, with parameters: {'batch_size': 256, 'max_epochs': 50}
Mean Test Score: 0.891373, Standard Deviation: 0.004518, with parameters: {'batch_size': 256, 'max_epochs': 100}
Mean Test Score: 0.928274, Standard Deviation: 0.003122, with parameters: {'batch_size': 256, 'max_epochs': 500}
Mean Test Score: 0.934270, Standard Deviation: 0.003408, with parameters: {'batch_size': 256, 'max_epochs': 1000}
Mean Test Score: 0.891142, Standard Deviation: 0.004635, with parameters: {'batch_size': 512, 'max_epochs': 10}
Mean Test Score: 0.891373, Standard Deviation: 0.004518, with parameters: {'batch_size': 512, 'max_epochs': 50}
Mean Test Score: 0.891373, Standard Deviation: 0.004518, with parameters: {'batch_size': 512, 'max_epochs': 100}
Mean Test S