In [1]:
import sys
sys.path.insert(1, '../..')

import torch
import torch.nn as nn
import random
import pandas as pd
import numpy as np
import time

# Seed every RNG that the imports above expose so that reruns are repeatable.
# `random.seed` alone does not touch NumPy's global generator (which
# scikit-learn estimators fall back to when no `random_state` is given) nor
# PyTorch's generator.
SEED = 33
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from library.evaluation import ConfusionMatrix

# Tag identifying the feature source used in this notebook. It is not
# referenced in the cells visible here — presumably used further down to name
# saved artifacts or results (TODO confirm).
unique_name = "RoBERTa_Finetuned"

In [2]:
# Read the comma-delimited vector file into a 2-D float array; the bare last
# expression displays its shape (rows x features).
vectors = np.loadtxt(
    fname="../../data/processed/vectors/Phemernr2_RoBERTa_base_finetuned_vectors.txt",
    delimiter=",",
)
vectors.shape

(6425, 768)

In [3]:
# Load the tweet metadata table (labels and split-assignment columns), using
# "\n" as the record terminator, then preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="../../data/processed/phemernr2_dataset_with_tvt.csv",
    lineterminator="\n",
)
data.head()

Unnamed: 0,tweet_id,tweet_text,label,label2,topic,tvt,cv_fold,tt,tvt2
0,552833795142209536,The East London Mosque would like to offer its...,non-rumours,non-rumours,charliehebdo-all-rnr-threads,test,2,test,training
1,580318210609696769,BREAKING - A Germanwings Airbus A320 plane rep...,rumours,true,germanwings-crash-all-rnr-threads,training,3,training,training
2,552798891994009601,Reports that two of the dead in the #CharlieHe...,rumours,true,charliehebdo-all-rnr-threads,test,2,test,training
3,576790814942236672,After #Putin disappeared Russian TV no longer ...,non-rumours,non-rumours,putinmissing-all-rnr-threads,test,2,test,training
4,499678822598340608,Saw #Ferguson for myself. #justiceformichaelbr...,non-rumours,non-rumours,ferguson-all-rnr-threads,training,3,training,training


In [4]:
# Distinct `label2` values in order of first appearance; a label's position in
# this list is what later cells use as its integer class id.
labels_str = data['label2'].drop_duplicates().tolist()
labels_str

['non-rumours', 'true', 'unverified', 'false']

In [5]:
# Encode each row's `label2` string as its position in `labels_str`.
# A dict lookup over the column replaces the row-by-row `iterrows()` loop and
# the repeated linear `list.index` scan; the resulting list is the same.
label_to_index = {name: idx for idx, name in enumerate(labels_str)}
labels = [label_to_index[name] for name in data['label2']]
labels[:10]

[0, 1, 1, 0, 0, 0, 0, 2, 0, 0]

In [6]:
# `vectors`, `labels` and `data` are matched purely by row position, so verify
# they have the same length before slicing all of them with shared masks.
assert len(vectors) == len(labels) == len(data), (
    f"row count mismatch: vectors={len(vectors)}, labels={len(labels)}, data={len(data)}"
)

labels_array = np.array(labels)

# One boolean mask per split, read from the `tvt2` column in a single
# vectorised comparison each (instead of six full `iterrows()` passes).
# NOTE: 'testting' is spelled this way on purpose — the recorded run selects
# 655 rows with it, so it is the literal value stored in the CSV.
train_mask = (data['tvt2'] == 'training').to_numpy()
val_mask = (data['tvt2'] == 'validation').to_numpy()
test_mask = (data['tvt2'] == 'testting').to_numpy()

train_vectors = vectors[train_mask]
val_vectors = vectors[val_mask]
test_vectors = vectors[test_mask]

train_labels = labels_array[train_mask]
val_labels = labels_array[val_mask]
test_labels = labels_array[test_mask]

In [7]:
# Sanity check: print the shape of each split, feature matrices first and
# then the matching label arrays, one per line.
for split_array in (
    train_vectors, val_vectors, test_vectors,
    train_labels, val_labels, test_labels,
):
    print(split_array.shape)

(4299, 768)
(1471, 768)
(655, 768)
(4299,)
(1471,)
(655,)


In [8]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

dataset_name = "Phemernr"


def one_hot(class_ids, num_classes):
    """Return an ``(n, num_classes)`` int array with a 1 at each row's class id.

    Args:
        class_ids: sequence of integer class ids, one per sample (flattened
            before encoding, so a column vector is accepted as well).
        num_classes: width of the encoding.

    Returns:
        Integer ndarray of zeros and ones. An id outside
        ``range(num_classes)`` yields an all-zero row.
    """
    ids = np.asarray(class_ids).ravel()
    return (ids[:, None] == np.arange(num_classes)).astype(int)


logres_model = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=10000)
neigh = KNeighborsClassifier(n_neighbors=7, weights="distance")
# random_state pinned (like the random forest below) so repeated runs of the
# SVM cannot differ because of its internal random number generator.
svm = LinearSVC(multi_class="ovr", max_iter=10000, random_state=0)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)

models = [
    SKLearnClassification(logres_model, "Logistic Regression"),
    SKLearnClassification(rf, "Random Forest"),
    SKLearnClassification(neigh, "K-Nearest Neighbor"),
    SKLearnClassification(svm, "Support Vector Machine"),
]

# Held-out splits every model is scored on, in reporting order.
eval_splits = [
    ("Validation", val_vectors, val_labels),
    ("Test", test_vectors, test_labels),
]

for model in models:
    print(f"\n--- {model.model_name.upper()} ---")
    model.train(train_vectors, train_labels, dataset_name)

    for split_name, split_vectors, split_labels in eval_splits:
        print(f"\n{split_name} Set")
        preds = model.predict(split_vectors)

        # ConfusionMatrix is handed one-hot rows for both the ground truth and
        # the predictions, exactly as the inline comprehensions did before.
        conf_mat = ConfusionMatrix(
            labels=one_hot(split_labels, len(labels_str)),
            predictions=one_hot(preds, len(labels_str)),
            binary=False,
            model_name=f"{model.model_name.upper()} {split_name}"
        )
        conf_mat.evaluate(classes=labels_str)

    print("--- END ---\n")


--- LOGISTIC REGRESSION ---
---> execution time : 5.21 seconds

Validation Set
1471 vs 1471
Multi Class Evaluation

Class non-rumours Evaluation
- Precision : 90.273 %
- Recall : 90.67 %
- F1 : 0.90471

Class true Evaluation
- Precision : 74.704 %
- Recall : 77.778 %
- F1 : 0.7621

Class unverified Evaluation
- Precision : 66.871 %
- Recall : 64.118 %
- F1 : 0.65465

Class false Evaluation
- Precision : 85.0 %
- Recall : 80.952 %
- F1 : 0.82927

Combined Evaluation
- Accuracy : 84.5 %
- Precision : 79.212 %
- Recall : 78.379 %
- F1 : 0.78793

- Average Confidence : 100.0 %
Model, Combined,,,,non-rumours,,,true,,,unverified,,,false,,,
LOGISTIC REGRESSION Validation, 84.5, 79.212, 78.379, 0.78793, 90.273, 90.67, 0.90471, 74.704, 77.778, 0.7621, 66.871, 64.118, 0.65465, 85.0, 80.952, 0.82927, 

Test Set
655 vs 655
Multi Class Evaluation

Class non-rumours Evaluation
- Precision : 88.808 %
- Recall : 88.378 %
- F1 : 0.88592

Class true Evaluation
- Precision : 71.818 %
- Recall : 73.148 %