In [1]:
import sys
sys.path.insert(1, '../..')

import pandas as pd
import numpy as np

In [2]:
# Load the Twitter15 tweets; the CSV already carries the split-assignment
# columns (tvt, cv_fold, tt, tvt2) used further down.
# lineterminator="\n" makes only "\n" end a record.
dataset_csv = "../../data/processed/twitter15_dataset_with_tvt.csv"
data = pd.read_csv(dataset_csv, lineterminator="\n")
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,1,training,training
1,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,testting
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,training,2,training,training
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,training,1,training,validation
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,training,3,training,training


In [3]:
# Bigram distribution sheet: one row per bigram and one column per class
# (presumably per-class occurrence counts -- confirm against the workbook).
ngram_xlsx = '../../data/processed/twitter15_ngram_distribution.xlsx'
bigram_columns = ["token", "unverified", "non-rumor", "true", "false"]

bigram_data = pd.read_excel(ngram_xlsx, sheet_name='bigram')
bigram_data.columns = bigram_columns
bigram_data.head()

Unnamed: 0,token,unverified,non-rumor,true,false
0,ca kkk,1,0,0,0
1,kkk grand,1,0,0,0
2,grand wizard,1,0,0,0
3,wizard endorses,1,0,0,0
4,endorses @hillaryclinton,1,0,0,0


In [4]:
# Ordered bigram vocabulary: a bigram's position in this list is the feature
# index it gets in the binary bigram vectors.
bigram_vector_base = bigram_data.loc[:, 'token'].tolist()
print(len(bigram_vector_base))
bigram_vector_base[10]

13842


'to trump'

In [5]:
# Collapse the four-way label into a binary rumour / non-rumour label.
# np.where evaluates the whole column at once (no per-row iterrows loop) and
# its result is assigned positionally, so it does not rely on `data` having a
# default 0..n-1 index the way aligning a freshly built pd.Series would.
data['label_rnr'] = np.where(data['label'] == "non-rumor", "non-rumours", "rumours")

# Plain-list copy kept under the original name for any cell that reads it.
label_rnr = data['label_rnr'].tolist()
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2,label_rnr
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,1,training,training,rumours
1,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,testting,rumours
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,training,2,training,training,non-rumours
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,training,1,training,validation,non-rumours
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,training,3,training,training,rumours


In [6]:
# Distinct class names in order of first appearance in the dataset.
labels_str = data['label'].drop_duplicates().tolist()
labels_str

['unverified', 'non-rumor', 'true', 'false']

In [7]:
# One-hot encode each row's label: the 1 sits at the label's position in
# labels_str, every other slot is 0. The result is a plain list of lists.
class_position = {name: pos for pos, name in enumerate(labels_str)}
n_classes = len(labels_str)

labels = []
for row_label in data['label']:
    hot = class_position[row_label]
    labels.append([int(pos == hot) for pos in range(n_classes)])
labels[:10]

[[1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 1, 0, 0],
 [0, 1, 0, 0],
 [0, 0, 1, 0],
 [0, 1, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1]]

In [8]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

# Module-level tweet tokenizer that text2bigrams (defined below) reads as a
# global; reduce_len=True asks NLTK to shorten long runs of a repeated character.
tokenizer = TweetTokenizer(reduce_len=True)

def text2bigrams(text):
    """Turn one tweet into its list of space-joined token bigrams.

    The text is reduced to ASCII (characters that cannot be encoded are
    dropped), split with the module-level ``tokenizer``, filtered, and every
    pair of adjacent surviving tokens is joined with a single space.

    Args:
        text: Tweet text as a ``str``.

    Returns:
        list[str]: Bigrams in order of appearance; empty when fewer than two
        tokens survive the filtering.
    """
    ascii_text = text.encode('ascii', 'ignore').decode('utf8')
    ignored_tokens = ['URL', '‘', '’']

    # `tok not in string.punctuation` is a substring test against the
    # punctuation string, so a multi-character run such as '()' is dropped
    # along with single punctuation marks.
    words = [
        tok
        for tok in tokenizer.tokenize(ascii_text)
        if tok not in string.punctuation and tok not in ignored_tokens
    ]

    return [' '.join(pair) for pair in nltk.bigrams(words)]


def bigrams_vectors_generation(texts, vector_base=None):
    """Encode texts as binary bigram-presence vectors.

    Each vector has ``len(vector_base) + 1`` slots. Slot ``k`` is 1.0 when the
    text contains the bigram at position ``k`` of the vocabulary; the extra
    last slot is 1.0 when the text contains at least one bigram that is not
    in the vocabulary. All other slots are 0.0.

    Args:
        texts: Iterable of tweet strings; each is split by ``text2bigrams``.
        vector_base: Ordered bigram vocabulary. Defaults to the notebook-level
            ``bigram_vector_base`` when omitted, so existing one-argument
            calls keep working.

    Returns:
        list[list[float]]: One vector per input text, in input order.
    """
    if vector_base is None:
        vector_base = bigram_vector_base

    # Build the bigram -> position lookup once. The previous implementation
    # scanned the vocabulary list twice (`in` then `.index`) for every bigram
    # of every text. setdefault keeps the first position of a repeated entry,
    # which is what list.index returned.
    position_of = {}
    for position, bigram in enumerate(vector_base):
        position_of.setdefault(bigram, position)

    width = len(vector_base) + 1
    bigram_vectors = []
    for text in texts:
        vec = [0.0] * width
        for bgr in text2bigrams(text):
            # Unknown bigrams fall through to -1, i.e. the trailing
            # out-of-vocabulary slot.
            vec[position_of.get(bgr, -1)] = 1.0
        bigram_vectors.append(vec)

    return bigram_vectors

In [9]:
# Binary bigram-presence features for every tweet.
# Both `texts` and `vectors` are rebound by the TF-IDF cell further down, so
# when the notebook is run top to bottom the train/validation/test split is
# taken from the TF-IDF matrix, not from this one.
texts = data['tweet_text'].tolist()
vectors = np.asarray(bigrams_vectors_generation(texts))
vectors.shape

(1490, 13843)

In [10]:
import string
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

raw_texts = data['tweet_text'].tolist()
ignored_tokens = ['URL', '‘', '’']

# Tokenise the ASCII-only form of each tweet, then drop punctuation tokens
# (substring test against string.punctuation) and the ignored tokens.
tokenizer = TweetTokenizer(reduce_len=True)
texts = [tokenizer.tokenize(text.encode('ascii', 'ignore').decode('utf8')) for text in raw_texts]
texts = [
    [t for t in text if t not in string.punctuation and t not in ignored_tokens]
    for text in texts
]

# One list of space-joined bigrams per tweet.
tokens = [[' '.join(pair) for pair in nltk.bigrams(text)] for text in texts]
print(tokens[:5])

corpus = tokens

def identity_tokenizer(text):
    """Return the document unchanged: each corpus entry is already a list of bigrams."""
    return text

# stop_words is deliberately not set. Every token handed to the vectorizer is
# a space-joined bigram, so a single-word stop list can never match one; the
# option filtered nothing and only made scikit-learn warn about stop words
# being inconsistent with the tokenizer.
# NOTE(review): the vectorizer is fitted on every row, including the rows the
# next cell assigns to validation and test via `tvt2`, so the vocabulary and
# IDF weights are informed by held-out tweets. Consider fitting on the
# training rows only and calling transform() on the rest.
vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
vectors = vectorizer.fit_transform(corpus)
# Dense copy: the split cell indexes rows of a plain ndarray.
vectors = vectors.toarray()
print(vectors.shape)

[['ca kkk', 'kkk grand', 'grand wizard', 'wizard endorses', 'endorses @hillaryclinton', '@hillaryclinton #neverhillary', '#neverhillary #trump2016'], ['an open', 'open letter', 'letter to', 'to trump', 'trump voters', 'voters from', 'from his', 'his top', 'top strategist-turned-defector', 'strategist-turned-defector via', 'via @xojanedotcom'], ['america is', 'is a', 'a nation', 'nation of', 'of second', 'second chances', 'chances @potus', '@potus on', 'on new', 'new reforms', 'reforms to', 'to solitary', 'solitary confinement'], ['brandon marshall', 'marshall visits', 'visits and', 'and offers', 'offers advice', 'advice support', 'support to', 'to brother', 'brother of', 'of fallen', 'fallen hero', 'hero zaevion', 'zaevion dobson'], ['rip elly', 'elly may', 'may clampett', 'clampett so', 'so sad', 'sad to', 'to learn', 'learn #beverlyhillbillies', '#beverlyhillbillies star', 'star donna', 'donna douglas', 'douglas has', 'has passed', 'passed away']]
(1490, 13842)




In [11]:
# Partition the feature rows and their one-hot labels by the `tvt2` column.
# One boolean mask per split replaces six separate iterrows() passes and
# selects rows by position, so the result no longer depends on `data` having
# a default 0..n-1 index. An empty split now has shape (0, n_features)
# instead of (0,).
# 'testting' (sic) is the spelling stored in the dataset's tvt2 column.
assert len(vectors) == len(labels) == len(data), "features/labels are out of sync with data"

split_of_row = data['tvt2'].values
label_matrix = np.array(labels)

train_mask = split_of_row == 'training'
val_mask = split_of_row == 'validation'
test_mask = split_of_row == 'testting'

train_vectors = vectors[train_mask]
val_vectors = vectors[val_mask]
test_vectors = vectors[test_mask]

train_labels = label_matrix[train_mask]
val_labels = label_matrix[val_mask]
test_labels = label_matrix[test_mask]

In [12]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

# NOTE(review): the CSV loaded at the top of this notebook is the twitter15
# dataset, yet the name handed to train() is "Phemernr". What train() does
# with this name is not visible from here -- confirm it is intended before
# relying on anything keyed by it.
dataset_name = "Phemernr"

# Fixed seed so the stochastic estimators give the same scores on every
# Restart & Run All.
RANDOM_STATE = 42
SVM_NAME = "Support Vector Machine"

random_forest = RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE)
neigh = KNeighborsClassifier(n_neighbors=5, weights="distance")
svm = LinearSVC(class_weight="balanced", random_state=RANDOM_STATE)

models = [
    SKLearnClassification(random_forest, "Random Forest"),
    SKLearnClassification(neigh, "K-Nearest Neighbor"),
    SKLearnClassification(svm, SVM_NAME),
]


def to_one_hot(class_indices):
    """Expand predicted class indices into one-hot rows, one column per entry of labels_str."""
    return np.array([[1 if p == idx else 0 for idx in range(len(labels_str))] for p in class_indices])


# Evaluated in this order for every model.
eval_splits = [
    ("Validation Set", val_vectors, val_labels),
    ("Test Set", test_vectors, test_labels),
]

for model in models:
    print(f"\n--- {model.model_name.upper()} ---")
    is_svm = model.model_name == SVM_NAME

    if is_svm:
        # The SVM is trained on class indices (position of the 1 in each
        # one-hot row); the other models receive the one-hot rows directly.
        y = train_labels.argmax(axis=1).tolist()
        model.train(train_vectors, y, dataset_name)
    else:
        model.train(train_vectors, train_labels, dataset_name)

    for split_name, split_vectors, split_labels in eval_splits:
        print(split_name)
        preds = model.predict(split_vectors)
        if is_svm:
            # Bring the SVM's class-index predictions back to the one-hot
            # layout of the reference labels.
            preds = to_one_hot(preds)

        conf_mat = ConfusionMatrix(
            labels=split_labels,
            predictions=preds,
            binary=False
        )
        conf_mat.evaluate(labels_str)

    print("--- END ---\n")


--- RANDOM FOREST ---
---> execution time : 5.92 seconds
Validation Set
328 vs 328
Multi Class Evaluation

Class unverified Evaluation
- Precision : 64.516 %
- Recall : 24.39 %
- F1 : 0.35398

Class non-rumor Evaluation
- Precision : 35.747 %
- Recall : 97.531 %
- F1 : 0.52318

Class true Evaluation
- Precision : 100.0 %
- Recall : 62.353 %
- F1 : 0.76812

Class false Evaluation
- Precision : 95.652 %
- Recall : 27.5 %
- F1 : 0.42718

Combined Evaluation
- Accuracy : 53.049 %
- Precision : 73.979 %
- Recall : 52.944 %
- F1 : 0.61718

- Average Confidence : 95.73 %
Model, Combined,,,,unverified,,,non-rumor,,,true,,,false,,,
Anonymous, 53.049, 73.979, 52.944, 0.61718, 64.516, 24.39, 0.35398, 35.747, 97.531, 0.52318, 100.0, 62.353, 0.76812, 95.652, 27.5, 0.42718, 
Test Set
175 vs 175
Multi Class Evaluation

Class unverified Evaluation
- Precision : 83.333 %
- Recall : 42.553 %
- F1 : 0.56338

Class non-rumor Evaluation
- Precision : 38.596 %
- Recall : 97.778 %
- F1 : 0.55346

Class true