In [1]:
import sys
sys.path.insert(1, '../..')

import pandas as pd
import numpy as np

In [2]:
# Twitter15 tweets with their label and the split-assignment columns
# (tvt, cv_fold, tt, tvt2) that the later cells filter on.
dataset_path = "../../data/processed/twitter15_dataset_with_tvt.csv"
data = pd.read_csv(dataset_path, lineterminator="\n")
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,1,training,testting
1,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,training,2,training,training
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,training,1,training,training
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,training,3,training,training


In [3]:
# Bigram sheet of the n-gram workbook. The header read from the file is replaced
# with fixed names: the bigram itself followed by one column per original label.
ngram_workbook = '../../data/processed/twitter15_ngram_distribution.xlsx'
bigram_columns = ["token", "unverified", "non-rumor", "true", "false"]

bigram_data = pd.read_excel(ngram_workbook, sheet_name='bigram')
bigram_data.columns = bigram_columns
bigram_data.head()

Unnamed: 0,token,unverified,non-rumor,true,false
0,ca kkk,1,0,0,0
1,kkk grand,1,0,0,0
2,grand wizard,1,0,0,0
3,wizard endorses,1,0,0,0
4,endorses @hillaryclinton,1,0,0,0


In [4]:
# Vocabulary of known bigrams. Its order fixes the feature layout: the bigram at
# position i of this list is column i of every vector built from it.
bigram_vector_base = bigram_data['token'].tolist()
print(len(bigram_vector_base))
# Spot-check a single entry.
bigram_vector_base[10]

13842


'to trump'

In [5]:
# Collapse the original label into a binary target: "non-rumor" stays its own
# class ("non-rumours"); every other label value is treated as "rumours".
# Iterating the column directly avoids materialising a Series per row with
# iterrows(), and assigning the list is positional, so the result does not
# depend on how the frame's index happens to be labelled.
label_rnr = [
    "non-rumours" if label == "non-rumor" else "rumours"
    for label in data['label']
]

data['label_rnr'] = label_rnr
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2,label_rnr
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,1,training,testting,rumours
1,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training,rumours
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,training,2,training,training,non-rumours
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,training,1,training,training,non-rumours
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,training,3,training,training,rumours


In [6]:
# Name of the target column, and the distinct class names found in it.
# A class's position in `labels_str` is the integer id it is encoded as later.
label_type = "label_rnr"
labels_str = list(data[label_type].unique())
labels_str

['rumours', 'non-rumours']

In [7]:
# Encode every row's class name as its position in `labels_str`.
# A dict lookup over the column replaces iterrows() + list.index() per row;
# `labels_str` holds distinct values, so the mapping is the same.
label_to_id = {name: position for position, name in enumerate(labels_str)}
labels = [label_to_id[name] for name in data[label_type]]
labels[:10]

[0, 0, 1, 1, 0, 1, 0, 0, 0, 0]

In [8]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

# Single tokenizer instance shared by text2bigrams.
# NOTE(review): reduce_len=True is NLTK's option for shortening elongated
# character runs — see the TweetTokenizer documentation for the exact rule.
tokenizer = TweetTokenizer(reduce_len=True)

def text2bigrams(text):
    """Split a tweet into a list of bigram strings ("first second").

    The text is reduced to its ASCII characters, tokenized with the shared
    module-level ``tokenizer``, filtered, and every pair of adjacent surviving
    tokens is joined with a single space.

    Args:
        text: raw tweet text.

    Returns:
        List of bigram strings in order of appearance; empty when fewer than
        two tokens survive the filtering.
    """
    # Non-ASCII characters (emoji, curly quotes, ...) are dropped before tokenizing.
    ascii_text = text.encode('ascii', 'ignore').decode('utf8')
    ignored_tokens = ['URL', '‘', '’']

    kept_tokens = [
        token
        for token in tokenizer.tokenize(ascii_text)
        # `string.punctuation` is one string, so this is a substring test:
        # single punctuation marks are removed, while a token such as '..'
        # is not a substring of it and therefore stays.
        if token not in string.punctuation and token not in ignored_tokens
    ]

    return [' '.join(pair) for pair in nltk.bigrams(kept_tokens)]


def bigrams_vectors_generation(texts):
    """Encode each text as a binary bag-of-bigrams vector over ``bigram_vector_base``.

    Args:
        texts: iterable of raw tweet strings.

    Returns:
        A list with one vector (list of floats) per text. Every vector has
        ``len(bigram_vector_base) + 1`` entries: entry ``i`` is 1.0 when the
        bigram ``bigram_vector_base[i]`` occurs in the text, and the final
        entry is 1.0 when the text contains at least one bigram that is not
        part of the vocabulary.
    """
    # Build the bigram -> column lookup once per call instead of scanning the
    # vocabulary list twice (`in` + `.index`) for every bigram of every text.
    # setdefault keeps the first position of a repeated bigram, which is the
    # position list.index() returned.
    column_of = {}
    for column, bigram in enumerate(bigram_vector_base):
        column_of.setdefault(bigram, column)

    oov_column = len(bigram_vector_base)  # trailing out-of-vocabulary slot
    width = oov_column + 1

    bigram_vectors = []
    for text in texts:
        vector = [0.0] * width
        for bgr in text2bigrams(text):
            vector[column_of.get(bgr, oov_column)] = 1.0
        bigram_vectors.append(vector)

    return bigram_vectors

In [9]:
# Vectorise every tweet once, in frame order; the split cells further down pick
# rows out of `vectors`, so it must stay aligned with the rows of `data`.
texts = data['tweet_text'].tolist()
vectors = bigrams_vectors_generation(texts)

In [10]:
# Split assignment of every row, read once. The previous version walked the
# whole frame with iterrows() six times and indexed `vectors` / `labels` with
# the frame's index label; pairing by position gives the same result for the
# default index and stays correct if the index is ever relabelled.
split_of_row = data['tvt2'].tolist()


def _rows_in_split(rows, split_name):
    """Return, as an array, the items of `rows` whose row belongs to `split_name`."""
    return np.array([
        row for row, split in zip(rows, split_of_row) if split == split_name
    ])


# 'testting' is spelled this way in the tvt2 column of the dataset itself.
train_vectors = _rows_in_split(vectors, 'training')
val_vectors = _rows_in_split(vectors, 'validation')
test_vectors = _rows_in_split(vectors, 'testting')

train_labels = _rows_in_split(labels, 'training')
val_labels = _rows_in_split(labels, 'validation')
test_labels = _rows_in_split(labels, 'testting')

In [11]:
from sklearn.svm import LinearSVC

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

# Pin the seed: per the scikit-learn docs, LinearSVC's dual solver shuffles the
# data with a pseudo-random generator, so without a fixed random_state the
# coefficients (and the bigram ranking derived from them below) can differ
# between runs of this notebook.
svm = LinearSVC(random_state=42)

model = SKLearnClassification(svm, "Support Vector Machine")
print("\n--- START ---")
model.train(train_vectors, train_labels, "Twitter15")

# Evaluate on the validation split only; the test split is not used in this cell.
print("Validation Set")
preds = model.predict(val_vectors)

conf_mat = ConfusionMatrix(labels=val_labels, predictions=preds, binary=True)
conf_mat.evaluate()

print("--- END ---\n")


--- START ---
---> execution time : 0.05 seconds
Validation Set
Binary Class Evaluation

True Positive : 12
False Positive : 3
False Negative : 80
True Negative : 252

Class positive Evaluation
- Precision : 80.0 %
- Recall : 13.043 %
- F1 : 0.2243

Class negative Evaluation
- Precision : 75.904 %
- Recall : 98.824 %
- F1 : 0.8586

Combined Evaluation
- Accuracy : 76.081 %
- Precision : 77.952 %
- Recall : 55.934 %
- F1 : 0.65133
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
Anonymous, 76.081, 77.952, 55.934, 0.65133, 80.0, 13.043, 0.2243, 75.904, 98.824, 0.8586, 
--- END ---



In [12]:
from scipy.sparse import csr_matrix, issparse

# `coefs` keeps the raw coef_ matrix (a single row here) for later inspection.
coefs = model.model.coef_

# Previously both branches of the sparse/dense conversion threw their result
# away, so the conversion never took effect. Materialise the single coefficient
# row as a plain list of floats, whatever container coef_ comes in.
coef_matrix = coefs.toarray() if issparse(coefs) else coefs
coef_row = coef_matrix[0].tolist()

# The feature vectors have one more column than the vocabulary (the trailing
# out-of-vocabulary slot). zip() stops at the shorter input, so that last
# coefficient is left out here because it has no bigram to pair with.
coefs_and_features = list(zip(coef_row, bigram_vector_base))

# Most predictive overall: largest positive coefficient first.
coefs_and_features = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)
print(f"Total tokens : {len(coefs_and_features)}")

Total tokens : 13842


In [13]:
# Shape check: the model has one more coefficient than there are vocabulary
# bigrams, because every feature vector carries a trailing out-of-vocabulary slot.
coefs.shape

(1, 13843)

In [14]:
# Preview only: the full ranking has one entry per vocabulary bigram, and
# dumping all of it floods the notebook output.
coefs_and_features[:25]

[(0.8180774703160487, 'report spider'),
 (0.8180774390771315, 'happy #gossipgirlday'),
 (0.8180773810004512, 'happy #lefthandersday'),
 (0.8180773359911802, 'happy #nationalhugday'),
 (0.8180770925241869, 'happy #nationalchocolatecakeday'),
 (0.49084655235578784, 'oooh my'),
 (0.49084655235578784, 'my d-rose'),
 (0.43099038288864167, 'the las'),
 (0.43099038288864167, 'vegas raiders'),
 (0.35060466775159643, 'some #wednesdaywisdom'),
 (0.35060466775159643, '#wednesdaywisdom from'),
 (0.35060466775159643, 'from #thelionking'),
 (0.3506046126616214, "it wasn't"),
 (0.3506046126616214, "wasn't even"),
 (0.3506046126616214, 'even close'),
 (0.3506045373817379, "leo's first"),
 (0.3506045373817379, 'first sag'),
 (0.3506045373817379, 'sag #sagawards'),
 (0.30190934106450573, '#oscarnoms nominations'),
 (0.30190934106450573, 'nominations for'),
 (0.30190934106450573, 'for best'),
 (0.28770743797385195, 'the #instyleglobes'),
 (0.2837367054008867, 'to end'),
 (0.2726924537202929, 'congratulat

In [15]:
def custom_vectors_generation(texts, tokens):
    """Encode each text as a binary bag-of-bigrams vector over a custom vocabulary.

    Args:
        texts: iterable of raw tweet strings.
        tokens: sequence of bigram strings; position ``i`` becomes column ``i``.

    Returns:
        A list with one vector (list of floats) per text. Every vector has
        ``len(tokens) + 1`` entries: entry ``i`` is 1.0 when ``tokens[i]``
        occurs in the text, and the final entry is 1.0 when the text contains
        at least one bigram that is not in ``tokens``.
    """
    # One dict lookup per bigram instead of two linear scans of `tokens`
    # (`in` + `.index`). setdefault keeps the first position of a repeated
    # token, matching what list.index() returned.
    column_of = {}
    for column, token in enumerate(tokens):
        column_of.setdefault(token, column)

    oov_column = len(tokens)  # trailing out-of-vocabulary slot
    width = oov_column + 1

    bigram_vectors = []
    for text in texts:
        vector = [0.0] * width
        for bgr in text2bigrams(text):
            vector[column_of.get(bgr, oov_column)] = 1.0
        bigram_vectors.append(vector)

    return bigram_vectors

def custom_vectors_generation_v2(texts, tokens):
    """Encode each text as a binary bag-of-bigrams vector, without an OOV slot.

    Same encoding as ``custom_vectors_generation`` except that bigrams missing
    from ``tokens`` are ignored instead of being flagged in a trailing column.

    Args:
        texts: iterable of raw tweet strings.
        tokens: sequence of bigram strings; position ``i`` becomes column ``i``.

    Returns:
        A list with one vector (list of floats) of length ``len(tokens)`` per
        text; entry ``i`` is 1.0 when ``tokens[i]`` occurs in the text.
    """
    # One dict lookup per bigram instead of two linear scans of `tokens`
    # (`in` + `.index`). setdefault keeps the first position of a repeated
    # token, matching what list.index() returned.
    column_of = {}
    for column, token in enumerate(tokens):
        column_of.setdefault(token, column)

    width = len(tokens)

    bigram_vectors = []
    for text in texts:
        vector = [0.0] * width
        for bgr in text2bigrams(text):
            column = column_of.get(bgr)
            if column is not None:
                vector[column] = 1.0
        bigram_vectors.append(vector)

    return bigram_vectors

# start = 100
# f1s = []
# for i in range(100):
#     n_token = start + 100*i
#     print(f"Using first {n_token} Tokens ....")
#     picked_tokens = [t[1] for t in coefs_and_features[:n_token]]
    
#     vectors = custom_vectors_generation(texts, picked_tokens)
# #     vectors = custom_vectors_generation_v2(texts, picked_tokens)
    
#     train_vectors = np.array([vectors[i] for i, d in data.iterrows() if d['tvt2'] == 'training'])
#     val_vectors = np.array([vectors[i] for i, d in data.iterrows() if d['tvt2'] == 'validation'])
    
#     model = SKLearnClassification(svm, "Support Vector Machine")
#     model.train(train_vectors, train_labels, "Twitter15", logs=False)

#     preds = model.predict(val_vectors)
#     conf_mat = ConfusionMatrix(
#         labels=val_labels,
#         predictions=preds,
#         binary=True
#     )
#     conf_mat.evaluate(logs=False)
#     f1s.append(conf_mat.f1)

In [16]:
# print(max(f1s))
# f1s.index(max(f1s))

In [17]:
# import matplotlib.pyplot as plt

# x = [i*100 for i in range(len(f1s))]
# y = f1s

# plt.xlabel("Token/Bigram used")
# plt.ylabel("F1 Score")
# plt.plot(x, y)
# plt.show()

In [18]:
# import json

# best_tokens = [t[1] for t in coefs_and_features[:6100]]
# best_tokens[:10]
# 
# with open("../../data/processed/twitter15_best_bigrams.txt", "w") as f:
#     for token in best_tokens:
#         f.write(token + "\n")

In [19]:
# Head of the ranking: the ten largest (most positive) coefficients.
# NOTE(review): presumably these are the bigrams pushing toward label 1
# ('non-rumours') — confirm against LinearSVC's sign convention.
coefs_and_features[:10]

[(0.8180774703160487, 'report spider'),
 (0.8180774390771315, 'happy #gossipgirlday'),
 (0.8180773810004512, 'happy #lefthandersday'),
 (0.8180773359911802, 'happy #nationalhugday'),
 (0.8180770925241869, 'happy #nationalchocolatecakeday'),
 (0.49084655235578784, 'oooh my'),
 (0.49084655235578784, 'my d-rose'),
 (0.43099038288864167, 'the las'),
 (0.43099038288864167, 'vegas raiders'),
 (0.35060466775159643, 'some #wednesdaywisdom')]

In [20]:
# Tail of the ranking: the ten smallest (most negative) coefficients.
# NOTE(review): presumably these are the bigrams pushing toward label 0
# ('rumours') — confirm against LinearSVC's sign convention.
coefs_and_features[-10:]

[(-0.2535352418638607, 'darren wilson'),
 (-0.25569284795052605, 'tribute to'),
 (-0.2818699524610294, '#charliehebdo shooting'),
 (-0.3024949698552558, 'shot down'),
 (-0.30915359801704734, '#mh17 14'),
 (-0.30915359801704734, '14 ..'),
 (-0.3107663806203996, 'iphone 6'),
 (-0.3410508691054994, 'war memorial'),
 (-0.3911762877177825, "paul walker's"),
 (-0.49499204261390356, 'paul walker')]

In [21]:
# Number of bigrams kept from each end of the coefficient ranking.
n_best = 750

# coefs_and_features is sorted by coefficient in descending order, so its tail
# holds the most negative weights and its head the most positive ones. The tail
# goes first and the head second, the same order the file was written in before.
best_tokens = [pair[1] for pair in coefs_and_features[-n_best:]]
best_tokens += [pair[1] for pair in coefs_and_features[:n_best]]

# One bigram per line. The encoding is stated explicitly so the written file
# does not depend on the platform's default text encoding.
with open("../../data/processed/twitter15_best_bigrams.txt", "w", encoding="utf-8") as f:
    f.writelines(token + "\n" for token in best_tokens)