In [1]:
import sys
sys.path.insert(1, '../..')

import pandas as pd
import numpy as np

In [2]:
# Load the Twitter16 dataset that already carries the train/validation/test split columns.
# lineterminator="\n" is set explicitly — presumably so that stray "\r" characters inside
# tweet text are not treated as row breaks (TODO confirm against the raw file).
data = pd.read_csv("../../data/processed/twitter16_dataset_with_tvt.csv", lineterminator="\n")
# Preview the first rows to check the columns that the following cells rely on.
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2,tvt2_1,tvt2_2,tvt2_3
0,656955120626880512,correct predictions in back to the future ii U...,false,training,1,training,validation,training,training,validation
1,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,true,training,3,training,training,validation,training,training
2,613404935003217920,cops bought the alleged church shooter burger ...,false,training,2,test,training,training,training,training
3,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,3,test,training,training,training,training
4,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training,validation,training,training


In [3]:
# Collapse the veracity labels into a binary rumour / non-rumour target.
# Every tweet whose veracity is "unverified", "true" or "false" counts as a rumour;
# any other label value (including a missing one) is a non-rumour.
is_rumour = data['label'].isin(["unverified", "true", "false"])

# Kept as a plain list (as before) and assigned positionally, so the new column does not
# depend on the dataframe having a default 0..n-1 index.
label_rnr = is_rumour.map({True: "rumours", False: "non-rumours"}).tolist()

data['label_rnr'] = label_rnr
data.head()

Unnamed: 0,tweet_id,tweet_text,label,tvt,cv_fold,tt,tvt2,tvt2_1,tvt2_2,tvt2_3,label_rnr
0,656955120626880512,correct predictions in back to the future ii U...,false,training,1,training,validation,training,training,validation,rumours
1,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,true,training,3,training,training,validation,training,training,rumours
2,613404935003217920,cops bought the alleged church shooter burger ...,false,training,2,test,training,training,training,training,rumours
3,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,training,3,test,training,training,training,training,rumours
4,714598641827246081,an open letter to trump voters from his top st...,unverified,training,1,test,training,validation,training,training,rumours


In [4]:
# Column used as the classification target, and the list of its distinct class names.
# The class names are taken in order of first appearance in the data, so the integer id
# of each class (its position in labels_str) depends on the row order of the dataframe.
label_type = "label_rnr"
labels_str = data[label_type].unique().tolist()
labels_str

['rumours', 'non-rumours']

In [5]:
# Encode every row's class name as its position in labels_str.
# A dict lookup over the column replaces the row-by-row iterrows() loop and the
# linear list.index() search; the resulting list of ints is the same.
label_to_id = {name: idx for idx, name in enumerate(labels_str)}
labels = [label_to_id[value] for value in data[label_type]]
labels[:10]

[0, 0, 0, 0, 0, 0, 0, 1, 1, 0]

In [6]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

# Shared tweet-aware tokenizer used by all text2* helpers below.
# reduce_len=True is NLTK's option for shortening long runs of a repeated character.
tokenizer = TweetTokenizer(reduce_len=True)


def text2unigrams(text):
    """Tokenise a tweet and return its filtered unigram tokens.

    The text is first reduced to ASCII (non-ASCII characters are dropped), then split
    with the module-level ``tokenizer``. A token is discarded when it is found inside
    ``string.punctuation`` (a substring test, so short punctuation runs such as "()"
    are dropped too) or when it is one of the placeholder/quote tokens listed below.

    Args:
        text: the tweet text as a ``str``.

    Returns:
        list of the remaining tokens, in their original order.
    """
    ascii_text = text.encode('ascii', 'ignore').decode('utf8')
    skipped_tokens = ['URL', '‘', '’']

    return [
        token
        for token in tokenizer.tokenize(ascii_text)
        if token not in string.punctuation and token not in skipped_tokens
    ]


def text2bigrams(text):
    """Return the bigrams of a tweet as space-joined strings.

    Tokenisation and filtering are delegated to ``text2unigrams`` so that unigrams,
    bigrams and trigrams are always built from exactly the same token sequence
    (previously the same pipeline was copy-pasted into each helper).

    Args:
        text: the tweet text as a ``str``.

    Returns:
        list of ``"w1 w2"`` strings for every pair of adjacent tokens, in order;
        empty when the tweet has fewer than two tokens.
    """
    tokens = text2unigrams(text)
    # zip over the sequence and its one-step shift yields the adjacent pairs.
    return [' '.join(pair) for pair in zip(tokens, tokens[1:])]


def text2trigrams(text):
    """Return the trigrams of a tweet as space-joined strings.

    Tokenisation and filtering are delegated to ``text2unigrams`` so that unigrams,
    bigrams and trigrams are always built from exactly the same token sequence
    (previously the same pipeline was copy-pasted into each helper).

    Args:
        text: the tweet text as a ``str``.

    Returns:
        list of ``"w1 w2 w3"`` strings for every run of three adjacent tokens, in
        order; empty when the tweet has fewer than three tokens.
    """
    tokens = text2unigrams(text)
    # zip over the sequence and its one- and two-step shifts yields the adjacent triples.
    return [' '.join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]


def bigrams_vectors_generation(texts, vector_terms=None):
    """Build one binary bigram-presence vector per text.

    Each vector has ``len(vector_terms) + 1`` entries: position ``i`` is 1.0 when the
    i-th vocabulary bigram occurs in the text, and the extra last position is 1.0
    when the text contains at least one bigram that is not in the vocabulary.

    Args:
        texts: iterable of tweet texts.
        vector_terms: sequence of bigram strings forming the vocabulary. When omitted,
            the notebook-level global ``bigram_vector_base`` is used, which is what
            this function always did before (that global must then exist, otherwise
            a NameError is raised at call time).

    Returns:
        list of lists of floats, one inner list per text, in input order.
    """
    if vector_terms is None:
        vector_terms = bigram_vector_base

    # Map each term to its first position (same answer as list.index, without the scan).
    term_position = {}
    for position, term in enumerate(vector_terms):
        term_position.setdefault(term, position)
    oov_position = len(vector_terms)

    bigram_vectors = []
    for text in texts:
        init_vec = [0.0] * (oov_position + 1)
        for bgr in text2bigrams(text):
            init_vec[term_position.get(bgr, oov_position)] = 1.0
        bigram_vectors.append(init_vec)

    return bigram_vectors


def custom_vectors_generation(texts, vector_terms):
    """Build one binary n-gram-presence vector per text over a mixed vocabulary.

    Both the bigrams and the trigrams of each text are looked up in ``vector_terms``.
    Each vector has ``len(vector_terms) + 1`` entries: position ``i`` is 1.0 when the
    i-th vocabulary term occurs in the text, and the extra last position is 1.0 when
    the text contains at least one bigram or trigram that is not in the vocabulary.

    Args:
        texts: iterable of tweet texts.
        vector_terms: sequence of space-joined n-gram strings forming the vocabulary.

    Returns:
        list of lists of floats, one inner list per text, in input order.
    """
    # Map each term to its first position (same answer as list.index, but without a
    # linear scan of the vocabulary for every n-gram of every text).
    term_position = {}
    for position, term in enumerate(vector_terms):
        term_position.setdefault(term, position)
    oov_position = len(vector_terms)

    vectors = []
    for text in texts:
        init_vec = [0.0] * (oov_position + 1)
        for ngram in text2bigrams(text) + text2trigrams(text):
            init_vec[term_position.get(ngram, oov_position)] = 1.0
        vectors.append(init_vec)

    return vectors

In [7]:
# Raw tweet texts in dataframe row order; the feature vectors built from this list
# are later matched to rows by position.
texts = data['tweet_text'].tolist()
# Disabled alternative (bigram-only features): vectors = bigrams_vectors_generation(texts)

In [8]:
import nltk
from nltk.collocations import *

top_n = 2000          # maximum number of scored n-grams kept as features later on
min_ngram_freq = 3    # n-grams seen fewer times than this are discarded before scoring
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Start from empty finders and fill their frequency tables tweet by tweet, so that no
# bigram/trigram is ever formed across the boundary between two tweets.
finder2 = BigramCollocationFinder.from_words([])
finder3 = TrigramCollocationFinder.from_words([])

for text in texts:
    # Tokenise once per tweet; the n-grams are derived from this single token list.
    unigrams = text2unigrams(text)

    # Word counts and total word count feed the PMI computation of both finders.
    finder2.word_fd.update(unigrams)
    finder2.N += len(unigrams)
    finder3.word_fd.update(unigrams)
    finder3.N += len(unigrams)

    # Count n-grams directly as token tuples. The previous join-then-split round trip
    # produced tuples of the wrong length whenever a token itself contained whitespace
    # (TweetTokenizer can emit such tokens, e.g. spaced phone numbers).
    finder2.ngram_fd.update(nltk.bigrams(unigrams))
    finder3.ngram_fd.update(nltk.trigrams(unigrams))

# Keep only bigrams and trigrams that appear at least min_ngram_freq times.
finder2.apply_freq_filter(min_ngram_freq)
finder3.apply_freq_filter(min_ngram_freq)

# Pool the (ngram, PMI) pairs of both finders and rank them by score, best first.
# NOTE(review): bigram and trigram PMI values are pooled as-is; they do not appear to be
# on the same scale (the top of the ranking is all trigrams) — confirm this is intended.
combined = list(finder2.score_ngrams(bigram_measures.pmi))
combined.extend(finder3.score_ngrams(trigram_measures.pmi))
combined = sorted(combined, key=lambda x: x[1], reverse=True)

In [9]:
# Number of distinct bigrams / trigrams left after frequency filtering.
print(len(finder2.ngram_fd), len(finder3.ngram_fd))
# Show the ten best-scoring n-grams (fewer if the list is shorter).
for k, v in combined[:10]:
    print(f"{k} - {v}")

501 275
('1991', 'book', 'quotes') - 22.928836235477128
('kissing', 'islands', 'greenland') - 22.928836235477128
('confiscates', 'several', 'thousand') - 22.758911234034812
('fda', 'confiscates', 'several') - 22.758911234034812
('niagara', 'falls', 'turned') - 22.606908140589766
('several', 'thousand', 'chickens') - 22.34387373475597
('drive', 'destruction', 'company') - 21.86994254642356
('cpl', 'nathan', 'cirillo') - 21.536518812698368
('blacks', '1991', 'book') - 21.513798736198282
('signers', 'stock', 'sinks') - 21.469404616839828


In [10]:
# Feature vocabulary: the top_n best-scoring n-grams, each tuple joined back into a
# space-separated string so it matches the strings produced by text2bigrams / text2trigrams.
term_vector_base = [" ".join(c[0]) for c in combined[:top_n]]
# One presence vector per tweet over that vocabulary (plus one trailing out-of-vocabulary slot).
vectors = custom_vectors_generation(texts, term_vector_base)

In [11]:
# Sanity check on an arbitrary row: the vector length should be len(term_vector_base) + 1
# (the vocabulary terms plus the out-of-vocabulary slot).
len(vectors[102])

777

In [12]:
# Summary of the n-grams that make it into the feature vocabulary:
# lowest / highest score and how many bigrams vs. trigrams were selected.
selected = combined[:top_n]
selected_scores = [score for _, score in selected]

# min()/max() over the actual scores instead of the sentinels 100 and 0, which gave wrong
# answers for negative PMI values or scores above 100. None means nothing was selected.
min_score = min(selected_scores, default=None)
max_score = max(selected_scores, default=None)
n_2gram = sum(1 for ngram, _ in selected if len(ngram) == 2)
n_3gram = sum(1 for ngram, _ in selected if len(ngram) == 3)

print(min_score)
print(max_score)
print(n_2gram)
print(n_3gram)

0.6362816335444546
22.928836235477128
501
275


In [13]:
# Split features and labels according to the 'tvt2' column, using boolean masks instead
# of six separate iterrows() passes. Rows are matched by position, so vectors/labels
# must have exactly one entry per dataframe row.
assert len(vectors) == len(labels) == len(data), "vectors/labels are out of sync with data"

vectors_arr = np.asarray(vectors)
labels_arr = np.asarray(labels)

train_mask = (data['tvt2'] == 'training').to_numpy()
val_mask = (data['tvt2'] == 'validation').to_numpy()
# NOTE(review): 'testting' (sic) is kept exactly as written — presumably it is the literal
# value stored in the tvt2 column. If the column actually stores 'test', this mask matches
# nothing and the test split is silently empty; confirm against data['tvt2'].unique().
test_mask = (data['tvt2'] == 'testting').to_numpy()

train_vectors = vectors_arr[train_mask]
val_vectors = vectors_arr[val_mask]
test_vectors = vectors_arr[test_mask]

train_labels = labels_arr[train_mask]
val_labels = labels_arr[val_mask]
test_labels = labels_arr[test_mask]

In [14]:
from sklearn.svm import LinearSVC

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

# Linear SVM from scikit-learn, created with its default hyper-parameters.
svm = LinearSVC()

# Project wrapper around the estimator; the second argument looks like a display name — TODO confirm.
model = SKLearnClassification(svm, "Support Vector Machine")
print(f"\n--- START ---")
# NOTE(review): the third argument is "Phemernr2" although this notebook loads the twitter16
# dataset — presumably a dataset/run tag copied from another notebook; confirm what train() uses it for.
model.train(train_vectors, train_labels, "Phemernr2")

print("Validation Set")
preds = model.predict(val_vectors)

# Compare validation predictions with the true labels and print the report;
# binary=True presumably selects the two-class evaluation — TODO confirm.
conf_mat = ConfusionMatrix(
    labels=val_labels,
    predictions=preds,
    binary=True
)
conf_mat.evaluate()

print("--- END ---\n")


--- START ---
---> execution time : 0.0 seconds
Validation Set
Binary Class Evaluation

True Positive : 30
False Positive : 26
False Negative : 15
True Negative : 121

Class positive Evaluation
- Precision : 53.571 %
- Recall : 66.667 %
- F1 : 0.59406

Class negative Evaluation
- Precision : 88.971 %
- Recall : 82.313 %
- F1 : 0.85512

Combined Evaluation
- Accuracy : 78.646 %
- Precision : 71.271 %
- Recall : 74.49 %
- F1 : 0.72845
- Average Confidence : 100.0 %
Model, Combined,,,,positive,,,negative,,,
Anonymous, 78.646, 71.271, 74.49, 0.72845, 53.571, 66.667, 0.59406, 88.971, 82.313, 0.85512, 
--- END ---



In [15]:
from scipy.sparse import csr_matrix, issparse

# Learned weights of the fitted estimator (model.model is presumably the wrapped
# scikit-learn object — TODO confirm). `coefs` itself is left untouched so that later
# cells can still inspect it (e.g. coefs.shape).
coefs = model.model.coef_

# Extract the single coefficient row as a plain list of floats. The previous version
# computed these conversions but threw the result away, so a sparse coef_ was never
# actually densified before being zipped.
if issparse(coefs):
    coef_row = coefs.toarray()[0].tolist()
else:
    coef_row = np.asarray(coefs)[0].tolist()

# The feature vectors have one more column than term_vector_base (the trailing
# out-of-vocabulary slot), so only the first len(term_vector_base) weights have a term.
# This was previously done implicitly by zip() truncation; it is now explicit.
n_terms = len(term_vector_base)
coefs_and_features = list(zip(coef_row[:n_terms], term_vector_base))

# Most predictive overall: rank by absolute weight, largest first.
# (Use key=lambda x: x[0] instead to rank by signed weight.)
coefs_and_features = sorted(coefs_and_features, key=lambda x: abs(x[0]), reverse=True)
print(f"Total tokens : {len(coefs_and_features)}")

Total tokens : 776


In [16]:
# Shape check: the coefficient matrix should have one column per feature-vector entry,
# i.e. one more than the number of vocabulary terms (the out-of-vocabulary slot).
coefs.shape

(1, 777)

In [17]:
# NOTE(review): this displays the entire (weight, term) list; a slice such as
# coefs_and_features[:30] would keep the notebook output readable.
coefs_and_features

[(-1.1985172876629744, 'hostage situation'),
 (0.9923987525651141, 'years ago'),
 (-0.9784438804233626, 'lindt cafe'),
 (-0.912490536268636, '#opkkk #hoodsoff'),
 (-0.9124903831728407, 'el chapo'),
 (-0.9080746813090507, 'car dealership'),
 (-0.8799016426800104, '#charliehebdo attackers'),
 (-0.8551186256903766, "what's the"),
 (0.841982928032402, 'u s'),
 (-0.8408505848253083, 'from the'),
 (-0.8382345114814709, 'will be'),
 (0.8284516442857878, 'reports of'),
 (-0.8281791938410259, 'open letter'),
 (-0.821241540819541, 'charlie hebdo'),
 (-0.8212415269834239, 'to me'),
 (-0.821241510318415, 'red cross'),
 (-0.821241494366622, 'a crow'),
 (-0.8097953250508094, 'donald trump'),
 (-0.8012111804896871, 'burger king'),
 (-0.7957525482041105, 'being held'),
 (0.7948051743987319, 'a dog'),
 (-0.7896552782725876, 'about the'),
 (0.7896046123827893, 'jeb bush'),
 (0.771223874280771, 'prime minister'),
 (-0.7624823913111787, 'bag charge'),
 (-0.7522403423985748, 'was a'),
 (-0.7501675632216137

In [18]:
# The ten terms with the largest absolute weight (the list is sorted by |weight|, descending).
coefs_and_features[:10]

[(-1.1985172876629744, 'hostage situation'),
 (0.9923987525651141, 'years ago'),
 (-0.9784438804233626, 'lindt cafe'),
 (-0.912490536268636, '#opkkk #hoodsoff'),
 (-0.9124903831728407, 'el chapo'),
 (-0.9080746813090507, 'car dealership'),
 (-0.8799016426800104, '#charliehebdo attackers'),
 (-0.8551186256903766, "what's the"),
 (0.841982928032402, 'u s'),
 (-0.8408505848253083, 'from the')]

In [19]:
# The ten terms with the smallest absolute weight, i.e. the least informative ones.
coefs_and_features[-10:]

[(0.0, 'house is'),
 (0.0, 'colors to'),
 (0.0, 'of steve'),
 (0.0, 'father of'),
 (0.0, 'a day'),
 (0.0, 'the father'),
 (0.0, 'a muslim'),
 (0.0, 'jobs the'),
 (0.0, 'of this'),
 (0.0, 'at a')]

In [20]:
# NOTE(review): same expression as the preceding cell (ten smallest absolute weights);
# this repeated cell looks redundant and can probably be removed.
coefs_and_features[-10:]

[(0.0, 'house is'),
 (0.0, 'colors to'),
 (0.0, 'of steve'),
 (0.0, 'father of'),
 (0.0, 'a day'),
 (0.0, 'the father'),
 (0.0, 'a muslim'),
 (0.0, 'jobs the'),
 (0.0, 'of this'),
 (0.0, 'at a')]

In [22]:
n_best = 750

# Take the first n_best terms of coefs_and_features (the front of that list holds the
# largest absolute weights); only the term string of each (weight, term) pair is kept.
# Slicing from the end instead ([-n_best:]) would select the weakest terms.
best_tokens = [cf[1] for cf in coefs_and_features[:n_best]]

print(best_tokens[:10])

# Persist the selected terms, one per line.
with open("../../data/processed/twitter16-rnr_best_terms.txt", "w") as f:
    f.writelines(token + "\n" for token in best_tokens)

['hostage situation', 'years ago', 'lindt cafe', '#opkkk #hoodsoff', 'el chapo', 'car dealership', '#charliehebdo attackers', "what's the", 'u s', 'from the']
