In [1]:
import sys
sys.path.insert(1, '../..')

import pandas as pd
import numpy as np

In [2]:
# Load the processed dataset CSV; rows are split on "\n" only.
dataset_path = "../../data/processed/liarpantsfire_dataset.csv"
data = pd.read_csv(dataset_path, lineterminator="\n")
data.head()

Unnamed: 0,id,statement,label,tvt2,tvt2_1,tvt2_2,tvt2_3
0,2635.json,Says the Annies List political group supports ...,false,validation,validation,training,training
1,10540.json,When did the decline of coal start? It started...,half-true,training,training,validation,training
2,324.json,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,training,training,validation,validation
3,1123.json,Health care reform legislation is likely to ma...,false,training,validation,testting,training
4,9028.json,The economic turnaround started at the end of ...,half-true,training,training,testting,training


In [3]:
# Rough length statistics; a "word" here is any chunk between single spaces.
statements = list(data['statement'])
comp = [s.count(" ") + 1 for s in statements]  # equals len(s.split(" "))
n_statements = len(comp)
print("Average words : ", sum(comp) / n_statements)
print("Max words : ", max(comp))
print("Data : ", n_statements)

Average words :  17.991947463059965
Max words :  302
Data :  12791


In [4]:
# Name of the column holding the class label, and the distinct label strings
# found in it (pandas returns them in order of first appearance).
label_type = "label"
labels_str = pd.unique(data[label_type]).tolist()
labels_str

['false', 'half-true', 'mostly-true', 'true', 'barely-true', 'pants-fire']

In [5]:
# Encode each row's label string as its position in labels_str.
labels = [labels_str.index(label_value) for label_value in data[label_type]]
labels[:10]

[0, 1, 2, 0, 1, 3, 4, 1, 1, 2]

In [6]:
import string
import nltk
from nltk.tokenize import TweetTokenizer

# Shared tokenizer instance used by the text2* helpers defined below.
# NOTE(review): reduce_len=True presumably shortens long runs of a repeated
# character - confirm against the NLTK TweetTokenizer documentation.
tokenizer = TweetTokenizer(reduce_len=True)


def text2unigrams(text):
    """Tokenize ``text`` into a filtered list of unigram tokens.

    Non-ASCII characters are dropped first, then the module-level
    ``tokenizer`` splits the text. A token is discarded when it occurs inside
    ``string.punctuation`` (a substring test against that string) or when it
    equals 'URL', '‘' or '’'.

    Args:
        text: The raw statement string.

    Returns:
        list: The surviving tokens, in their original order.
    """
    ascii_text = text.encode('ascii', 'ignore').decode('utf8')
    ignored_tokens = ['URL', '‘', '’']

    unigrams = []
    for token in tokenizer.tokenize(ascii_text):
        if token in string.punctuation:
            continue
        if token in ignored_tokens:
            continue
        unigrams.append(token)

    return unigrams


def text2bigrams(text):
    """Return the bigrams of ``text`` after the shared token clean-up.

    The tokenization and filtering are delegated to ``text2unigrams`` instead
    of being copy-pasted here, so the n-gram helpers cannot drift apart.

    Args:
        text: The raw statement string.

    Returns:
        The result of ``nltk.bigrams`` over the cleaned tokens; each item is a
        pair of adjacent tokens.
        NOTE(review): this looks like a lazy, single-use iterator - wrap it in
        ``list()`` if it has to be traversed more than once; confirm against
        the NLTK docs.
    """
    return nltk.bigrams(text2unigrams(text))


def text2trigrams(text):
    """Return the trigrams of ``text`` after the shared token clean-up.

    The tokenization and filtering are delegated to ``text2unigrams`` instead
    of being copy-pasted here, so the n-gram helpers cannot drift apart.

    Args:
        text: The raw statement string.

    Returns:
        The result of ``nltk.trigrams`` over the cleaned tokens; each item is
        a triple of adjacent tokens.
        NOTE(review): this looks like a lazy, single-use iterator - wrap it in
        ``list()`` if it has to be traversed more than once; confirm against
        the NLTK docs.
    """
    return nltk.trigrams(text2unigrams(text))


def custom_vectors_generation(texts, vector_terms):
    """Build binary n-gram presence vectors for a collection of texts.

    Every vector has ``len(vector_terms) + 1`` entries. Entry ``i`` is 1.0
    when ``vector_terms[i]`` occurs among the text's bigrams or trigrams. The
    final entry is 1.0 when the text contains at least one bigram or trigram
    that is not listed in ``vector_terms``.

    Args:
        texts: Iterable of raw text strings.
        vector_terms: Sequence of n-gram tuples that defines the vector
            layout. Terms must be hashable; if a term is repeated, its first
            position is the one that gets set.

    Returns:
        list[list[float]]: One vector per text, in input order.
    """
    # Resolve each term to its first position once, so every n-gram lookup is
    # a dict access instead of two linear scans of the list (`in` + `.index`).
    term_index = {}
    for position, term in enumerate(vector_terms):
        term_index.setdefault(term, position)

    # NOTE(review): nearly every text has some n-gram outside the vocabulary,
    # so this trailing feature is close to constant - confirm it is wanted.
    unknown_slot = len(vector_terms)

    vectors = []
    for text in texts:
        init_vec = [0.0] * (unknown_slot + 1)
        for ngram_source in (text2bigrams(text), text2trigrams(text)):
            for ngram in ngram_source:
                init_vec[term_index.get(ngram, unknown_slot)] = 1.0
        vectors.append(init_vec)

    return vectors

In [7]:
# Raw statement strings, one per dataset row, used for n-gram extraction.
texts = list(data['statement'])

In [8]:
import nltk
from nltk.collocations import *

top_n = 2000
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Start from empty finders and fill their frequency tables one statement at a
# time, so that no n-gram spans the boundary between two statements.
finder2 = BigramCollocationFinder.from_words([])
finder3 = TrigramCollocationFinder.from_words([])

for text in texts:
    # Tokenize once per statement. text2bigrams/text2trigrams run the same
    # clean-up as text2unigrams before calling nltk.bigrams/nltk.trigrams, so
    # deriving the n-grams from `unigrams` gives the same counts without
    # tokenizing every statement three times.
    unigrams = text2unigrams(text)

    # FreqDist is a Counter: update() adds one count per item, so no explicit
    # zero-initialisation of unseen keys is needed.
    finder2.word_fd.update(unigrams)
    finder3.word_fd.update(unigrams)
    finder2.ngram_fd.update(nltk.bigrams(unigrams))
    finder3.ngram_fd.update(nltk.trigrams(unigrams))

# The finders were created from an empty word list, so their total word count
# N has to be set by hand (previously incremented once per token).
finder2.N = finder2.word_fd.N()
finder3.N = finder3.word_fd.N()

# Keep only bigrams and trigrams that appear at least 3 times.
finder2.apply_freq_filter(3)
finder3.apply_freq_filter(3)

# Pool the (ngram, PMI score) pairs of both finders and rank them by score,
# highest first.
# NOTE(review): bigram and trigram PMI are ranked on one common scale here and
# trigrams end up dominating the top of the list - confirm this is intended.
combined = sorted(
    [
        *finder2.score_ngrams(bigram_measures.pmi),
        *finder3.score_ngrams(trigram_measures.pmi),
    ],
    key=lambda scored: scored[1],
    reverse=True,
)

In [9]:
# Number of distinct bigrams / trigrams held by the finders, followed by the
# ten best-scoring n-grams of the combined ranking.
print(len(finder2.ngram_fd), len(finder3.ngram_fd))
preview_size = 10
count = min(preview_size, len(combined))
for ngram, score in combined[:count]:
    print(f"{ngram} - {score}")

12752 5950
('Ku', 'Klux', 'Klan') - 32.47277595228941
('Child', 'Left', 'Behind') - 31.735810358123206
('Portland-Milwaukie', 'Light', 'Rail') - 31.32077285884436
('Minister', 'Benjamin', 'Netanyahu') - 31.320772858844357
('Prime', 'Minister', 'Benjamin') - 31.320772858844357
('Religious', 'Freedom', 'Restoration') - 31.05773845301057
('Rio', 'Grande', 'Valley') - 30.887813451568256
('Leticia', 'Van', 'de') - 30.05773845301057
('Van', 'de', 'Putte') - 30.05773845301057
('Catherine', 'Cortez', 'Masto') - 29.612953610337673


In [10]:
# The top_n best-scoring n-grams define the feature layout of the vectors.
term_vector_base = [ngram for ngram, _score in combined[:top_n]]
vectors = custom_vectors_generation(texts, term_vector_base)

In [11]:
# Sanity check on one arbitrarily chosen row: a vector has one entry per
# vocabulary term plus one trailing out-of-vocabulary slot.
len(vectors[102])

2001

In [12]:
# Summarise the top_n selected terms: score range and bigram/trigram mix.
# min()/max() are taken over the actual scores instead of starting from the
# sentinels 100 and 0, which would give wrong answers if every score were
# above 100 or below 0.
selected_terms = combined[:top_n]
selected_scores = [score for _ngram, score in selected_terms]

min_score = min(selected_scores, default=float("nan"))
max_score = max(selected_scores, default=float("nan"))
n_2gram = sum(1 for ngram, _score in selected_terms if len(ngram) == 2)
n_3gram = sum(1 for ngram, _score in selected_terms if len(ngram) == 3)

print(min_score)
print(max_score)
print(n_2gram)
print(n_3gram)

12.886311945632038
32.47277595228941
304
1696


In [13]:
# Split assignment of every row, in row order. 'testting' (sic) is the value
# actually stored in the dataset's tvt2 column, so it is matched verbatim.
split_of_row = data['tvt2'].tolist()

# vectors and labels were both built in row order; pairing them with the split
# column by position avoids six iterrows() passes and does not depend on the
# DataFrame index being 0..n-1.
assert len(vectors) == len(labels) == len(split_of_row)


def select_split(values, split_name):
    """Return the items of ``values`` whose row is assigned to ``split_name``.

    Args:
        values: Sequence aligned with the dataset rows (one item per row).
        split_name: Value of the ``tvt2`` column to keep.

    Returns:
        numpy.ndarray: The selected items, in row order.
    """
    return np.array(
        [value for value, split in zip(values, split_of_row) if split == split_name]
    )


train_vectors = select_split(vectors, 'training')
val_vectors = select_split(vectors, 'validation')
test_vectors = select_split(vectors, 'testting')

train_labels = select_split(labels, 'training')
val_labels = select_split(labels, 'validation')
test_labels = select_split(labels, 'testting')

In [14]:
# Shapes of the feature matrices, then of the label arrays, each in
# train / validation / test order.
for feature_matrix in (train_vectors, val_vectors, test_vectors):
    print(feature_matrix.shape)
for label_array in (train_labels, val_labels, test_labels):
    print(label_array.shape)

# Per-feature mean over the training rows; features are 0/1, so this is the
# fraction of training statements in which each feature is set.
train_vectors.mean(axis=0)

(8620, 2001)
(2870, 2001)
(1301, 2001)
(8620,)
(2870,)
(1301,)


array([1.16009281e-04, 3.48027842e-04, 3.48027842e-04, ...,
       1.16009281e-04, 8.12064965e-04, 1.00000000e+00])

In [25]:
from sklearn.svm import LinearSVC

from library.classification import SKLearnClassification
from library.evaluation import ConfusionMatrix

def one_hot_rows(class_ids, n_classes):
    """Turn a sequence of class ids into an array of one-hot rows.

    Args:
        class_ids: Iterable of class ids, one per sample.
        n_classes: Width of each one-hot row.

    Returns:
        numpy.ndarray: One row per sample, with 1 in the column equal to the
        sample's class id and 0 elsewhere.
    """
    return np.array(
        [[1 if column == class_id else 0 for column in range(n_classes)] for class_id in class_ids]
    )


# One-vs-rest linear SVM wrapped in the project's classification helper.
svm = LinearSVC(multi_class="ovr", max_iter=10000)
model = SKLearnClassification(svm, "Support Vector Machine")

print("\n--- START ---")
model.train(train_vectors, train_labels, "LiarPantsFire")

print("Validation Set")
preds = model.predict(val_vectors)

# The confusion matrix is given one-hot encoded targets and predictions.
n_classes = len(labels_str)
conf_mat = ConfusionMatrix(
    labels=one_hot_rows(val_labels, n_classes),
    predictions=one_hot_rows(preds, n_classes),
    binary=False
)
conf_mat.evaluate(classes=labels_str)

print("--- END ---\n")


--- START ---
---> execution time : 0.32 seconds
Validation Set
2870 vs 2870
Multi Class Evaluation

Class false Evaluation
- Precision : 19.346 %
- Recall : 64.87 %
- F1 : 0.29804

Class half-true Evaluation
- Precision : 22.569 %
- Recall : 11.304 %
- F1 : 0.15064

Class mostly-true Evaluation
- Precision : 22.727 %
- Recall : 12.774 %
- F1 : 0.16355

Class true Evaluation
- Precision : 25.0 %
- Recall : 7.782 %
- F1 : 0.11869

Class barely-true Evaluation
- Precision : 21.03 %
- Recall : 10.448 %
- F1 : 0.1396

Class pants-fire Evaluation
- Precision : 23.377 %
- Recall : 7.965 %
- F1 : 0.11881

Combined Evaluation
- Accuracy : 20.592 %
- Precision : 22.342 %
- Recall : 19.19 %
- F1 : 0.20646

- Average Confidence : 100.0 %
Model, Combined,,,,false,,,half-true,,,mostly-true,,,true,,,barely-true,,,pants-fire,,,
Anonymous, 20.592, 22.342, 19.19, 0.20646, 19.346, 64.87, 0.29804, 22.569, 11.304, 0.15064, 22.727, 12.774, 0.16355, 25.0, 7.782, 0.11869, 21.03, 10.448, 0.1396, 23.377, 7.96

In [16]:
from scipy.sparse import csr_matrix

# Coefficient matrix of the fitted estimator held by the wrapper.
# NOTE(review): model.model is presumably the underlying scikit-learn
# estimator - confirm against library.classification.
coefs = model.model.coef_

# Densify sparse coefficients so rows can be indexed and iterated below. The
# previous code computed the conversion in both branches but threw the result
# away, so a sparse coef_ would have reached zip() unconverted.
if isinstance(coefs, csr_matrix):
    coefs = coefs.toarray()

# NOTE(review): only one row of coef_ is ranked. With a one-vs-rest model that
# row belongs to a single class (index 0 of labels_str), so this is not an
# "overall" importance across classes - confirm this is intended.
target_class = 0

# zip() stops at the shorter input, which drops the trailing
# out-of-vocabulary column that has no entry in term_vector_base.
coefs_and_features = list(zip(coefs[target_class], term_vector_base))

# Rank terms by absolute weight, strongest first.
coefs_and_features = sorted(coefs_and_features, key=lambda x: abs(x[0]), reverse=True)
print(f"Total tokens : {len(coefs_and_features)}")

Total tokens : 2000


In [17]:
# Shape of the coefficient matrix taken from the fitted model.
coefs.shape

(6, 2001)

In [18]:
# NOTE(review): this displays the entire ranked list; a slice such as
# coefs_and_features[:20] would keep the notebook output short.
coefs_and_features

[(1.486268592731033, ('Attorney', 'General', 'Eric')),
 (1.414297692532734, ('Supreme', 'Court', 'nominee')),
 (1.4103088208769277, ('wildly', 'unpopular')),
 (1.4089777018878118, ('Terry', 'McAuliffe')),
 (1.4012763618629411, ('For', 'every', 'one')),
 (1.3928195472247873, ('takeover', 'of', 'healthcare')),
 (1.3721011660292852, ('babies', 'born', 'in')),
 (1.3692931475036962, ('and', 'your', 'doctor')),
 (1.3563602197005926, ('278,000', 'per', 'job')),
 (1.3499320430781183, ('voted', 'six', 'times')),
 (1.3113094503109817, ('Ted', 'Cruz', 'is')),
 (1.3108328282994202, ('Sept', '11', '2001')),
 (1.2958573295196896, ('talking', 'about', 'how')),
 (1.2938145271514832, ('Dan', 'Branch')),
 (-1.2879146957779373, ('turn', 'Medicare', 'into')),
 (1.2780069366911997, ('Says', 'Ted', 'Strickland')),
 (1.2780069353539512, ('Megyn', 'Kelly')),
 (1.2780068848155206, ('biggest', 'tax', 'increase')),
 (1.2686073537841924, ('socialized', 'medicine')),
 (1.264859513657529, ('changed', 'his', 'positi

In [19]:
# Ten terms with the largest absolute weight.
coefs_and_features[:10]

[(1.486268592731033, ('Attorney', 'General', 'Eric')),
 (1.414297692532734, ('Supreme', 'Court', 'nominee')),
 (1.4103088208769277, ('wildly', 'unpopular')),
 (1.4089777018878118, ('Terry', 'McAuliffe')),
 (1.4012763618629411, ('For', 'every', 'one')),
 (1.3928195472247873, ('takeover', 'of', 'healthcare')),
 (1.3721011660292852, ('babies', 'born', 'in')),
 (1.3692931475036962, ('and', 'your', 'doctor')),
 (1.3563602197005926, ('278,000', 'per', 'job')),
 (1.3499320430781183, ('voted', 'six', 'times'))]

In [20]:
# Ten terms with the smallest absolute weight.
coefs_and_features[-10:]

[(0.0, ('citizenship', 'for', 'illegal')),
 (0.0, ('are', 'unemployed', 'or')),
 (0.0, ('total', 'amount', 'of')),
 (0.0, ('Bank', 'of', 'America')),
 (0.0, ('where', 'you', 'can')),
 (0.0, ('childhood', 'obesity')),
 (0.0, ('standing', 'committees')),
 (0.0, ('when', 'I', 'took')),
 (0.0, ('pot', 'shops')),
 (0.0, ('current', 'health', 'care'))]

In [21]:
# NOTE(review): same expression as the previous cell (ten terms with the
# smallest absolute weight) - likely a leftover that can be removed.
coefs_and_features[-10:]

[(0.0, ('citizenship', 'for', 'illegal')),
 (0.0, ('are', 'unemployed', 'or')),
 (0.0, ('total', 'amount', 'of')),
 (0.0, ('Bank', 'of', 'America')),
 (0.0, ('where', 'you', 'can')),
 (0.0, ('childhood', 'obesity')),
 (0.0, ('standing', 'committees')),
 (0.0, ('when', 'I', 'took')),
 (0.0, ('pot', 'shops')),
 (0.0, ('current', 'health', 'care'))]

In [26]:
n_best = 750

# Keep the n_best first entries of the ranked list, dropping the weights.
best_tokens = [feature for _weight, feature in coefs_and_features[:n_best]]

print(best_tokens[:10])

# One term per line, its tokens joined by a single space.
with open("../../data/processed/liarpantsfire_best_terms.txt", "w") as f:
    f.writelines(" ".join(token) + "\n" for token in best_tokens)

[('Attorney', 'General', 'Eric'), ('Supreme', 'Court', 'nominee'), ('wildly', 'unpopular'), ('Terry', 'McAuliffe'), ('For', 'every', 'one'), ('takeover', 'of', 'healthcare'), ('babies', 'born', 'in'), ('and', 'your', 'doctor'), ('278,000', 'per', 'job'), ('voted', 'six', 'times')]
