# LIAR

In [130]:
# We imoprt LIAR/valid.tsv:
import pandas as pd

df = pd.read_csv('LIAR/train.tsv', sep='\t', header=None)

# We print the first 5 rows of the dataframe:
print(df.head())

           0            1                                                  2   \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                                   3               4                     5   \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

         6           7     8     9    

In [131]:
# What labels do we have?
print(df[1].unique())

['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']


In [132]:
# Combined 'barely-true', 'true', 'mostly-true', 'half-true' as 'true'
# Combined 'pants-fire', 'false' as 'false'

#bin_data = df.replace({'barely-true': 'true', 'true': 'true', 'mostly-true': 'true', 'half-true': 'true', 'pants-fire': 'false', 'false': 'false'})
# Replace the second column only:
bin_data = df.replace({1: {'barely-true': True, 'true': True, 'mostly-true': True, 'half-true': False, 'pants-fire': False, 'false': False}})

print(bin_data[1].unique())


[False  True]


  bin_data = df.replace({1: {'barely-true': True, 'true': True, 'mostly-true': True, 'half-true': False, 'pants-fire': False, 'false': False}})


In [133]:
# Check the distribution of the labels:
print(bin_data[1].value_counts())


1
True     5292
False    4948
Name: count, dtype: int64


In [134]:
# We start with our advanced model. We import tokenizer.pickle, and model3.h5:

import pickle
import tensorflow as tf
from tensorflow import keras

# Load tokenizer:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load model:
model = keras.models.load_model('model3.h5')

# We check the model summary:
print(model.summary())

# We check the tokenizer:
print(tokenizer)



None
<keras.src.legacy.preprocessing.text.Tokenizer object at 0x000001DD487863B0>


In [152]:
# Tokenize the sequences then pad them to a length of 800:
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(bin_data[2])

# Tokenize the sequences:
sequences = tokenizer.texts_to_sequences(bin_data[2])
padded = pad_sequences(sequences, maxlen=50, padding='post')

# We print the padded sequences:
print(padded[0])

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
10235    There are a larger number of shark attacks in ...
10236    Democrats have now become the party of the [At...
10237    Says an alternative to Social Security that op...
10238    On lifting the U.S. Cuban embargo and allowing...
10239    The Department of Veterans Affairs has a manua...
Name: 2, Length: 10240, dtype: object
[  215     1   591   176   199  2968   521 24983  4721    11  1057     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [153]:
# We make predictions and calculate the accuracy:
import numpy as np

# Make predictions:
predictions = model.predict(padded)

# We print the predictions:
print(predictions)


[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step
[[0.46995676]
 [0.31839132]
 [0.49957252]
 ...
 [0.29538363]
 [0.31497043]
 [0.4670701 ]]


In [154]:
#right now the predictions are between 0 and 1 (because of the final sigmoid). Our threshold will be 0.5:
threshold = 0.5
bin_predictions = np.where(predictions > threshold, 1, 0)

# We print the binarized predictions:
print(bin_predictions)
print(bin_predictions.sum() / len(bin_predictions))

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
0.33359375


In [155]:
# Convert from 0/1 to False/True:
bin_predictions = np.where(bin_predictions == 1, True, False)

print(bin_predictions)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [156]:
# Print a new csv file where we have just the text and the predictions:
#new_df = pd.DataFrame({'text': bin_data[2], 'predictions': bin_predictions})
new_df = pd.DataFrame({'text': bin_data[2], 'predictions': bin_predictions[:,0]})
new_df.to_csv('complex_model_predictions.csv', index=False)

In [157]:
# import complex_model_predictions.csv to check the results:
complex_preds = pd.read_csv('complex_model_predictions.csv')
print(df.head())

# classificaiton report:
from sklearn.metrics import classification_report

# using complex_prds:
print(classification_report(bin_data[1], complex_preds['predictions']))

           0            1                                                  2   \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                                   3               4                     5   \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

         6           7     8     9    

In [158]:
# We calculate the accuracy:
from sklearn.metrics import f1_score

# Accuracy:
print(f1_score(bin_data[1], bin_predictions, average='weighted'))
# Accuracy (% of true predictions):
print((bin_data[1] == bin_predictions[:,0]).sum() / len(bin_predictions))


0.4938125244633295
0.5076171875


In [None]:
# classification report:
from sklearn.metrics import classification_report



Now let's try using the simple model.

In [114]:
import pandas as pd

df = pd.read_csv('LIAR/train.tsv', sep='\t', header=None)
bin_data = df.replace({1: {'barely-true': True, 'true': True, 'mostly-true': True, 'half-true': False, 'pants-fire': False, 'false': False}})


  bin_data = df.replace({1: {'barely-true': True, 'true': True, 'mostly-true': True, 'half-true': False, 'pants-fire': False, 'false': False}})


In [47]:
# Get simple_model_importance.csv:
simple_model = pd.read_csv('simple_model_importance.csv')

print(simple_model)


            token  importance
0             plu    2.984705
1             one    0.009988
2          articl   -0.547404
3           googl   -0.006929
4               (   -0.313867
...           ...         ...
13337          ob   -3.565572
13338  fullscreen  515.382703
13339           ▪    0.525201
13340         ubu    0.000000
13341      kayfab    0.000000

[13342 rows x 2 columns]


In [48]:
# recreate token_importance by using 'token' and 'importance' to recreatge the dictionary:
token_importance = dict(zip(simple_model['token'], simple_model['importance']))

print(token_importance)

{'plu': 2.984704501522293, 'one': 0.0099884370277911, 'articl': -0.5474040284162298, 'googl': -0.0069292827066362, '(': -0.3138669828984257, 'thank': 1.3453378916441352, 'ali': 16.52316870203728, 'alfoneh': 3181.619383604608, 'assist': 1.011047715998615, 'compil': 7.127855534229111, ')': -0.3286715940288662, 'polit': 0.0824645192497444, 'nuclear': 1.30599154846897, 'issu': 0.097384311279919, 'suprem': 3.8869371774122, 'leader': 1.1574956165768209, 'tell': 0.2834871802902106, 'islam': 0.5587235709818203, 'student': 1.2134056463483285, 'associ': 0.6324036402445377, 'foreign': 0.2519764891633147, 'univers': 0.7641392779544451, ':': -0.1438052674715424, '"': 0.0215670699231684, 'conspiraci': 1.0159181600401972, 'machin': 1.3841256783929268, 'enemi': 0.4865024092871723, ',': 0.032872484636885, 'includ': 0.1750563971418592, "'": 0.5049988539798476, 'scientif': 1.0963414293640406, 'apartheid': 14.902127642555088, 'tri': 0.8695546752616403, 'subject': 0.5845845700588128, 'nation': -0.133547125

In [115]:
# To use the model on our data we need to tokenize the validation text:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

pattern = r'<num>|<date>|<email>|<url>|\w+|[^\w\s]'
tokenizer = RegexpTokenizer(pattern)
stemmer = PorterStemmer()

def preprocess(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

# We preprocess the text:
bin_data['tokens'] = bin_data[2].apply(preprocess)

print(bin_data['tokens'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        [say, anni, list, polit, group, support, third...
1        [declin, coal, start, ?, start, natur, ga, too...
2        [hillari, clinton, agre, john, mccain, ", vote...
3        [health, care, reform, legisl, like, mandat, f...
4                [econom, turnaround, start, end, term, .]
                               ...                        
10235    [larger, number, shark, attack, florida, case,...
10236    [democrat, becom, parti, [, atlanta, ], metro,...
10237    [say, altern, social, secur, oper, galveston, ...
10238    [lift, u, ., ., cuban, embargo, allow, travel,...
10239    [depart, veteran, affair, manual, tell, vetera...
Name: tokens, Length: 10240, dtype: object


In [116]:
# Let's run the model on the data:
def classify(sequence, token_importance):
    tokens = sequence.split()
    score = 0
    for token in tokens:
        if token in token_importance:
            score += token_importance[token]
    return score

def bin_classify(sequence, token_importance):
    score = classify(sequence, token_importance)
    return score > 0

In [120]:
# We apply the model to the data:
simple_predictions = bin_data['tokens'].apply(lambda x: bin_classify(' '.join(x), token_importance))

print(simple_predictions)

# We calculate the accuracy:
print(f1_score(bin_data[1], simple_predictions, average='weighted'))

print(simple_predictions.sum())


0         True
1        False
2         True
3         True
4         True
         ...  
10235     True
10236    False
10237    False
10238    False
10239    False
Name: tokens, Length: 10240, dtype: bool
0.5161987440954309
5351


In [118]:
# Generate a classification report:
from sklearn.metrics import classification_report

print(classification_report(bin_data[1], simple_predictions))

              precision    recall  f1-score   support

       False       0.50      0.49      0.50      4948
        True       0.53      0.54      0.53      5292

    accuracy                           0.52     10240
   macro avg       0.52      0.52      0.52     10240
weighted avg       0.52      0.52      0.52     10240



In [119]:
# To take an example lets take the fisrt 1 text and compare the text, the score for each token, and the final prediction:
print(bin_data['tokens'][0])
#first_row_scores = bin_data['tokens'][0].apply(lambda x: token_importance.get(x, 0))
#AttributeError: 'list' object has no attribute 'apply' ^
first_row_scores = [token_importance.get(x, 0) for x in bin_data['tokens'][0]]
print(first_row_scores)
print(sum(first_row_scores))
print(bin_classify(' '.join(bin_data['tokens'][0]), token_importance))

['say', 'anni', 'list', 'polit', 'group', 'support', 'third', '-', 'trimest', 'abort', 'demand', '.']
[0.0020905095428442, 1.2245500363984028, -0.5118568149420442, 0.0824645192497444, -0.1009224704612552, 0.0968153839601062, 0.2168412131628696, 0.2527392178903505, 0, -0.0896367231882666, -0.3574140196044912, 0.1327369776388498]
0.9484078296471103
True


# Fake news validation

In [69]:
# Let's import our data:
import pandas as pd
complete_data = pd.read_parquet('pre_processed_news.parquet')

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(complete_data['cleaned_content'], complete_data['type'], test_size=0.2, random_state=42)
y_train = y_train.apply(lambda x: True if x in ["reliable", "political"] else False)
y_test = y_test.apply(lambda x: True if x in ["reliable", "political"] else False)

# Let's split test into test and validation:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)
print('X_val: ', X_val.shape)
print('y_val: ', y_val.shape)

X_train:  (588414,)
X_test:  (73552,)
y_train:  (588414,)
y_test:  (73552,)
X_val:  (73552,)
y_val:  (73552,)


In [71]:
# We're gonna use X_val and y_val to validate our models.
# We start with the simple:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

pattern = r'<num>|<date>|<email>|<url>|\w+|[^\w\s]'
tokenizer = RegexpTokenizer(pattern)
stemmer = PorterStemmer()

def preprocess(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


X_val = X_val.apply(preprocess)

print(X_val)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Main\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


__null_dask_index__
15066    [happen, <num>, bound, happen, ., <num>, north...
24666    [mitt, romney, stop, iowa, last, night, huge, ...
8852     [sterl, k, ., brown, chang, view, ., j, ., sim...
3911     [hour, roughli, <num>, pack, street, new, york...
30222    [liber, wonder, ', femal, mass, shooter, ., co...
                               ...                        
14018    [rage, alcohol, tom, logan, celebr, oktoberfes...
13693    [one, world, religion, watch, :, enc, ", ecume...
20804    [account, heineken, usa, ,, white, plain, ,, c...
5710     [truth, idea, tabl, would, littl, chang, situa...
12135    [patriot, post, ®, ️, america, ,, except, nati...
Name: cleaned_content, Length: 73552, dtype: object


In [79]:
simple_validation_pred = X_val.apply(lambda x: bin_classify(' '.join(x), token_importance))
print(simple_validation_pred[:10])

# measure accuracy in F1 score:
print("F1: ")
print(f1_score(y_val, simple_validation_pred, average='weighted'))

from sklearn.metrics import accuracy_score
print("Accuracy: ")
print(accuracy_score(y_val, simple_validation_pred))


__null_dask_index__
15066    False
24666    False
8852      True
3911     False
30222    False
9129      True
21451     True
16883    False
561      False
241      False
Name: cleaned_content, dtype: bool
F1: 
0.7401711719538172
Accuracy: 
0.7396535784207091


In [110]:
# create a classification report on simple_validation_pred:
from sklearn.metrics import classification_report
print(classification_report(y_val, simple_validation_pred))

# also confusion matrix:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_val, simple_validation_pred))

              precision    recall  f1-score   support

       False       0.78      0.72      0.75     40071
        True       0.70      0.76      0.73     33481

    accuracy                           0.74     73552
   macro avg       0.74      0.74      0.74     73552
weighted avg       0.74      0.74      0.74     73552

[[29028 11043]
 [ 8106 25375]]


In [89]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(complete_data['cleaned_content'], complete_data['type'], test_size=0.2, random_state=42)
y_train = y_train.apply(lambda x: True if x in ["reliable", "political"] else False)
y_test = y_test.apply(lambda x: True if x in ["reliable", "political"] else False)

# Let's split test into test and validation:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)
print('X_val: ', X_val.shape)
print('y_val: ', y_val.shape)

X_train:  (588414,)
X_test:  (73552,)
y_train:  (588414,)
y_test:  (73552,)
X_val:  (73552,)
y_val:  (73552,)


In [91]:
# Advanced model on the validation set as well:
# Tokenize the sequences:
sequences = tokenizer.texts_to_sequences(X_val)
padded = pad_sequences(sequences, maxlen=800, padding='post')

print(padded[0])

[   12   787     6     7     4     9  3770     2   866   179     6     7
     1  1563 10564  2422   122     1    82   206   958   154     1   994
  2193     1  8109  9623  7769   799 20182     2   419   655     2     1
  5239  1408     1  8490  7860  6402   744 12202   586    88     3     1
   404  4498    14   552    14    97     8    82     6   347   221 12202
     4  3344   586    84  4498    39     6   563  5896 14975     4 10011
 22085    44     6   620    14   727   344  2382   651 11252  2814     4
  1889  1738    48  2769     6  2664     6   600     4   829     7    63
    15     5    82     3  2540   582  1953  1485    14  5219    16     7
  1695 22290  5197    67    33    23   473    61    18     2   932     6
     7    38     1  1993   200    59   157    17 22896    10     5  3989
  1288    16   728  1041   959    33    17   896  5212   162  8877  1953
  2139    84    13  3538   869 22290    33    55   133     8     1  1553
     9    81  3468  2830  1553     4  2580     1  1

In [92]:
# Make predictions:
predictions = model.predict(padded)

# Binarize the predictions:
threshold = 0.5
bin_predictions = np.where(predictions > threshold, 1, 0)
bin_predictions = np.where(bin_predictions == 1, True, False)

print("F1: ")
print(f1_score(y_val, bin_predictions, average='weighted'))

print("Accuracy: ")
print(accuracy_score(y_val, bin_predictions))

[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m875s[0m 380ms/step
F1: 
0.8935466729284702
Accuracy: 
0.8934903197737655


In [107]:
# Confusion matrix:
from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_val, bin_predictions))
# add textlabels for the confusion matrix. we have "real" and "fake" for both the true and the predicted values.
print(confusion_matrix(y_val, bin_predictions))
labels = ['real', 'fake']
print(metrics.classification_report(y_val, bin_predictions, target_names=labels))

[[35922  4149]
 [ 3685 29796]]
              precision    recall  f1-score   support

        real       0.91      0.90      0.90     40071
        fake       0.88      0.89      0.88     33481

    accuracy                           0.89     73552
   macro avg       0.89      0.89      0.89     73552
weighted avg       0.89      0.89      0.89     73552

