In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
import matplotlib.pyplot as plt

**Read Dataset 1**

In [2]:
df_1 = pd.read_csv('./fake_or_real_news.csv')
y_1 = df_1.label
print(y_1.shape)
df_1 = df_1.drop('label', axis=1)
print(df_1.shape)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(df_1['text'], y_1, test_size=0.33, random_state=53)

(6335,)
(6335, 3)


**Create Vectorizers, I will be using only TFIDFVectorizer.**

In [3]:
print("TFIDFVectorizer is being applied...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train_1)
print(X_train_1.shape)
print(tfidf_train.shape)
tfidf_test = tfidf_vectorizer.transform(X_test_1)

TFIDFVectorizer is being applied...
(4244,)
(4244, 56922)


**Create Models**

In [4]:
clf_SGD = SGDClassifier()
clf_SVC = LinearSVC() 
clf_BNB = BernoulliNB()

## Configuration 1

In [5]:
print("Stochastic Gradient Descent is training with TFIDFVectorizer...")
clf_SGD.fit(tfidf_train, y_train_1)
print("Stochastic Gradient Descent is labeling on Training & Test Data with TFIDFVectorizer...")
SGD_prediction_on_training = clf_SGD.predict(tfidf_train)
SGD_prediction_on_test = clf_SGD.predict(tfidf_test)

Stochastic Gradient Descent is training with TFIDFVectorizer...
Stochastic Gradient Descent is labeling on Training & Test Data with TFIDFVectorizer...




A Helper Method for Plotting Confusion Matrix

In [6]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

**Prediction of Labels(Fake/Real)**

In [7]:
SGD_score_on_training = metrics.accuracy_score(y_train_1, SGD_prediction_on_training)
SGD_precision_on_training = metrics.precision_score(y_train_1, SGD_prediction_on_training, pos_label="REAL")
SGD_recall_on_training = metrics.recall_score(y_train_1, SGD_prediction_on_training, pos_label="REAL")
SGD_f1_on_training = metrics.f1_score(y_train_1, SGD_prediction_on_training, pos_label="REAL")
print("accuracy for SGD on training dataset1:   %0.3f" % SGD_score_on_training)
print("precision for SGD on training dataset1:   %0.3f" % SGD_precision_on_training)
print("recall for SGD on training dataset1:   %0.3f" % SGD_recall_on_training)
print("f1 for SGD on training dataset1:   %0.3f" % SGD_f1_on_training)
#cm = metrics.confusion_matrix(y_train_1, SGD_prediction_on_training, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
#print("Confusion matrix for SGD on training data")

accuracy for SGD on training dataset1:   0.999
precision for SGD on training dataset1:   1.000
recall for SGD on training dataset1:   0.999
f1 for SGD on training dataset1:   0.999


In [8]:
SGD_score_on_test = metrics.accuracy_score(y_test_1, SGD_prediction_on_test)
SGD_precision_on_test = metrics.precision_score(y_test_1, SGD_prediction_on_test, pos_label="REAL")
SGD_recall_on_test = metrics.recall_score(y_test_1, SGD_prediction_on_test, pos_label="REAL")
SGD_f1_on_test = metrics.f1_score(y_test_1, SGD_prediction_on_test, pos_label="REAL")
print("accuracy for SGD on test dataset1:   %0.3f" % SGD_score_on_test)
print("precision for SGD on test dataset1:   %0.3f" % SGD_precision_on_test)
print("recall for SGD on test dataset1:   %0.3f" % SGD_recall_on_test)
print("f1 for SGD on test dataset1:   %0.3f" % SGD_f1_on_test)
#cm = metrics.confusion_matrix(y_test_1, SGD_prediction_on_test, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
#print("Confusion matrix for SGD on test data")

accuracy for SGD on test dataset1:   0.935
precision for SGD on test dataset1:   0.962
recall for SGD on test dataset1:   0.911
f1 for SGD on test dataset1:   0.936


**DO CROSS VALIDATION ON CONFIGURATION 1**

In [9]:
cv = ShuffleSplit(n_splits=5, train_size=0.75, random_state=4222)
#create scorers
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, pos_label='REAL')
recall_scorer = make_scorer(recall_score, pos_label='REAL')
f1_scorer = make_scorer(f1_score, pos_label=('REAL'))
all_data_countVec = tfidf_vectorizer.fit_transform(df_1['text'])
accuracy_K_fold = cross_val_score(clf_SGD, all_data_countVec, y_1, cv=cv, scoring=accuracy_scorer)
precision_K_fold = cross_val_score(clf_SGD, all_data_countVec, y_1, cv=cv, scoring=precision_scorer)
recall_K_fold = cross_val_score(clf_SGD, all_data_countVec, y_1, cv=cv, scoring=recall_scorer)
f1_K_fold = cross_val_score(clf_SGD, all_data_countVec, y_1, cv=cv, scoring=f1_scorer)
print("accuracies: ", accuracy_K_fold)
print("precisions: ", precision_K_fold)
print("recalls: ", recall_K_fold)
print("f1: ", f1_K_fold)



accuracies:  [0.93533123 0.94479495 0.91955836 0.93533123 0.94164038]
precisions:  [0.96308725 0.95221843 0.94620253 0.96245734 0.96402878]
recalls:  [0.89589905 0.92105263 0.91463415 0.90675241 0.90443686]
f1:  [0.93610224 0.93243243 0.92113565 0.93046358 0.93706294]


## Configuration 2


**Read Dataset 2 and manipulate its labels**

In [10]:
df_2_train = pd.read_csv('./liar_dataset/train.tsv', sep='\t', usecols=[1, 2], names=['label', 'text'])
df_2_validation = pd.read_csv('./liar_dataset/valid.tsv', sep='\t', usecols=[1, 2], names=['label', 'text'])
df_2_test = pd.read_csv('./liar_dataset/test.tsv', sep='\t', usecols=[1, 2], names=['label', 'text'])
#To calculate precision & recall, I make the following mapping on dataset2:
#false & pants on fire -> false
#barely true & half true & mostly true -> true
df_2_train.loc[df_2_train['label'] == 'true', 'label'] = 'REAL'
df_2_train.loc[df_2_train['label'] == 'mostly-true', 'label'] = 'REAL'
df_2_train.loc[df_2_train['label'] == 'barely-true', 'label'] = 'REAL'
df_2_train.loc[df_2_train['label'] == 'half-true', 'label'] = 'REAL'
df_2_train.loc[df_2_train['label'] == 'false', 'label'] = 'FAKE'
df_2_train.loc[df_2_train['label'] == 'pants-fire', 'label'] = 'FAKE'

df_2_validation.loc[df_2_validation['label'] == 'true', 'label'] = 'REAL'
df_2_validation.loc[df_2_validation['label'] == 'mostly-true', 'label'] = 'REAL'
df_2_validation.loc[df_2_validation['label'] == 'barely-true', 'label'] = 'REAL'
df_2_validation.loc[df_2_validation['label'] == 'half-true', 'label'] = 'REAL'
df_2_validation.loc[df_2_validation['label'] == 'false', 'label'] = 'FAKE'
df_2_validation.loc[df_2_validation['label'] == 'pants-fire', 'label'] = 'FAKE'

df_2_test.loc[df_2_test['label'] == 'true', 'label'] = 'REAL'
df_2_test.loc[df_2_test['label'] == 'mostly-true', 'label'] = 'REAL'
df_2_test.loc[df_2_test['label'] == 'barely-true', 'label'] = 'REAL'
df_2_test.loc[df_2_test['label'] == 'half-true', 'label'] = 'REAL'
df_2_test.loc[df_2_test['label'] == 'false', 'label'] = 'FAKE'
df_2_test.loc[df_2_test['label'] == 'pants-fire', 'label'] = 'FAKE'

y_2_train = df_2_train.label
print(y_2_train.shape)
y_2_validation = df_2_validation.label
print(y_2_validation.shape)
y_2_test = df_2_test.label
print(y_2_test.shape)

df_2_train = df_2_train.drop('label', axis=1)
print(df_2_train.shape)
df_2_validation = df_2_validation.drop('label', axis=1)
print(df_2_validation.shape)
df_2_test = df_2_test.drop('label', axis=1)
print(df_2_test.shape)

(10240,)
(1284,)
(1267,)
(10240, 1)
(1284, 1)
(1267, 1)


**Create TFIDFVector and fit the model with training dataset's TFIDFVector**

In [11]:
tfidf_train = tfidf_vectorizer.fit_transform(df_2_train.text)
tfidf_validation = tfidf_vectorizer.transform(df_2_validation.text)
tfidf_test = tfidf_vectorizer.transform(df_2_test.text)
print("SVC is training with TFIDFVectorizer...")
clf_SVC.fit(tfidf_train, y_2_train)
print("SVC is labeling on Training & Test Data with TFIDFVectorizer...")
SVC_prediction_on_training = clf_SVC.predict(tfidf_train)
SVC_prediction_on_validation = clf_SVC.predict(tfidf_validation)
SVC_prediction_on_test = clf_SVC.predict(tfidf_test)

SVC is training with TFIDFVectorizer...
SVC is labeling on Training & Test Data with TFIDFVectorizer...


**Prediction of Labels(Fake/Real) on TRAIN - VALIDATION - TEST data**

In [12]:
SVC_score_on_training = metrics.accuracy_score(y_2_train, SVC_prediction_on_training)
SVC_precision_on_training = metrics.precision_score(y_2_train, SVC_prediction_on_training, pos_label="REAL")
SVC_recall_on_training = metrics.recall_score(y_2_train, SVC_prediction_on_training, pos_label="REAL")
SVC_f1_on_training = metrics.f1_score(y_2_train, SVC_prediction_on_training, pos_label="REAL")
print("accuracy for SVC on training dataset2:   %0.3f" % SVC_score_on_training)
print("precision for SVC on training dataset2:   %0.3f" % SVC_precision_on_training)
print("recall for SVC on training dataset2:   %0.3f" % SVC_recall_on_training)
print("f1 for SVC on training dataset2:   %0.3f" % SVC_f1_on_training)

accuracy for SVC on training dataset2:   0.922
precision for SVC on training dataset2:   0.911
recall for SVC on training dataset2:   0.989
f1 for SVC on training dataset2:   0.948


In [13]:
SVC_score_on_validation = metrics.accuracy_score(y_2_validation, SVC_prediction_on_validation)
SVC_precision_on_validation = metrics.precision_score(y_2_validation, SVC_prediction_on_validation, pos_label="REAL")
SVC_recall_on_validation = metrics.recall_score(y_2_validation, SVC_prediction_on_validation, pos_label="REAL")
SVC_f1_on_validation = metrics.f1_score(y_2_validation, SVC_prediction_on_validation, pos_label="REAL")
print("accuracy for SVC on validation dataset2:   %0.3f" % SVC_score_on_validation)
print("precision for SVC on validation dataset2:   %0.3f" % SVC_precision_on_validation)
print("recall for SVC on validation dataset2:   %0.3f" % SVC_recall_on_validation)
print("f1 for SVC on validation dataset2:   %0.3f" % SVC_f1_on_validation)

accuracy for SVC on validation dataset2:   0.677
precision for SVC on validation dataset2:   0.735
recall for SVC on validation dataset2:   0.848
f1 for SVC on validation dataset2:   0.787


In [14]:
SVC_score_on_test = metrics.accuracy_score(y_2_test, SVC_prediction_on_test)
SVC_precision_on_test = metrics.precision_score(y_2_test, SVC_prediction_on_test, pos_label="REAL")
SVC_recall_on_test = metrics.recall_score(y_2_test, SVC_prediction_on_test, pos_label="REAL")
SVC_f1_on_test = metrics.f1_score(y_2_test, SVC_prediction_on_test, pos_label="REAL")
print("accuracy for SVC on test dataset2:   %0.3f" % SVC_score_on_test)
print("precision for SVC on test dataset2:   %0.3f" % SVC_precision_on_test)
print("recall for SVC on test dataset2:   %0.3f" % SVC_recall_on_test)
print("f1 for SVC on test dataset2:   %0.3f" % SVC_f1_on_test)

accuracy for SVC on test dataset2:   0.695
precision for SVC on test dataset2:   0.755
recall for SVC on test dataset2:   0.861
f1 for SVC on test dataset2:   0.805


## Configuration 3

In [15]:
clf_SVC.fit(tfidf_vectorizer.transform(X_train_1), y_train_1)
predict_SVC_on_ds1 = clf_SVC.predict(tfidf_vectorizer.transform(X_test_1))

SVC_score_on_test = metrics.accuracy_score(y_test_1, predict_SVC_on_ds1)
SVC_precision_on_test = metrics.precision_score(y_test_1, predict_SVC_on_ds1, pos_label="REAL")
SVC_recall_on_test = metrics.recall_score(y_test_1, predict_SVC_on_ds1, pos_label="REAL")
SVC_f1_on_test = metrics.f1_score(y_test_1, predict_SVC_on_ds1, pos_label="REAL")
print("accuracy for SVC on test dataset1:   %0.3f" % SVC_score_on_test)
print("precision for SVC on test dataset1:   %0.3f" % SVC_precision_on_test)
print("recall for SVC on test dataset1:   %0.3f" % SVC_recall_on_test)
print("f1 for SVC on test dataset1:   %0.3f" % SVC_f1_on_test)

print("----")
clf_SGD.fit(tfidf_vectorizer.transform(df_2_train.text), y_2_train)
predict_SGD_on_ds2 = clf_SGD.predict(tfidf_vectorizer.transform(df_2_test.text))

SGD_score_on_test = metrics.accuracy_score(y_2_test, predict_SGD_on_ds2)
SGD_precision_on_test = metrics.precision_score(y_2_test, predict_SGD_on_ds2, pos_label="REAL")
SGD_recall_on_test = metrics.recall_score(y_2_test, predict_SGD_on_ds2, pos_label="REAL")
SGD_f1_on_test = metrics.f1_score(y_2_test, predict_SGD_on_ds2, pos_label="REAL")
print("accuracy for SGD on test dataset2:   %0.3f" % SGD_score_on_test)
print("precision for SGD on test dataset2:   %0.3f" % SGD_precision_on_test)
print("recall for SGD on test dataset2:   %0.3f" % SGD_recall_on_test)
print("f1 for SGD on test dataset2:   %0.3f" % SGD_f1_on_test)



accuracy for SVC on test dataset1:   0.921
precision for SVC on test dataset1:   0.943
recall for SVC on test dataset1:   0.902
f1 for SVC on test dataset1:   0.922
----
accuracy for SGD on test dataset2:   0.726
precision for SGD on test dataset2:   0.739
recall for SGD on test dataset2:   0.965
f1 for SGD on test dataset2:   0.837




## Configuration 4

**Firstly, fetch and split dataset3**

In [16]:
def get_dataset3_split(dataset1_in, dataset2_in):
    try:
        print('processing datasets')
        print('ds1=', dataset1_in)
        print('ds2=', dataset2_in)

        print('-- fake news')
        df1 = pd.read_csv(dataset1_in, sep=',', usecols=['title','text','label'])
        df1['claim'] = df1[['title', 'text']].apply(lambda x: '. '.join(x), axis=1)
        del df1['title']
        del df1['text']
        df1.rename(index=str, columns={'label': 'y'}, inplace=True)
        print(df1.keys())
        print(len(df1[df1['y']=='REAL']))
        print(len(df1[df1['y']=='FAKE']))
        df1['y'] = np.where(df1['y'] == 'FAKE', 'false', 'true')
        print(len(df1))

        print('-- liar liar')
        df2 = pd.read_csv(dataset2_in, sep='\t', header=None, usecols=[1,2], names=['y', 'claim'])
        print(df2.keys())
        print(set(df2.y), len(df2))
        print(len(df2[df2['y'] == 'true']))
        print(len(df2[df2['y'] == 'false']))
        df2=df2[(df2['y'] == 'true') | (df2['y'] == 'false')]
        print(set(df2.y), len(df2))

        df3=pd.concat([df1, df2], ignore_index=True)

        print(df3['y'].value_counts())
        print('done')
        return train_test_split(df3['claim'], df3['y'], test_size=0.30, random_state=4222)
    except Exception as e:
        print(e)


ds1 = './fake_or_real_news.csv' 
ds2 = './liar_dataset/train.tsv'
ds3_train, ds3_test, ds3_y_train, ds3_y_test = get_dataset3_split(ds1,ds2)


processing datasets
ds1= ./fake_or_real_news.csv
ds2= ./liar_dataset/train.tsv
-- fake news
Index(['y', 'claim'], dtype='object')
3171
3164
6335
-- liar liar
Index(['y', 'claim'], dtype='object')
{'false', 'true', 'half-true', 'barely-true', 'pants-fire', 'mostly-true'} 10240
1676
1995
{'false', 'true'} 3671
false    5159
true     4847
Name: y, dtype: int64
done


In [17]:
tfidf_train_ds3 = tfidf_vectorizer.fit_transform(ds3_train)
tfidf_test_ds3 = tfidf_vectorizer.transform(ds3_test)
tfidf_y_train_ds3 = tfidf_vectorizer.transform(ds3_y_train)
tfidf_y_test_ds3 = tfidf_vectorizer.transform(ds3_y_test)
print("Bernoulli Naive Bayes is training with TFIDFVectorizer...")
clf_BNB.fit(tfidf_train_ds3.toarray(), ds3_y_train)
print("Stochastic Gradient Descent is labeling on Training & Test Data with TFIDFVectorizer...")
BNB_prediction_on_training = clf_BNB.predict(tfidf_train_ds3.toarray())
BNB_prediction_on_test = clf_BNB.predict(tfidf_test_ds3.toarray())

Bernoulli Naive Bayes is training with TFIDFVectorizer...
Stochastic Gradient Descent is labeling on Training & Test Data with TFIDFVectorizer...


In [18]:
BNB_score_on_training = metrics.accuracy_score(ds3_y_train, BNB_prediction_on_training)
BNB_precision_on_training = metrics.precision_score(ds3_y_train, BNB_prediction_on_training, pos_label="true")
BNB_recall_on_training = metrics.recall_score(ds3_y_train, BNB_prediction_on_training, pos_label="true")
BNB_f1_on_training = metrics.f1_score(ds3_y_train, BNB_prediction_on_training, pos_label="true")
print("accuracy for BNB on training dataset3:   %0.3f" % BNB_score_on_training)
print("precision for BNB on training dataset3:   %0.3f" % BNB_precision_on_training)
print("recall for BNB on training dataset3:   %0.3f" % BNB_recall_on_training)
print("f1 for BNB on training dataset3:   %0.3f" % BNB_f1_on_training)

accuracy for BNB on training dataset3:   0.749
precision for BNB on training dataset3:   0.867
recall for BNB on training dataset3:   0.563
f1 for BNB on training dataset3:   0.683


In [19]:
BNB_score_on_test = metrics.accuracy_score(ds3_y_test, BNB_prediction_on_test)
BNB_precision_on_test = metrics.precision_score(ds3_y_test, BNB_prediction_on_test, pos_label="true")
BNB_recall_on_test = metrics.recall_score(ds3_y_test, BNB_prediction_on_test, pos_label="true")
BNB_f1_on_test = metrics.f1_score(ds3_y_test, BNB_prediction_on_test, pos_label="true")
print("accuracy for BNB on test dataset3:   %0.3f" % BNB_score_on_test)
print("precision for BNB on test dataset3:   %0.3f" % BNB_precision_on_test)
print("recall for BNB on test dataset3:   %0.3f" % BNB_recall_on_test)
print("f1 for BNB on test dataset3:   %0.3f" % BNB_f1_on_test)

accuracy for BNB on test dataset3:   0.719
precision for BNB on test dataset3:   0.809
recall for BNB on test dataset3:   0.563
f1 for BNB on test dataset3:   0.664
