In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier



In [None]:
# Load the labelled news articles and index them by the 'Unnamed: 0' column.
df = pd.read_csv('./fake_or_real_news.csv').set_index('Unnamed: 0')

# Separate the target column from the remaining columns.
y = df['label']
df = df.drop(columns='label')

# Hold out 33% of the article texts for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.33,
    random_state=53,
)




In [None]:
# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training texts only and then reused for the test texts.
count_vectorizer = CountVectorizer(stop_words='english')
count_train, count_test = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)


In [None]:
# TF-IDF features; terms appearing in more than 70% of the training documents
# are dropped (max_df=0.7) in addition to the English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
# Dense views of both training matrices, one column per vocabulary term.
# `.toarray()` / `get_feature_names_out()` replace the removed `.A` /
# `get_feature_names()` spellings.
# NOTE: densifying a document-term matrix can use a lot of memory.
count_df = pd.DataFrame(count_train.toarray(), columns=count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Terms kept by the count vectorizer but not by the TF-IDF vectorizer.
difference = set(count_df.columns) - set(tfidf_df.columns)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and plot the confusion matrix on the current figure.

    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its sum before plotting, so both
        the colours and the cell text show per-true-class fractions. Rows that
        sum to zero are shown as all zeros.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat-map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour bar and the cell text
    # all describe the same numbers.
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are rounded to two decimals; raw counts are shown as-is.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels.
    plt.tight_layout()


In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features.
clf = MultinomialNB().fit(tfidf_train, y_train)
pred = clf.predict(tfidf_test)

# Support-weighted precision / recall / F1 and plain accuracy on the test split.
scores = precision_recall_fscore_support(y_test, pred, average='weighted')
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
print("precision: %0.3f" % scores[0])
print("recall: %0.3f" % scores[1])
print("f-score: %0.3f" % scores[2])

# Rows are true labels and columns are predictions, both ordered FAKE, REAL.
confusion_matrix = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['FAKE', 'REAL'],
)
plot_confusion_matrix(confusion_matrix, classes=['FAKE', 'REAL'])

In [None]:
# Multinomial Naive Bayes trained on the raw term counts.
clf = MultinomialNB().fit(count_train, y_train)
pred = clf.predict(count_test)

# Support-weighted precision / recall / F1 and plain accuracy on the test split.
scores = precision_recall_fscore_support(y_test, pred, average='weighted')
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])

# Rows are true labels and columns are predictions, both ordered FAKE, REAL.
confusion_matrix = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['FAKE', 'REAL'],
)
plot_confusion_matrix(confusion_matrix, classes=['FAKE', 'REAL'])

In [None]:
# Passive-Aggressive linear classifier on the TF-IDF features.
# `n_iter` was removed from scikit-learn; `max_iter=50` with `tol=None`
# (no early stopping) keeps the original "exactly 50 epochs" behaviour.
lin_clf = PassiveAggressiveClassifier(max_iter=50, tol=None)

lin_clf.fit(tfidf_train, y_train)
pred = lin_clf.predict(tfidf_test)

# Support-weighted precision / recall / F1 and plain accuracy on the test split.
score = metrics.accuracy_score(y_test, pred)
scores = precision_recall_fscore_support(y_test, pred, average='weighted')
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])


# Rows are true labels and columns are predictions, both ordered FAKE, REAL.
confusion_matrix = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(confusion_matrix, classes=['FAKE', 'REAL'])


In [None]:
# LIAR training split: header-less TSV, column 1 is used as the label and
# column 2 as the statement text.
df = pd.read_csv('liar_dataset/train.tsv', sep="\t", header=None)
y_train, X_train = df[1], df[2]

# LIAR test split, same layout. `df` is left pointing at the test frame.
df = pd.read_csv('liar_dataset/test.tsv', sep="\t", header=None)
y_test, X_test = df[1], df[2]

# Re-fit the TF-IDF vocabulary on the LIAR training statements only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
# Changing from multiple classification to binary
def FromMulToBi(train_set):
    """Collapse multi-class truthfulness labels into a binary 'true'/'false' Series.

    Every label that is not exactly 'true' (including missing values) becomes
    'false'.

    Parameters
    ----------
    train_set : pandas.Series
        Label column to convert. Despite the parameter name it may be any
        split (train or test); it is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index as `train_set`, containing only
        'true' and 'false'.
    """
    # Decide from the argument itself (not from the global training labels),
    # and do it index-agnostically so any index works.
    return train_set.where(train_set == 'true', 'false')

# Binary ('true' / 'false') versions of the LIAR labels. The training result is
# stored as `y_train_sc_1`, the name the model cells below actually read
# (it was previously stored as `y_train_sc1`, which they never use).
y_train_sc_1 = FromMulToBi(y_train)
y_test_sc_1 = FromMulToBi(y_test)

In [None]:
# Passive-Aggressive classifier on the binarised LIAR labels.
# `n_iter` was removed from scikit-learn; `max_iter=50` with `tol=None`
# (no early stopping) keeps the original "exactly 50 epochs" behaviour.
lin_clf = PassiveAggressiveClassifier(max_iter=50, tol=None)

lin_clf.fit(tfidf_train, y_train_sc_1)
pred = lin_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test_sc_1, pred)
scores = precision_recall_fscore_support(y_test_sc_1, pred, average='weighted')
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])


# Compare against the BINARY test labels: the model only predicts
# 'true'/'false', so the un-binarised `y_test` would not line up with `pred`.
confusion_matrix = metrics.confusion_matrix(y_test_sc_1, pred, labels=['true', 'false'])
plot_confusion_matrix(confusion_matrix, classes=['true', 'false'])


In [None]:
# 3-nearest-neighbours classifier on the binarised LIAR labels.
# NOTE: the variable keeps the name `lin_clf` even though KNN is not a linear model.
lin_clf = KNeighborsClassifier(n_neighbors=3)

lin_clf.fit(tfidf_train, y_train_sc_1)
pred = lin_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test_sc_1, pred)
scores = precision_recall_fscore_support(y_test_sc_1, pred, average='weighted')
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])


# Compare against the BINARY test labels: the model only predicts
# 'true'/'false', so the un-binarised `y_test` would not line up with `pred`.
confusion_matrix = metrics.confusion_matrix(y_test_sc_1, pred, labels=['true', 'false'])
plot_confusion_matrix(confusion_matrix, classes=['true', 'false'])

In [None]:
# Logistic regression with class weights inversely proportional to class
# frequency (class_weight='balanced'), trained on the binarised LIAR labels.
lin_clf = LogisticRegression(class_weight='balanced')

lin_clf.fit(tfidf_train, y_train_sc_1)
pred = lin_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test_sc_1, pred)
scores = precision_recall_fscore_support(y_test_sc_1, pred, average='weighted')
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])


# Compare against the BINARY test labels: the model only predicts
# 'true'/'false', so the un-binarised `y_test` would not line up with `pred`.
confusion_matrix = metrics.confusion_matrix(y_test_sc_1, pred, labels=['true', 'false'])
plot_confusion_matrix(confusion_matrix, classes=['true', 'false'])

In [None]:
import random
import sys
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Input files for the combined dataset. The commented-out lines show how to
# take the two paths from the command line instead when run as a script.
#ds1 = sys.argv[1]
#ds2 = sys.argv[2]
ds1 = './fake_or_real_news.csv'
ds2 = 'liar_dataset/train.tsv'

def get_dataset3_split(dataset1_in, dataset2_in):
    """Merge the two news datasets into one binary-labelled corpus and split it.

    Dataset 1 is a comma-separated file with `title`, `text` and `label`
    columns; title and text are joined with '. ' into a single `claim`, and
    the label 'FAKE' becomes 'false' while every other label becomes 'true'.
    Dataset 2 is a header-less tab-separated file whose column 1 is the label
    and column 2 the claim; only rows labelled exactly 'true' or 'false' are
    kept. Progress and class counts are printed along the way.

    Parameters
    ----------
    dataset1_in : str or path-like
        Path to the comma-separated dataset.
    dataset2_in : str or path-like
        Path to the tab-separated dataset.

    Returns
    -------
    list
        `[claim_train, claim_test, y_train, y_test]` as produced by
        `train_test_split` with a 30% test share and a fixed seed.

    Raises
    ------
    Exception
        Any error raised while reading or processing the files propagates to
        the caller. It is no longer printed and swallowed, which used to make
        the function return None and fail later at the caller's tuple-unpack.
    """
    print('processing datasets')
    print('ds1=', dataset1_in)
    print('ds2=', dataset2_in)

    print('-- fake news')
    df1 = pd.read_csv(dataset1_in, sep=',', usecols=['title','text','label'])
    # Missing titles/bodies are treated as empty strings so the join never
    # trips over a float NaN.
    df1['claim'] = df1['title'].fillna('').str.cat(df1['text'].fillna(''), sep='. ')
    df1 = df1.drop(columns=['title', 'text']).rename(columns={'label': 'y'})
    print(df1.keys())
    print(len(df1[df1['y']=='REAL']))
    print(len(df1[df1['y']=='FAKE']))
    # NOTE(review): anything that is not 'FAKE' (not only 'REAL') maps to 'true'.
    df1['y'] = np.where(df1['y'] == 'FAKE', 'false', 'true')
    print(len(df1))

    print('-- liar liar')
    df2 = pd.read_csv(dataset2_in, sep='\t', header=None, usecols=[1,2], names=['y', 'claim'])
    print(df2.keys())
    print(set(df2.y), len(df2))
    print(len(df2[df2['y'] == 'true']))
    print(len(df2[df2['y'] == 'false']))
    # Drop the intermediate truthfulness grades; keep only the two extremes.
    df2 = df2[df2['y'].isin(['true', 'false'])]
    print(set(df2.y), len(df2))

    df3 = pd.concat([df1, df2], ignore_index=True)

    print(df3['y'].value_counts())
    print('done')
    return train_test_split(df3['claim'], df3['y'], test_size=0.30, random_state=35)

# Build the combined corpus and sanity-check the split sizes.
ds3_train, ds3_test, ds3_y_train, ds3_y_test = get_dataset3_split(ds1,ds2)
print(len(ds3_y_train))
print(len(ds3_train))
print(len(ds3_test))
# Peek at the second training example (label, claim). `Series.get_values()`
# was removed from pandas, so positional `.iloc` is used instead.
print(ds3_y_train.iloc[1], ds3_train.iloc[1])

In [None]:
# TF-IDF features for the combined corpus; the vocabulary is learned from the
# training claims only and reused for the test claims.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train, tfidf_test = (
    tfidf_vectorizer.fit_transform(ds3_train),
    tfidf_vectorizer.transform(ds3_test),
)



In [None]:
# Passive-Aggressive classifier on the combined corpus.
# `n_iter` was removed from scikit-learn; `max_iter=50` with `tol=None`
# (no early stopping) keeps the original "exactly 50 epochs" behaviour.
lin_clf = PassiveAggressiveClassifier(max_iter=50, tol=None)

# Size of the training set used in THIS cell (previously this printed the
# length of the LIAR-only labels, which belong to a different experiment).
print(len(ds3_y_train))

lin_clf.fit(tfidf_train, ds3_y_train)
pred = lin_clf.predict(tfidf_test)
score = metrics.accuracy_score(ds3_y_test, pred)
scores = precision_recall_fscore_support(ds3_y_test, pred, average='weighted')
print("accuracy:   %0.3f" % score)
print("precision:   %0.3f" % scores[0])
print("recall:   %0.3f" % scores[1])
print("f-score:   %0.3f" % scores[2])


# Compare predictions with the true LABELS (`ds3_y_test`), not with the claim
# texts (`ds3_test`).
confusion_matrix = metrics.confusion_matrix(ds3_y_test, pred, labels=['true', 'false'])
plot_confusion_matrix(confusion_matrix, classes=['true', 'false'])
