# Exercise 2

In [11]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler  
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
import warnings
%pylab inline
warnings.simplefilter('ignore')

def clf_and_pred(clf, X_train, y_train, X_test, y_test, pred_train=True, no_train=False):
   
    if not no_train:
        clf.fit(X_train, y_train)
    accuracy={}
    recall={}
    precision={}
    f_measure={}
    if pred_train:
        pred = clf.predict(X_train)
        accuracy['train'] = metrics.accuracy_score(y_train, pred)
        recall['train'] = metrics.recall_score(y_train, pred, average='weighted')
        precision['train'] = metrics.precision_score(y_train, pred, average='weighted')
        f_measure['train'] = metrics.f1_score(y_train, pred, average='weighted')
    pred = clf.predict(X_test)
    accuracy['test'] = metrics.accuracy_score(y_test, pred)
    recall['test'] = metrics.recall_score(y_test, pred, average='weighted')
    precision['test'] = metrics.precision_score(y_test, pred, average='weighted')
    f_measure['test'] = metrics.f1_score(y_test, pred, average='weighted')

    return accuracy, recall, precision, f_measure

Populating the interactive namespace from numpy and matplotlib


# Import datasets

In [5]:


dataset1=pd.read_csv("fake_or_real_news.csv", sep=',', usecols=['title','text','label'])
dataset2_train=pd.read_csv("train.tsv", sep='\t', header=None, usecols=[1,2,3,4,5,6,7,8,9,10,11,12,13], names=['label', 'statement', 'subject', 'speaker', 'jobOfSpeaker', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context'])
dataset2_valid=pd.read_csv("valid.tsv", sep='\t', header=None, usecols=[1,2,3,4,5,6,7,8,9,10,11,12,13], names=['label', 'statement', 'subject', 'speaker', 'jobOfSpeaker', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context'])
dataset2_test=pd.read_csv("test.tsv", sep='\t', header=None, usecols=[1,2,3,4,5,6,7,8,9,10,11,12,13], names=['label', 'statement', 'subject', 'speaker', 'jobOfSpeaker', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context'])

dataset1['statement'] = dataset1[['title', 'text']].apply(lambda x: '. '.join(x), axis=1)
del dataset1['title']
del dataset1['text']

for dataset in [dataset2_train, dataset2_valid, dataset2_test]:
    for i in range(0,len(dataset['label'])):
        if dataset['label'][i]=='true' or dataset['label'][i]=='mostly-true':
            dataset['label'][i]='REAL'
        else:
            dataset['label'][i]='FAKE'

# Task 1

In [15]:
y=dataset1.label
X=dataset1.drop('label', axis=1)

ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=4215)
#accuracy=recall=precision=f_measure=[]
for train_index, test_index in ss.split(X['statement'], y):
    X_train1 = X['statement'].iloc[train_index]
    y_train1 = y.iloc[train_index]
    X_test1 = X['statement'].iloc[test_index]
    y_test1 = y.iloc[test_index]
    count_vectorizer1 = CountVectorizer(stop_words='english', ngram_range=(2, 2))
    count_train1 = count_vectorizer1.fit_transform(X_train1)
    count_test1 = count_vectorizer1.transform(X_test1)
    
    scaler = StandardScaler(with_mean=False)
    scaler.fit(count_train1)

    NN_train = scaler.transform(count_train1)  
    NN_test = scaler.transform(count_test1)  

    clf1 = MLPClassifier(hidden_layer_sizes=(3), max_iter=10, random_state=4222)
    accuracy, recall, precision, f_measure = clf_and_pred(clf1, NN_train, y_train1, NN_test, y_test1)
    print("train results: ")
    print("accuracy:   %0.2f" % accuracy['train'])
    print("recall:   %0.2f" % recall['train'])
    print("precision:   %0.2f" % precision['train'])
    print("f-measure:   %0.2f" % f_measure['train'])
    print("test results: ")
    print("accuracy:   %0.2f" % accuracy['test'])
    print("recall:   %0.2f" % recall['test'])
    print("precision:   %0.2f" % precision['test'])
    print("f-measure:   %0.2f" % f_measure['test'])

train results: 
accuracy:   0.99
recall:   0.99
precision:   0.99
f-measure:   0.99
test results: 
accuracy:   0.87
recall:   0.87
precision:   0.88
f-measure:   0.87
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.93
recall:   0.93
precision:   0.93
f-measure:   0.93
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.93
recall:   0.93
precision:   0.93
f-measure:   0.93
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.92
recall:   0.92
precision:   0.92
f-measure:   0.92
train results: 
accuracy:   0.51
recall:   0.51
precision:   0.26
f-measure:   0.34
test results: 
accuracy:   0.48
recall:   0.48
precision:   0.23
f-measure:   0.31


# Task 2

In [12]:
y_train2=dataset2_train['label']
X_train2=dataset2_train['statement']
y_valid2=dataset2_valid['label']
X_valid2=dataset2_valid['statement']
y_test2=dataset2_test['label']
X_test2=dataset2_test['statement']


count_vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(2, 2))
count_train2 = count_vectorizer2.fit_transform(X_train2)
count_test2 = count_vectorizer2.transform(X_test2)
count_valid2 = count_vectorizer2.transform(X_valid2)

clf2 = MultinomialNB()

accuracy, recall, precision, f_measure = clf_and_pred(clf2, count_train2, y_train2, count_test2, y_test2)
print("train results: ")
print("accuracy:   %0.2f" % accuracy['train'])
print("recall:   %0.2f" % recall['train'])
print("precision:   %0.2f" % precision['train'])
print("f-measure:   %0.2f" % f_measure['train'])
print("test results: ")
print("accuracy:   %0.2f" % accuracy['test'])
print("recall:   %0.2f" % recall['test'])
print("precision:   %0.2f" % precision['test'])
print("f-measure:   %0.2f" % f_measure['test'])
accuracy, recall, precision, f_measure = clf_and_pred(clf2, count_train2, y_train2, count_valid2, y_valid2, pred_train=False, no_train=True)
print("valid results: ")
print("accuracy:   %0.2f" % accuracy['test'])
print("recall:   %0.2f" % recall['test'])
print("precision:   %0.2f" % precision['test'])
print("f-measure:   %0.2f" % f_measure['test'])





train results: 
accuracy:   0.99
recall:   0.99
precision:   0.99
f-measure:   0.99
test results: 
accuracy:   0.63
recall:   0.63
precision:   0.61
f-measure:   0.61
valid results: 
accuracy:   0.67
recall:   0.67
precision:   0.64
f-measure:   0.63


# Task 3

In [13]:
count_test3a=count_vectorizer2.transform(X_test1)
accuracy, recall, precision, f_measure = clf_and_pred(clf2, count_train2, y_train2, count_test3a, y_test1, pred_train=False, no_train=True)
print("test results: ")
print("accuracy:   %0.2f" % accuracy['test'])
print("recall:   %0.2f" % recall['test'])
print("precision:   %0.2f" % precision['test'])
print("f-measure:   %0.2f" % f_measure['test'])

count_test3b = count_vectorizer1.transform(X_test2)
NN_test3b = scaler.transform(count_test3b)  
accuracy, recall, precision, f_measure = clf_and_pred(clf1, NN_train, y_train1, NN_test3b, y_test2, pred_train=False, no_train=True)
print("test results: ")
print("accuracy:   %0.2f" % accuracy['test'])
print("recall:   %0.2f" % recall['test'])
print("precision:   %0.2f" % precision['test'])
print("f-measure:   %0.2f" % f_measure['test'])



valid results: 
accuracy:   0.50
recall:   0.50
precision:   0.48
f-measure:   0.44
valid results: 
accuracy:   0.35
recall:   0.35
precision:   0.13
f-measure:   0.19


# Task 4

In [14]:
dataset3=pd.concat([dataset1, dataset2_train, dataset2_valid, dataset2_test], ignore_index=True)
y=dataset3.label
X=dataset3.drop('label', axis=1)
for train_index, test_index in ss.split(X['statement'], y):
    X_train4=dataset3['statement'].iloc[train_index]
    y_train4=y.iloc[train_index]
    X_test4=dataset3['statement'].iloc[test_index]
    y_test4=y.iloc[test_index]
    count_vectorizer4 = CountVectorizer(stop_words='english', ngram_range=(2, 2))
    count_train4 = count_vectorizer4.fit_transform(X_train4)
    count_test4 = count_vectorizer4.transform(X_test4)
    clf4 = RandomForestClassifier(max_depth=1, random_state=4222)
    accuracy, recall, precision, f_measure = clf_and_pred(clf1, count_train4, y_train4, count_test4, y_test4)
    print("train results: ")
    print("accuracy:   %0.2f" % accuracy['train'])
    print("recall:   %0.2f" % recall['train'])
    print("precision:   %0.2f" % precision['train'])
    print("f-measure:   %0.2f" % f_measure['train'])
    print("test results: ")
    print("accuracy:   %0.2f" % accuracy['test'])
    print("recall:   %0.2f" % recall['test'])
    print("precision:   %0.2f" % precision['test'])
    print("f-measure:   %0.2f" % f_measure['test'])

train results: 
accuracy:   0.40
recall:   0.40
precision:   0.16
f-measure:   0.23
test results: 
accuracy:   0.40
recall:   0.40
precision:   0.16
f-measure:   0.23
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.70
recall:   0.70
precision:   0.72
f-measure:   0.67
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.72
recall:   0.72
precision:   0.72
f-measure:   0.71
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.65
recall:   0.65
precision:   0.70
f-measure:   0.65
train results: 
accuracy:   1.00
recall:   1.00
precision:   1.00
f-measure:   1.00
test results: 
accuracy:   0.73
recall:   0.73
precision:   0.73
f-measure:   0.71
