In [79]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
#from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
#import spacy
#from collections import defaultdict
#import spacy.cli 
#from transformers import pipeline
#spacy.cli.download("en_core_web_sm")

In [38]:
train = pd.read_csv('/Users/Stefano_1/Downloads/fake-news/train.csv')
test = pd.read_csv('/Users/Stefano_1/Downloads/fake-news/test.csv')

In [32]:
test.head()

Unnamed: 0,id,title,author,text,all_text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...","Specter of Trump Loosens Tongues, if Not Purse..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,#NoDAPL: Native American Leaders Vow to Stay A...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...","Tim Tebow Will Attempt Another Comeback, This ..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Keiser Report: Meme Wars (E995) 42 mins ago 1 ...


In [6]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [39]:
train['all_text'] = train['title'] + ' ' + train['text']

In [40]:
train['all_text'] = train['all_text'].astype('string')

In [47]:
train.all_text[299]

'US to Hold Off on Cyberwar With Russia Until After Election - Jason Ditz US to Hold Off on Cyberwar With Russia Until After Election \nObama wants to do it together with Clinton   Antiwar.com \nWhile the Obama Administration has made much of its intention to start a full-scale cyberwar against Russia at a “time of their choosing,” the most recent reports suggest that the war is effectively on hold at least until the presidential election in two weeks. \nFrom President Obama’s standpoint, the hope is to work with Hillary Clinton, if she becomes president-elect, to launch a cyber war that they both can get behind. Indeed, both have appeared very hawkish against Russia, and Obama apparently doesn’t want to deny Clinton a chance to participate in the early days of a war she’d inherit. \nStarting a cyberwar ahead of a Trump win would be even less wise, as Trump has opposed the idea of picking fights with Russia, and expressed strong doubts about Democratic Party “certainty” that Russia is 

In [41]:
train = train.drop(['title', 'text'], axis = 1)
#train = train.dropna(subset='all_text')
train['all_text'] = train['all_text'].fillna('')

In [42]:
X_train, X_test, y_train, y_test = train_test_split(train['all_text'], train['label'], test_size=0.2, random_state=16)

In [43]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [44]:
mod = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
mod.fit(X_train_tfidf, y_train)

In [45]:
y_pred = mod.predict(X_test_tfidf)

In [46]:
accuracy_score(y_test, y_pred)

0.9723557692307693

In [47]:
confusion_matrix(y_test, y_pred)

array([[2014,   68],
       [  47, 2031]])

In [25]:
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.98      0.98      0.98      2085\n           1       0.97      0.98      0.98      1956\n\n    accuracy                           0.98      4041\n   macro avg       0.98      0.98      0.98      4041\nweighted avg       0.98      0.98      0.98      4041\n'

In [50]:
test.all_text.isnull().sum()

0

In [49]:
test['all_text'] = test['title'] + ' ' + test['text']
test['all_text'] = test['all_text'].astype('string')
test['all_text'] = test['all_text'].fillna('')
test_tfidf = vectorizer.transform(test['all_text'])

In [54]:
test.dtypes

id           int64
title       object
author      object
text        object
all_text    string
dtype: object

In [56]:
prediction = mod.predict(test_tfidf)
#prediction.to_csv('/Users/Stefano_1/Documents/Data/Kaggle Fake News/xgb_pred.csv', index_label= 'Id')

In [72]:
test['label'] = prediction

In [74]:
test[['id', 'label']].to_csv('/Users/Stefano_1/Documents/Data/Kaggle Fake News/xgb_pred.csv', index_label= 'id', index = False)

In [73]:
test[['id', 'label']]

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5195,25995,0
5196,25996,0
5197,25997,0
5198,25998,1


In [76]:
param_grid = {
    'n_estimators': [100, 200, 300],
    #'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    #'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}


In [81]:
random_search = RandomizedSearchCV(estimator=mod, param_distributions=param_grid, 
                                   n_iter=50, scoring=make_scorer(accuracy_score), cv=5, verbose=1, n_jobs=-1, random_state=16)
random_search.fit(X_train_tfidf, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits




In [82]:
random_search.best_score_

0.9782451923076924

In [83]:
prediction = random_search.predict(test_tfidf)

In [85]:
test2 = test
test2['label'] = prediction

In [86]:
test2[['id', 'label']].to_csv('/Users/Stefano_1/Documents/Data/Kaggle Fake News/xgbcv_pred.csv', index_label= 'id', index = False)