### Importing libraries

In [1]:
import pandas as pd
import numpy as np

# Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# NLP pre-processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Algorithms
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
df = pd.read_csv('news.csv', usecols = [1,2,3])

### Exploring data

In [3]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.shape

(6335, 3)

In [5]:
df.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [6]:
df.label = np.where(df.label == 'FAKE',1,0)

In [7]:
df.label.value_counts()

0    3171
1    3164
Name: label, dtype: int64

In [8]:
df.isna().sum()

title    0
text     0
label    0
dtype: int64

### Split in train and test

In [9]:
X = df[['title','text']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .25)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4751, 2)
(4751,)
(1584, 2)
(1584,)


### Pre-processing

Using both title and text columns 

In [10]:
tfidf = TfidfVectorizer(stop_words = 'english', lowercase = False)

tfidf.fit(X_train.title)

title_train = pd.DataFrame(tfidf.transform(X_train.title).todense())
title_test = pd.DataFrame(tfidf.transform(X_test.title).todense())

In [11]:
tfidf = TfidfVectorizer(stop_words = 'english', lowercase = False)

tfidf.fit(X_train.text)

text_train = pd.DataFrame(tfidf.transform(X_train.title).todense())
text_test = pd.DataFrame(tfidf.transform(X_test.title).todense())

In [12]:
df_train = pd.concat([title_train, text_train], axis = 1)
df_test = pd.concat([title_test, text_test], axis = 1)

In [13]:
print(df_train.shape)
print(df_test.shape)

(4751, 84564)
(1584, 84564)


### Try some initial classifiers

In [14]:
%%time

algos = [('sgd',SGDClassifier()),
        ('tree', DecisionTreeClassifier()),
        ('lr', LogisticRegression()),
        ('svc', SVC())]

results = []

for algo in algos:
    clf = algo[1].fit(df_train,y_train)
    preds = clf.predict(df_test)
    
    f1 = f1_score(y_test, preds)
    acc = accuracy_score(y_test, preds)
    
    results.append([algo[0],acc,f1])

Wall time: 26min 43s


In [16]:
pd.DataFrame(results, columns = ['clf','acc','f1']).sort_values('f1', ascending = False)

Unnamed: 0,clf,acc,f1
3,svc,0.863636,0.864322
2,lr,0.852273,0.851334
0,sgd,0.828914,0.822528
1,tree,0.784722,0.782387


### Fine-tuning best classifiers

In [None]:
%%time

svc_params = [{'C': [0.5,1,1.5]}]
lr_params = [{'C': [0.5,1,1.5]},
            {'penalty': ['l2','l1', 'elasticnet']}]
models = []

clf_and_params = [(SVC(),svc_params),(LogisticRegression(),lr_params)]

for algo in clf_and_params:
    clf = GridSearchCV(algo[0],algo[1])
    clf.fit(df_train,y_train)
    preds = clf.predict(df_test)

    print('Best params: ',clf.best_params_)
    print('Best training score: ',clf.best_score_)
    print('####')*4

    print('Test accuracy: ',accuracy_score(y_test, preds))
    print('Test f1 score: ',f1_score(y_test, preds))
    print(classification_report(y_test,preds))
    
    models.append(clf)