In [21]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
# Load the news dataset from a CSV file located in the working directory
NEWS_CSV_PATH = 'news.csv'
df = pd.read_csv(NEWS_CSV_PATH)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [10]:
# Inspect the frame's dimensions as a (rows, columns) tuple
df.shape


(6335, 4)

In [11]:
# Preview the first rows of the frame (head() with no argument shows five rows)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [13]:
# Count missing values per column; isna() is the canonical name for isnull()
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

There are no null values in any column, so we can proceed to split the dataset into training and test sets.

In [26]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split reproducible
features = df['text']
labels = df['label']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=7,
)

In [30]:
# Build a TF-IDF vectorizer for the article text: English stop words are
# dropped, as are terms that appear in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)
# Learn the vocabulary and IDF weights from the training text only, then
# reuse them to encode the test text
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [29]:
# Baseline passive-aggressive classifier trained on the TF-IDF features
pac = PassiveAggressiveClassifier(
    max_iter=50,
    early_stopping=True,
    random_state=0,
    tol=1e-3,
)
pac.fit(tfidf_train, y_train)

# Score the baseline on the held-out test set
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
accuracy_pct = round(score * 100, 2)
print(f'Accuracy: {accuracy_pct}%')

Accuracy: 92.58%


In [8]:
# Confusion matrix for the current predictions; rows are true labels and
# columns are predicted labels, both ordered FAKE then REAL
class_order = ['FAKE', 'REAL']
confusion_matrix(y_test, y_pred, labels=class_order)

array([[590,  48],
       [ 42, 587]], dtype=int64)

In [31]:
# Hyperparameter tuning: grid-search the classifier's stopping tolerance using
# repeated stratified 10-fold cross-validation (3 repeats), scored on accuracy
tol_values = [0.1, 0.01, 0.001, 0.0001, 0.00001]
grid = {'tol': tol_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=pac,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,  # a fit that raises is scored 0 instead of aborting the search
)

grid_result = grid_search.fit(tfidf_train, y_train)

# Report the winning configuration, then mean (std) accuracy for every candidate
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_summary = grid_result.cv_results_
for mean_acc, std_acc, candidate in zip(
    cv_summary['mean_test_score'],
    cv_summary['std_test_score'],
    cv_summary['params'],
):
    print(f"{mean_acc:f} ({std_acc:f}) with: {candidate!r}")

Best: 0.936527 using {'tol': 0.1}
0.936527 (0.011727) with: {'tol': 0.1}
0.936527 (0.011727) with: {'tol': 0.01}
0.936527 (0.011727) with: {'tol': 0.001}
0.936527 (0.011727) with: {'tol': 0.0001}
0.936527 (0.011727) with: {'tol': 1e-05}


In [32]:
# Final model: use the estimator selected by the grid search instead of
# re-typing hyperparameters by hand (the previous hard-coded tol=1e-3 simply
# rebuilt the baseline and ignored the tuning result).
# GridSearchCV was created without overriding `refit`, so best_estimator_ has
# already been refit on the full training set with best_params_.
pac_best = grid_result.best_estimator_

# Evaluate the tuned model on the held-out test set
y_pred = pac_best.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.58%


In [33]:
# Confusion matrix for the best model's test predictions; rows are true
# labels and columns are predicted labels, both ordered FAKE then REAL
label_order = ['FAKE', 'REAL']
confusion_matrix(y_test, y_pred, labels=label_order)

array([[593,  45],
       [ 49, 580]], dtype=int64)

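The objective of the problem is to identify fake news, using the `label` column as the target. After hyperparameter tuning, the number of false positives (fake news classified as real) has decreased from 48 to 45, and the number of true positives (fake news correctly classified as fake) has increased from 590 to 593. On that basis, this model is taken to be the best model. Note, however, that in the same confusion matrices the number of real articles misclassified as fake rose from 42 to 49 and the reported test accuracy is unchanged at 92.58%, so the gain is specific to catching fake news rather than an overall improvement.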