In [2]:
# Import the necessary libraries and packages
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Read the data into a DataFrame
df = pd.read_csv("news.csv")

# Shape of the df DataFrame. Printed explicitly: a notebook cell only
# echoes its last expression, so a bare `df.shape` here shows nothing.
print(df.shape)

# First 5 rows of data
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Pull out the target column (the class assigned to each article)
# and preview its first few values
labels = df['label']
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [5]:
# Hold out 20% of the articles as a test set; the fixed seed keeps the
# split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Create a TfidfVectorizer that removes English stop words and ignores
# terms whose document frequency exceeds 0.7
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit the vectorizer on the training text only, then apply the same
# fitted transformation to the test text
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [9]:
# Candidate values for the maximum number of passes over the training data
max_iter = [50, 100, 150, 200, 250]

# Parameter grid explored by the search
param_grid = {'max_iter': max_iter}

# 5-fold cross-validated grid search over max_iter.
# The classifier shuffles the training data, so it is seeded here; without
# a fixed random_state the fold scores (and therefore the selected
# max_iter) can differ from one run to the next.
# NOTE: tfidf_train was produced by a vectorizer fitted on all of X_train,
# so each validation fold has already influenced the TF-IDF vocabulary and
# weights; the CV scores are slightly optimistic as a result.
grid_search = GridSearchCV(
    estimator=PassiveAggressiveClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)

# Fit the search on the vectorized training data
grid_search.fit(tfidf_train, y_train)

# Print the optimal parameters
print(f'Best paramters:{grid_search.best_params_}')

Best paramters:{'max_iter': 50}


In [10]:
# Create the final PassiveAggressiveClassifier from the grid-search result
# rather than a hand-copied value, so this cell stays in sync if the search
# selects a different max_iter. The fixed random_state makes the reported
# accuracy and confusion matrix reproducible.
pac = PassiveAggressiveClassifier(**grid_search.best_params_, random_state=42)

# Fit the model on the vectorized training data
pac.fit(tfidf_train, y_train)

# Predict labels for the held-out test set
y_pred = pac.predict(tfidf_test)

# Calculate and print the accuracy score as a percentage
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.37%


In [11]:
# Confusion matrix for the test-set predictions, with classes ordered
# FAKE then REAL (rows: true class, columns: predicted class)
confusion_matrix(
    y_test,
    y_pred,
    labels=['FAKE', 'REAL'],
)

array([[586,  42],
       [ 42, 597]])