In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the news articles from the CSV that sits next to the notebook and
# preview the first rows.
news_df = pd.read_csv(filepath_or_buffer="news.csv")
news_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it a second time, when
# the column is already gone, no longer raises KeyError.
news_df = news_df.drop(columns="Unnamed: 0", errors="ignore")
news_df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Print the column dtypes and non-null counts to check for missing values
# before building features.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   6335 non-null   object
 1   text    6335 non-null   object
 2   label   6335 non-null   object
dtypes: object(3)
memory usage: 148.6+ KB


In [5]:
# Pull out the target column (the string labels, e.g. FAKE / REAL) and
# preview it.
labels = news_df["label"]
labels.head(n=5)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
# Learn the string -> integer mapping for the target classes, then apply it.
# `enc` is kept so the mapping can be persisted and inverted later.
enc = LabelEncoder().fit(labels)
labels_enc = enc.transform(labels)

In [7]:
# Show the integer-encoded labels produced by the encoder.
labels_enc

array([0, 0, 1, ..., 0, 1, 1])

In [8]:
# Persist the fitted label encoder to "encoder.pkl" so the same class mapping
# can be reloaded outside this notebook.
with open("encoder.pkl", mode="wb") as encoder_file:
  pickle.dump(enc, encoder_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Hold out 20% of the article texts (and their encoded labels) for evaluation.
# The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  news_df["text"],
  labels_enc,
  test_size=0.2,
  random_state=42,
)

In [10]:
# TF-IDF features over the article text: drop English stop words and ignore
# terms that appear in more than 70% of the documents.
tdif = TfidfVectorizer(
  max_df=0.7,
  stop_words="english",
)

In [11]:
# Fit the vocabulary / IDF weights on the training texts only, then reuse the
# fitted vectorizer for the test texts so no test information leaks into it.
X_train_tdif = tdif.fit_transform(raw_documents=X_train)
X_test_tdif = tdif.transform(raw_documents=X_test)

In [12]:
# Display the repr of the training feature matrix (shape and sparsity).
X_train_tdif

<5068x61359 sparse matrix of type '<class 'numpy.float64'>'
	with 1321067 stored elements in Compressed Sparse Row format>

In [13]:
# Persist the fitted TF-IDF vectorizer to "vectorizer.pkl" so new text can be
# transformed with the same vocabulary and weights.
with open("vectorizer.pkl", mode="wb") as vectorizer_file:
  pickle.dump(tdif, vectorizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Base estimator for the hyper-parameter search: a seeded Passive-Aggressive
# classifier that shuffles the training data and uses early stopping.
pac = PassiveAggressiveClassifier(
  early_stopping=True,
  shuffle=True,
  random_state=42,
)

In [15]:
# Candidate hyper-parameters for the Passive-Aggressive classifier:
# regularisation strength C and the cap on the number of training passes.
params_grid = {
  "C": [0.1, 0.2, 0.5, 0.7, 1],
  "max_iter": [1000, 3000, 5000],
}

# Successive-halving search scored on accuracy with 3-fold CV. Every round
# refits the surviving candidates on a larger subsample of the training rows.
# random_state pins that subsampling; without it the rows drawn in each round
# (and therefore possibly the selected candidate) change from run to run,
# unlike the train/test split and the classifier, which are both seeded with 42.
grid_pac = HalvingGridSearchCV(
  pac, params_grid, scoring='accuracy',
  cv=3, verbose=1, random_state=42)

In [16]:
# Run the halving search on the TF-IDF training features.
grid_pac.fit(X=X_train_tdif, y=y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 563
max_resources_: 5068
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 15
n_resources: 563
Fitting 3 folds for each of 15 candidates, totalling 45 fits
----------
iter: 1
n_candidates: 5
n_resources: 1689
Fitting 3 folds for each of 5 candidates, totalling 15 fits
----------
iter: 2
n_candidates: 2
n_resources: 5067
Fitting 3 folds for each of 2 candidates, totalling 6 fits


HalvingGridSearchCV(cv=3,
                    estimator=PassiveAggressiveClassifier(early_stopping=True,
                                                          random_state=42),
                    param_grid={'C': [0.1, 0.2, 0.5, 0.7, 1],
                                'max_iter': [1000, 3000, 5000]},
                    scoring='accuracy', verbose=1)

In [17]:
# Show the classifier configuration selected by the search.
grid_pac.best_estimator_

PassiveAggressiveClassifier(C=0.7, early_stopping=True, max_iter=3000,
                            random_state=42)

In [18]:
# Persist the fitted search object to "model.pkl". Note that this serializes
# the whole search wrapper (`grid_pac`), not only its best estimator.
with open("model.pkl", mode="wb") as model_file:
  pickle.dump(grid_pac, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Predict encoded labels for the held-out TF-IDF features.
pred = grid_pac.predict(X=X_test_tdif)

In [20]:
# Fraction of held-out articles whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.9344909234411997

In [21]:
# Confusion matrix on the held-out set: rows are true classes, columns are
# predicted classes, in encoded-label order.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[587,  41],
       [ 42, 597]])