# Techniators 💻 Fake News Detection Model Training

In [96]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from scipy.stats import randint, loguniform
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import joblib


In [5]:
def load_data(path='../all_news.csv'):

    df = pd.read_csv(path)
    return df

In [6]:
df = load_data()
df.head()

Unnamed: 0,title_text,subject,date,label,special_char_count,uppercase_letter_count,sentiment_score
0,"As U.S. budget fight looms, Republicans flip t...",politicsNews,2017-12-31,real,152,148,0.9843
1,U.S. military to accept transgender recruits o...,politicsNews,2017-12-29,real,101,112,0.1501
2,Senior U.S. Republican senator: 'Let Mr. Muell...,politicsNews,2017-12-31,real,68,113,-0.6808
3,FBI Russia probe helped by Australian diplomat...,politicsNews,2017-12-30,real,62,107,-0.2201
4,Trump wants Postal Service to charge 'much mor...,politicsNews,2017-12-29,real,152,186,0.8055


## Data Splitting

In [8]:
df.columns

Index(['title_text', 'subject', 'date', 'label', 'special_char_count',
       'uppercase_letter_count', 'sentiment_score'],
      dtype='object')

In [52]:
feat = ['title_text', 'subject', 'date',  'special_char_count',
       'uppercase_letter_count', 'sentiment_score']

X_train, X_test, y_train, y_test = train_test_split(df[feat],
													df['label'],
													test_size=0.25)

## Build Preprocessor

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
  
vectorizer = CountVectorizer(max_features=10000, stop_words="english")

categorical_variables = ['subject']
numeric_variables = ['special_char_count', 'uppercase_letter_count', 'sentiment_score']
text_variables = 'title_text'
drop_variables = ['date', 'title_text']

preprocessor = make_column_transformer(
    (OneHotEncoder(), categorical_variables),
    (StandardScaler(), numeric_variables),
    (vectorizer, text_variables),
    ('drop', drop_variables),
    
)
preprocessor

## Baseline Model

In [88]:
scoring_metric='accuracy'
cross_val_results = {}
dc = DummyClassifier()
cross_val_results['dummy'] = pd.DataFrame(cross_validate(dc, X_train, y_train, 
                                                         return_train_score=True,
                                                         scoring=scoring_metric)).agg(['mean', 'std']).round(3).T

cross_val_results['dummy']

Unnamed: 0,mean,std
fit_time,0.009,0.001
score_time,0.007,0.001
test_score,0.513,0.0
train_score,0.513,0.0


## Logistic Regression

In [94]:
pipe_reg = make_pipeline(
        preprocessor, 
        LogisticRegression()
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
cross_val_results['logistic'] = pd.DataFrame(cross_validate(pipe_reg, X_train, y_train, 
                                                         return_train_score=True,
                                                         scoring=scoring_metric)).agg(['mean', 'std']).round(3).T

cross_val_results['logistic']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,mean,std
fit_time,5.783,0.148
score_time,1.213,0.029
test_score,0.992,0.001
train_score,1.0,0.0


## Decision Tree Classifier

In [97]:
pipe_dt = make_pipeline(
        preprocessor, 
        DecisionTreeClassifier()
)


In [98]:
cross_val_results['Decision_Tree'] = pd.DataFrame(cross_validate(pipe_dt, X_train, y_train, 
                                                         return_train_score=True,
                                                         scoring=scoring_metric)).agg(['mean', 'std']).round(3).T

cross_val_results['Decision_Tree']

Unnamed: 0,mean,std
fit_time,11.981,0.382
score_time,1.21,0.045
test_score,0.967,0.002
train_score,1.0,0.0


## Compare Pipeline 2 (Without Subject Column)

In [99]:
vectorizer = CountVectorizer(max_features=10000, stop_words="english")

# categorical_variables = ['subject']
numeric_variables = ['special_char_count', 'uppercase_letter_count', 'sentiment_score']
text_variables = 'title_text'
drop_variables = ['date', 'subject']

preprocessor = make_column_transformer(
    # (OneHotEncoder(), categorical_variables),
    (StandardScaler(), numeric_variables),
    (vectorizer, text_variables),
    ('drop', drop_variables),
    
)
preprocessor

## Logistic Regression Pipeline 2

In [100]:
pipe_reg_2 = make_pipeline(
        preprocessor, 
        LogisticRegression()
)

In [101]:
cross_val_results['logistic 2'] = pd.DataFrame(cross_validate(pipe_reg_2, X_train, y_train, 
                                                         return_train_score=True,
                                                         scoring=scoring_metric)).agg(['mean', 'std']).round(3).T

cross_val_results['logistic 2']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,mean,std
fit_time,6.068,1.391
score_time,1.148,0.087
test_score,0.991,0.001
train_score,1.0,0.0


## Decision Tree Pipeline 2

In [103]:
pipe_dt2 = make_pipeline(
        preprocessor, 
        DecisionTreeClassifier()
)


In [104]:
cross_val_results['Decision_Tree_2'] = pd.DataFrame(cross_validate(pipe_dt2, X_train, y_train, 
                                                         return_train_score=True,
                                                         scoring=scoring_metric)).agg(['mean', 'std']).round(3).T

cross_val_results['Decision_Tree_2']

Unnamed: 0,mean,std
fit_time,19.1,3.137
score_time,1.504,0.275
test_score,0.955,0.002
train_score,1.0,0.0


## Compare Model Performance

**The `subject` feature does not improve much on the testing score, therefore, we will drop this column along with `date`**

**Logistic Regression & Decision Tree perform similarly, both model achieve > 95% accuracy on validation data**

In [106]:
# Show the results of all models
pd.concat(cross_val_results, axis=1)

Unnamed: 0_level_0,dummy,dummy,logistic,logistic,Decision_Tree,Decision_Tree,logistic 2,logistic 2,Decision_Tree_2,Decision_Tree_2
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.009,0.001,5.783,0.148,11.981,0.382,6.068,1.391,19.1,3.137
score_time,0.007,0.001,1.213,0.029,1.21,0.045,1.148,0.087,1.504,0.275
test_score,0.513,0.0,0.992,0.001,0.967,0.002,0.991,0.001,0.955,0.002
train_score,0.513,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [107]:
pipe_reg_2

## Get Feature Coefficient on Logistic Regression

In [115]:
pipe_reg_2.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [119]:
# Get feature name
feature_name_countVect = (
 pipe_reg_2
 .named_steps["columntransformer"]
 .named_transformers_["countvectorizer"]
 .get_feature_names_out()
)

new_feature_name = numeric_variables + feature_name_countVect.tolist()

###  First 20 coefficients with largest magnitudes and corresponding features

In [122]:
coeff_reg = pd.DataFrame(pipe_reg_2.named_steps["logisticregression"].coef_.transpose(),
    index=pipe_reg_2.named_steps["columntransformer"].get_feature_names_out(),
    columns=["feature_coefficients"]).sort_values("feature_coefficients", ascending=False)
coeff_reg.head(20)

Unnamed: 0,feature_coefficients
standardscaler__special_char_count,2.665581
countvectorizer__thursday,1.286885
countvectorizer__wednesday,1.244767
countvectorizer__friday,1.199551
countvectorizer__tuesday,1.154119
countvectorizer__monday,1.012699
countvectorizer__factbox,0.982184
countvectorizer__est,0.97717
countvectorizer__market,0.965548
countvectorizer__bit,0.94272


## Milestone 1 Pipeline with Default Model

In [125]:
from sklearn.metrics import accuracy_score

print("LogisticRegression Model Performance on testing data :")
print(accuracy_score(y_test, pipe_reg_2.predict(X_test)))

LogisticRegression Model Performance on testing data :
0.9899701816210356


In [126]:
pipe_dt2.fit(X_train, y_train)

print("Decision Tree Model Performance on testing data :")
print(accuracy_score(y_test, pipe_dt2.predict(X_test)))

Decision Tree Model Performance on testing data :
0.9566278124152887


In [129]:
pipe_dt2

In [None]:
from sklearn.tree import export_graphviz
import graphviz

dt = pipe_dt2.named_steps['decisiontreeclassifier']

feature_names = numeric_variables + preprocessor.named_transformers_['countvectorizer'].get_feature_names()


class_names = ['FAKE', 'REAL']

dot_data = export_graphviz(dt, out_file=None, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, special_characters=True, 
                           )
graph = graphviz.Source(dot_data)

graph.render('decision_tree', format='png', dpi=300)

In [136]:
from sklearn.metrics import classification_report

# assume that you have already trained and tested your decision tree model
y_pred = pipe_dt2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.95      0.96      0.96      5816
        real       0.96      0.95      0.95      5251

    accuracy                           0.96     11067
   macro avg       0.96      0.96      0.96     11067
weighted avg       0.96      0.96      0.96     11067



In [137]:
import joblib
joblib.dump(pipe_dt2, "dt_model.joblib")

['dt_model.joblib']