## Import main Modules and Data

In [34]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Load the news dataset from the working directory. Later cells read its
# 'title', 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer="news.csv")
df

## Convert Text to numbers

TFIDF=Term Frequency(TF) X Inverse Document Frequency(IDF)

Term Frequency(TF) = no. of occurrences of the word in the document / total number of words in the document

Inverse Document Frequency(IDF)=log(no. of documents / no. of documents containing word)

If the TFIDF score of a word in a document is high, the word occurs often in that document and rarely in the others.

In [6]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop-word list from NLTK. It is loaded once and reused below; the
# original cell assigned this variable but then read the corpus a second time
# inline, leaving `stop_words` unused.
stop_words = stopwords.words('english')

# Vectorizer shared by the title and text cells that follow:
#   max_features=1500 -> keep at most 1500 vocabulary terms
#   min_df=5          -> ignore terms found in fewer than 5 documents
#   max_df=0.7        -> ignore terms found in more than 70% of documents
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7,
                                 stop_words=stop_words)

max_features=1500 selects the top 1500 features (words) ranked by term frequency across all documents.
The result will have at most 1500 columns, one per selected word.

min_df=5  the minimum number of documents (rows) that must contain the feature (word) for it to be kept

max_df=.7  .7 means 70%: a feature may appear in at most 70% of the documents (rows);
if more than 70% of the documents contain the feature, it is excluded

stop_words  removes very common words like 'is', 'are', 'a', 'the', etc.

In [7]:
# Fit the vectorizer on the headlines and expand the sparse TF-IDF matrix into
# a dense DataFrame with one column per selected vocabulary term.
# NOTE: the next cell calls fit_transform on the same vectorizer again, so
# after that cell `tfidfconverter` no longer holds the title vocabulary.
features1 = tfidfconverter.fit_transform(df['title'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
_title_terms = (tfidfconverter.get_feature_names_out()
                if hasattr(tfidfconverter, 'get_feature_names_out')
                else tfidfconverter.get_feature_names())
features_df1 = pd.DataFrame(features1.toarray(), columns=_title_terms)

In [8]:
# Re-fit the same vectorizer on the article bodies (this replaces the
# vocabulary learned from the titles) and expand the sparse TF-IDF matrix into
# a dense DataFrame with one column per selected vocabulary term.
features2 = tfidfconverter.fit_transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
_text_terms = (tfidfconverter.get_feature_names_out()
               if hasattr(tfidfconverter, 'get_feature_names_out')
               else tfidfconverter.get_feature_names())
features_df2 = pd.DataFrame(features2.toarray(), columns=_text_terms)

In [51]:
# Place the title features and the text features side by side (words that
# occur in both vocabularies yield repeated column names), then append the
# target column from the raw frame.
data = (
    pd.concat([features_df1, features_df2], axis='columns')
    .assign(label=df['label'])
)

In [53]:
# Display the column labels of the combined frame (title terms, text terms,
# then 'label') to check its width.
data.columns

Index(['000', '10', '100', '11', '12', '13', '14', '15', '16', '17',
       ...
       'wrong', 'wrote', 'www', 'year', 'years', 'yes', 'yet', 'york', 'young',
       'label'],
      dtype='object', length=3001)

In [49]:
# The title and text vocabularies overlap, so `data` has repeated column names.
# Dropping them with `data.loc[:, ~data.columns.duplicated()]` would silently
# discard the text-side TF-IDF feature for every shared word and would turn the
# renaming cell below into a no-op. The recorded outputs (3001 columns, names
# such as 'year.1') show the duplicates were kept, so `data` is left untouched
# here and only the number of repeated names is reported.
int(data.columns.duplicated().sum())

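## Rename Duplicates

"title" and "text" share many of the same words, so the combined frame contains duplicated column names. The cell below renames the later occurrences (e.g. `year` becomes `year.1`) so that every column name is unique.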

In [17]:
# Title and text share vocabulary, so some column names occur more than once.
# Keep the first occurrence untouched and append ".1", ".2", ... to the later
# ones so that every column name is unique.
cols = pd.Series(data.columns)
repeated_names = cols[cols.duplicated()].unique()
for shared_name in repeated_names:
    # Positions (labels of the default integer index) of every occurrence.
    hits = cols.index[cols == shared_name].tolist()
    cols[hits] = [
        shared_name if k == 0 else shared_name + '.' + str(k)
        for k in range(len(hits))
    ]

# rename the columns with the cols list.
data.columns = cols

000
10
100
11
12
13
14
15
16
17
18
20
200
2015
2016
21
22
28
30
40
50
500
abedin
abortion
accept
access
accused
across
act
action
actions
activists
actually
added
address
administration
afghanistan
african
agency
agenda
agents
agree
ahead
air
al
aleppo
allies
almost
along
already
also
alternative
always
america
american
americans
among
announcement
another
answer
anti
anyone
anything
appeal
appear
arabia
area
argument
armed
arms
army
around
arrested
ask
asked
assad
assault
attack
attacks
attempt
attorney
avoid
away
back
backed
bad
ballot
ban
bank
barack
base
battle
become
becomes
begin
behind
believe
ben
bernie
best
better
beyond
bid
biden
big
biggest
bill
billion
black
blame
block
blood
body
boehner
bomb
border
born
brain
break
breaking
britain
brother
budget
build
bush
business
california
call
called
calling
calls
camp
campaign
campaigns
cancer
candidate
candidates
car
care
carolina
carson
case
cash
caucus
caucuses
cause
central
chairman
challenge
chance
change
changed
charged
charge

## Splitting Data For Final Evaluation

In [31]:
# Draw a reproducible 85% sample for model development.
data1 = data.sample(frac=0.85, random_state=12)

# The remaining 15% is held back for the final check. This must happen while
# data1 still carries its original index labels, i.e. before it is renumbered.
data2 = data.drop(index=data1.index)

# Renumber the development rows 0..n-1 (data2 keeps its original labels).
data1 = data1.reset_index(drop=True)

In [19]:
# Names of every integer/float column in `data`; passed to setup() below as
# numeric_features. The string-valued 'label' column is not selected.
num_ftr = (
    data
    .select_dtypes(include=['int16', 'int32', 'int64', 'float'])
    .columns
    .tolist()
)

## Setting up Env

In [75]:
from pycaret.classification import *

# Initialise the PyCaret experiment on the 85% development split, with 'label'
# as the target and every TF-IDF column declared numeric via num_ftr.
setup1 = setup(
    data=data1,
    target='label',
    numeric_features=num_ftr,
    train_size=0.82,
    session_id=258,
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,258
1,Target,label
2,Target Type,Binary
3,Label Encoded,"FAKE: 0, REAL: 1"
4,Original Data,"(5385, 3001)"
5,Missing Values,False
6,Numeric Features,3000
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


## Compare models and Tune best 2

In [78]:
# Compare the candidate classifiers and keep ten of them (n_select=10).
# Only top10[0] and top10[1] are used by the tuning cells that follow.
top10 = compare_models(n_select = 10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9167,0.9754,0.9159,0.9184,0.9171,0.8333,0.8334,5.3
lr,Logistic Regression,0.9146,0.9739,0.9037,0.9255,0.9142,0.8292,0.83,5.241
xgboost,Extreme Gradient Boosting,0.9137,0.9714,0.9118,0.9164,0.914,0.8274,0.8275,24.404
rf,Random Forest Classifier,0.9085,0.9688,0.9055,0.9123,0.9087,0.817,0.8173,39.307
svm,SVM - Linear Kernel,0.9067,0.0,0.9046,0.91,0.9071,0.8134,0.8138,0.482
ridge,Ridge Classifier,0.9067,0.0,0.9015,0.9125,0.9067,0.8134,0.8139,0.541
et,Extra Trees Classifier,0.9033,0.9667,0.9091,0.9001,0.9044,0.8065,0.8068,3.056
gbc,Gradient Boosting Classifier,0.8947,0.9595,0.8776,0.9101,0.8935,0.7894,0.79,13.613
ada,Ada Boost Classifier,0.8702,0.9436,0.8524,0.8858,0.8686,0.7405,0.7412,3.179
knn,K Neighbors Classifier,0.8149,0.8978,0.7354,0.8781,0.7998,0.6303,0.6395,0.633


In [80]:
# Tune the first model returned by compare_models; the comparison table above
# lists it as Light Gradient Boosting Machine. The output shows per-fold
# metrics for folds 0-9.
tuned1=tune_model(top10[0])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9186,0.9744,0.9189,0.9189,0.9189,0.8371,0.8371
1,0.9118,0.9718,0.9144,0.9103,0.9124,0.8235,0.8235
2,0.914,0.9758,0.9009,0.9259,0.9132,0.8281,0.8284
3,0.9072,0.9698,0.9058,0.9099,0.9079,0.8145,0.8145
4,0.9208,0.9697,0.9013,0.9393,0.9199,0.8417,0.8424
5,0.9025,0.9768,0.9054,0.9013,0.9034,0.805,0.805
6,0.941,0.9766,0.9324,0.9495,0.9409,0.8821,0.8822
7,0.898,0.9674,0.9144,0.8865,0.9002,0.7959,0.7963
8,0.8934,0.9594,0.8784,0.907,0.8924,0.7869,0.7873
9,0.9093,0.973,0.8964,0.9213,0.9087,0.8186,0.8189


In [81]:
# Score the tuned model without passing `data`. The output below covers rows
# 0-968, presumably PyCaret's internal hold-out created by
# setup(train_size=0.82) (TODO confirm against the PyCaret docs).
predict_model(tuned1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9351,0.9816,0.9278,0.9414,0.9346,0.8701,0.8702


Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,www,year.1,years.1,yes.1,yet.1,york.1,young.1,label,Label,Score
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014413,0.028323,0.0,0.018401,0.0,0.000000,REAL,REAL,0.9952
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.033266,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9096
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.025053,0.049234,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9992
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017673,0.000000,0.0,0.000000,0.0,0.000000,REAL,REAL,0.9805
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.026796,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027585,0.054209,0.0,0.000000,0.0,0.044974,REAL,REAL,0.9989
966,0.0,0.543003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.034204,0.067217,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9781
967,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9996
968,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.056388,0.000000,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.5276


In [82]:
# Tune the second model returned by compare_models; the comparison table above
# lists it as Logistic Regression. The output shows per-fold metrics for
# folds 0-9.
tuned2=tune_model(top10[1])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9208,0.9767,0.9144,0.9269,0.9206,0.8416,0.8417
1,0.914,0.9749,0.9279,0.9035,0.9156,0.828,0.8283
2,0.9186,0.9797,0.9144,0.9227,0.9186,0.8371,0.8371
3,0.9231,0.9805,0.8969,0.9479,0.9217,0.8462,0.8475
4,0.9186,0.9734,0.8879,0.9474,0.9167,0.8372,0.8389
5,0.9116,0.9713,0.9234,0.9031,0.9131,0.8231,0.8233
6,0.9184,0.9794,0.9099,0.9266,0.9182,0.8367,0.8369
7,0.9138,0.9651,0.9234,0.9071,0.9152,0.8276,0.8278
8,0.9184,0.9675,0.9054,0.9306,0.9178,0.8368,0.8371
9,0.932,0.9776,0.9099,0.9528,0.9309,0.864,0.8649


In [83]:
# Score the tuned model without passing `data`. The output below covers rows
# 0-968, presumably PyCaret's internal hold-out created by
# setup(train_size=0.82) (TODO confirm against the PyCaret docs).
predict_model(tuned2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9186,0.9728,0.901,0.9338,0.9171,0.8371,0.8376


Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,www,year.1,years.1,yes.1,yet.1,york.1,young.1,label,Label,Score
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014413,0.028323,0.0,0.018401,0.0,0.000000,REAL,REAL,0.9884
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.033266,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9856
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.025053,0.049234,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9856
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017673,0.000000,0.0,0.000000,0.0,0.000000,REAL,FAKE,0.8250
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.026796,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.8903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027585,0.054209,0.0,0.000000,0.0,0.044974,REAL,REAL,0.9958
966,0.0,0.543003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.034204,0.067217,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.8585
967,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.9486
968,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.056388,0.000000,0.0,0.000000,0.0,0.000000,FAKE,FAKE,0.8366


## Finalizing Model after tuning

In [84]:
# Finalize the tuned LightGBM model for use on the held-out data2 rows.
# NOTE(review): presumably finalize_model refits on all data given to setup()
# (confirm against the PyCaret docs).
final_model1 = finalize_model(tuned1)

In [85]:
# Finalize the tuned Logistic Regression model for use on the held-out data2
# rows.
# NOTE(review): presumably finalize_model refits on all data given to setup()
# (confirm against the PyCaret docs).
final_model2 = finalize_model(tuned2)

## Validation

In [86]:
# Predict on data2, the 15% of rows that were excluded from data1.
# NOTE(review): data2 still contains the 'label' column here, unlike the
# final_model2 call further down, and the metrics row printed for this call
# shows 0 for everything except AUC. The actual accuracy is computed later
# with accuracy_score. Consider passing data2.drop(columns='label') here as
# well so both models are evaluated the same way.
unseen_pred=predict_model(final_model1, data=data2)
unseen_pred

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0,0.9776,0,0,0,0,0


Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,www,year.1,years.1,yes.1,yet.1,york.1,young.1,label,Label,Score
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,REAL,REAL,0.9994
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.162052,0.000000,0.000000,FAKE,FAKE,0.9996
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.000000,0.015650,0.000000,REAL,REAL,0.9999
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,REAL,REAL,0.9953
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.022315,0.043853,0.07973,0.028490,0.000000,0.036382,FAKE,FAKE,0.9824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.056455,0.083207,0.00000,0.000000,0.000000,0.023010,FAKE,FAKE,0.9013
6314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.000000,0.089261,0.000000,FAKE,FAKE,0.9979
6317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.261756,0.00000,0.000000,0.000000,0.000000,FAKE,FAKE,0.9925
6332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,FAKE,FAKE,0.9969


In [87]:
# Predict on the held-out rows with the target column removed, so the model is
# given features only. The returned frame carries the prediction in 'Label'
# and its score in 'Score' (see the output below).
unseen_pred2=predict_model(final_model2, data=data2.drop(columns='label'))
unseen_pred2

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,wrote,www,year.1,years.1,yes.1,yet.1,york.1,young.1,Label,Score
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,REAL,0.9910
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.162052,0.000000,0.000000,FAKE,0.9949
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.015650,0.000000,REAL,0.9988
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,REAL,0.9493
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034091,0.0,0.022315,0.043853,0.07973,0.028490,0.000000,0.036382,FAKE,0.9699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021561,0.0,0.056455,0.083207,0.00000,0.000000,0.000000,0.023010,FAKE,0.8565
6314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.089261,0.000000,FAKE,0.8966
6317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.261756,0.00000,0.000000,0.000000,0.000000,FAKE,0.9896
6332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,FAKE,0.9871


In [88]:
# Accuracy of the finalized LightGBM model on the held-out rows: true labels
# from data2 against the 'Label' predictions.
score1 = accuracy_score(y_true=data2['label'], y_pred=unseen_pred['Label'])
score1

0.9231578947368421

In [89]:
# Accuracy of the finalized Logistic Regression model on the held-out rows:
# true labels from data2 against the 'Label' predictions.
score2 = accuracy_score(y_true=data2['label'], y_pred=unseen_pred2['Label'])
score2

0.9284210526315789

In [90]:
# Report whichever finalized model scored higher on the held-out rows; on a
# tie the second model is reported.
best_model = final_model2
if score1 > score2:
    best_model = final_model1
print(f'Model and tuned parameters that give better results: \n {best_model}')

Model and tuned parameters that give better results: 
 LogisticRegression(C=3.826, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=258, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
