>>## Fake News Classifier NLP project

In [244]:
# library imported

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from nltk import sent_tokenize,word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier     # it gives good performance on the text data
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle

In [116]:
df = pd.read_csv('fake-news/test.csv')   # from the current working directory and use the forwardslash
df.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [118]:
df.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [119]:
round(df.isnull().sum()/df['id'].shape[0]*100)       # percentage wise null value

id         0.0
title      2.0
author    10.0
text       0.0
dtype: float64

In [120]:
labels = pd.read_csv('fake-news/submit.csv')
labels.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [121]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      5200 non-null   int64
 1   label   5200 non-null   int64
dtypes: int64(2)
memory usage: 81.4 KB


In [122]:
df.shape,labels.shape

((5200, 4), (5200, 2))

In [123]:
df2 = pd.merge(left=df,right=labels,how='inner',on='id')      # merged with labels data

In [124]:
df2.head()

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1


In [125]:
df2.isnull().sum()     # removed null value

id          0
title     122
author    503
text        7
label       0
dtype: int64

In [126]:
round(df2.isnull().sum()/df['id'].shape[0]*100)       # percentage wise null value

id         0.0
title      2.0
author    10.0
text       0.0
label      0.0
dtype: float64

In [127]:
df2 = df2.dropna()    # removed null value

In [128]:
round(df2.isnull().sum()/df['id'].shape[0]*100)       # percentage wise null value

id        0.0
title     0.0
author    0.0
text      0.0
label     0.0
dtype: float64

In [129]:
df2.head(2)

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0


In [130]:
df2['author'].nunique()      # total authors

1639

In [131]:
df2['label'].value_counts()   # according to the labels this is a balanced data

1    2362
0    2213
Name: label, dtype: int64

In [132]:
df2['id'].nunique(),df2.shape     # all are unique id

(4575, (4575, 5))

In [133]:
df2.shape

(4575, 5)

In [134]:
df2[['title','text']].head(2)       # i will use both are columns as an independent variables

Unnamed: 0,title,text
0,"Specter of Trump Loosens Tongues, if Not Purse...","PALO ALTO, Calif. — After years of scorning..."
2,#NoDAPL: Native American Leaders Vow to Stay A...,Videos #NoDAPL: Native American Leaders Vow to...


In [58]:
df2.head(20)      # i will set the index again

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...,1
10,20810,184 U.S. generals and admirals endorse Trump f...,Dr. Eowyn,Have you seen that pro-Hillary TV ad of disgra...,0
11,20811,“Working Class Hero” by John Brennon,Doug Diamond,"Source: CNBC, article by Robert Ferris Arctic ...",1
12,20812,The Rise of Mandatory Vaccinations Means the E...,Shaun Bradley,Written by Shaun Bradley Mandatory vaccinati...,0
13,20813,Communists Terrorize Small Business,Steve Watson,Store Communists Terrorize Small Business The ...,1


In [135]:
df2 = df2.reset_index().drop('index',axis=1)

In [136]:
df2.head(20)

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
2,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
3,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
4,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1
5,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...,1
6,20810,184 U.S. generals and admirals endorse Trump f...,Dr. Eowyn,Have you seen that pro-Hillary TV ad of disgra...,0
7,20811,“Working Class Hero” by John Brennon,Doug Diamond,"Source: CNBC, article by Robert Ferris Arctic ...",1
8,20812,The Rise of Mandatory Vaccinations Means the E...,Shaun Bradley,Written by Shaun Bradley Mandatory vaccinati...,0
9,20813,Communists Terrorize Small Business,Steve Watson,Store Communists Terrorize Small Business The ...,1


In [137]:
# take the single column of title and text
for j in range(len(df2)):
    title = df2.iloc[j,1]
    text = df2.iloc[j,3]
    full_news = title+' '+text
    df2.loc[j,'full_news'] = full_news

In [138]:
df2.head()

Unnamed: 0,id,title,author,text,label,full_news
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0,"Specter of Trump Loosens Tongues, if Not Purse..."
1,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0,#NoDAPL: Native American Leaders Vow to Stay A...
2,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1,"Tim Tebow Will Attempt Another Comeback, This ..."
3,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1,Keiser Report: Meme Wars (E995) 42 mins ago 1 ...
4,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1,Pelosi Calls for FBI Investigation to Find Out...


### to verify the join operation of columns

In [139]:
len(df2['title'][2])+len(df2['text'][2])

3661

In [140]:
len(df2['full_news'][2])     # i was added a single space in between, that,s why i got one extra length

3662

In [141]:
x = df2['full_news']
y = df2['label']        # set the variable 

## text pre-processing

In [142]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [143]:
# it will may take sometime.
for i in range(len(x)):
    content = re.sub('[^a-zA-Z]',' ',x[i])
    ls_of_sentence = sent_tokenize(content)
    for sentence in ls_of_sentence:
        ls_of_word = word_tokenize(sentence)
        removed_stopword_stemmed_word = [ps.stem(word) for word in ls_of_word if word not in set(stopwords.words('english'))]
        x[i] = ' '.join(removed_stopword_stemmed_word)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i] = ' '.join(removed_stopword_stemmed_word)


In [146]:
x[1]          # cleaned data

'nodapl nativ american leader vow stay all winter file lawsuit against polic video nodapl nativ american leader vow stay all winter file lawsuit against polic amnesti intern send deleg human right observ monitor respons law enforc protest be sociabl share rob wilson photo nativ american leader vow saturday protest winter north dakota oil pipelin say threaten water resourc sacr land plan lawsuit polic treatment arrest protest stand rock sioux chairman dave archambault ii said tribal leader work provid food heat shelter protest oppos billion dakota access pipelin we work technic detail far land type land use perman structur archambault told report mandan north dakota saturday morn at least shelter readi tribal land temperatur fall fahrenheit celsiu day time said it put water risk said archambault join cheyenn river sioux chairman harold frazier the two leader said consid take legal action law enforc stand rock chairman dave archambault ii said peopl injur includ broken bone welt rubber b

In [149]:
type(x),x.shape

(pandas.core.series.Series, (4575,))

In [151]:
tfidf = TfidfVectorizer(max_features = 3000,ngram_range=(1,2))

In [152]:
train_data = tfidf.fit_transform(x)

In [153]:
train_data.shape

(4575, 3000)

In [154]:
pickle.dump(tfidf,open('tfidf_model.pkl','wb'))      # save the ifidf model for the new input

In [158]:
tfidf.get_feature_names()[0:10]     # to get the feature name

['abandon',
 'abc',
 'abedin',
 'abil',
 'abl',
 'abort',
 'about',
 'abroad',
 'absolut',
 'abus']

In [182]:
len(tfidf.get_feature_names()) # total 3000 features



3000

In [162]:
tfidf.get_params()     # to get the parameters

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 3000,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [176]:
train_data = train_data.toarray()       # converted sparse metrix into numpy array

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.0358139, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [185]:
train_data

<4575x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 1018854 stored elements in Compressed Sparse Row format>

In [197]:
y.shape

(4575,)

In [198]:
y[0:10]     # labels

0    0
1    0
2    1
3    1
4    1
5    1
6    0
7    1
8    0
9    1
Name: label, dtype: int64

### train test split

In [199]:
x_train,x_test,y_train,y_test = train_test_split(train_data,y,test_size=0.2,random_state=0)

In [200]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((3660, 3000), (915, 3000), (3660,), (915,))

### model building

In [202]:
mnb = MultinomialNB()    # you can change   the alpha value in btwn o to 1

In [203]:
mnb.fit(x_train,y_train)

In [205]:
mnb.score(x_train,y_train)      # on training data

0.8177595628415301

In [206]:
mnb.score(x_test,y_test)      # on testing data

0.7617486338797814

In [207]:
mnb2 = MultinomialNB(alpha=0.5)

In [208]:
mnb2.fit(x_train,y_train)

In [209]:
mnb2.score(x_train,y_train)      # on training data

0.8172131147540984

In [210]:
mnb2.score(x_test,y_test)      # on testing data

0.7551912568306011

In [211]:
#### --------->   PassiveAggressiveClassifier


pac = PassiveAggressiveClassifier()

In [212]:
pac.fit(x_train,y_train)

In [213]:
pac.score(x_train,y_train)      # on training data

1.0

In [214]:
pac.score(x_test,y_test)      # on testing data

0.8032786885245902

In [216]:
## ------------>  svc

sv = SVC()
sv.fit(x_train,y_train)

sv.score(x_test,y_test)      # on testing data

0.8743169398907104

In [217]:
sv.score(x_train,y_train)      # on training data

0.983879781420765

In [None]:
## -------------RandomForestClassifier

rdf = RandomForestClassifier()
rdf.fit(x_train,y_train)


In [220]:
rdf.score(x_train,y_train)      # on training data

1.0

In [221]:
rdf.score(x_train,y_train)      # on training data

1.0

### save the models

In [222]:
pickle.dump(rdf,open('fake_newsClas_RndmFo.pkl','wb'))

In [223]:
pickle.dump(pac,open('fake_newsClas_PAC.pkl','wb'))

In [225]:
pickle.dump(sv,open('fake_newsClas_SVC.pkl','wb'))

### prediction

In [226]:
y_pred = rdf.predict(x_test)
y_pred[0:10]

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 1], dtype=int64)

In [230]:
print(y_test[0:10],end=' ')

1319    0
4020    1
2359    0
1757    1
1011    1
3627    0
2139    0
910     1
4554    0
3559    1
Name: label, dtype: int64 

In [231]:
y_test.value_counts()           # of the actual

1    493
0    422
Name: label, dtype: int64

In [233]:
pd.Series(y_pred).value_counts()     # of the predicted

1    474
0    441
dtype: int64

### model evaluating by classification report

In [234]:
print(classification_report(y_pred,y_test))         ## awesomoe

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       441
           1       0.92      0.96      0.94       474

    accuracy                           0.94       915
   macro avg       0.94      0.93      0.94       915
weighted avg       0.94      0.94      0.94       915



In [235]:
confusion_matrix(y_pred,y_test)

array([[402,  39],
       [ 20, 454]], dtype=int64)

In [256]:
predicted = pd.Series(y_pred)

In [261]:
df3 = pd.DataFrame(y_test)

In [264]:
df3['predicted'] = y_pred

In [267]:
df3[0:50]

Unnamed: 0,label,predicted
1319,0,0
4020,1,1
2359,0,0
1757,1,1
1011,1,1
3627,0,0
2139,0,0
910,1,0
4554,0,0
3559,1,1


# Project completed 😁😍😋😎😋