#  Importing Libraries


In [1]:
import pandas as pd
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
import pickle
import nltk
from sklearn.calibration import CalibratedClassifierCV

# # Reading CSV file

In [2]:
# Read train.csv into a DataFrame, assigned to news_dataset
news_dataset= pd.read_csv('train.csv')

In [3]:
# Prints the first 5 rows of a DataFrame as default
#here "0" means REAL NEWS & "1" mean FAKE NEWS according to dataset
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Prints no. of rows and columns of a DataFrame
news_dataset.shape

(20800, 5)

In [5]:
#Checking NULL values in dataset
#returns the number of missing values in the data set
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
#Pandas DataFrame dropna() function is used to remove rows and columns with Null/NaN values.
news_dataset=news_dataset.dropna()

In [7]:
#Checking again the Null values
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [8]:
#after dropping Null values
news_dataset.shape

(18285, 5)

In [9]:
# Here text column is assigned to x and label column assigned to y
x= news_dataset['text']
y= news_dataset['label']

In [10]:
# X contains the text Column
x

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October 29, ...
3        Videos 15 Civilians Killed In Single US Airstr...
4        Print \nAn Iranian woman has been sentenced to...
                               ...                        
20795    Rapper T. I. unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy’s of today grew from the union of sev...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author, activist, journa...
Name: text, Length: 18285, dtype: object

In [11]:
# Y contains the Label column
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64

#  Cleaning and preprocessing

# 1. Regex

In [12]:
#Remove punctuations from the String  
s = "!</> hello there$$ </>^t!!!h%%i&&%$s@@@^^^&&!& </>*is@# fake&&\ news@@@##%^ ^&dete!@ction# %%$"

In [13]:
#using re module (Regular Expression)
import re
s = re.sub(r'[^\w\s]','',s)

In [14]:
print(s)

 hello there this is fake news detection 


# 2. Tokenization

In [15]:
#Downloading nltk data
#NLTK data package includes a pre-trained Punkt tokenizer for English.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
nltk.word_tokenize("Hello there this is Fake news Detection Project")

['Hello', 'there', 'this', 'is', 'Fake', 'news', 'Detection', 'Project']

# 3. StopWords

In [17]:
from nltk.corpus import stopwords
#corpus is a large and structured set of texts.

stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
sentence = "Covid-19 pandemic has impacted many countries and what it did to economy is very stressful"

In [19]:
words = nltk.word_tokenize(sentence)
words = [w for w in words if w not in stop_words]

In [20]:
words

['Covid-19',
 'pandemic',
 'impacted',
 'many',
 'countries',
 'economy',
 'stressful']

# 4. Lemmatization (Superior than Stemming)

In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

input_str="been had done languages cities mice"

In [22]:
actor,acting

NameError: name 'actor' is not defined

In [23]:
#Tokenize the sentence
input_str=nltk.word_tokenize(input_str)

#Lemmatize each word
for word in input_str:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


# Let's Apply

In [24]:
lemmatizer=WordNetLemmatizer()
for index,row in news_dataset.iterrows():
    filter_sentence = ''
    
    sentence = row['text']
    sentence = re.sub(r'[^\w\s]','',sentence) #cleaning
    
    words = nltk.word_tokenize(sentence) #tokenization
    
    words = [w for w in words if not w in stop_words]  #stopwords removal
    
    for word in words:
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(word)).lower()
        
    news_dataset.loc[index,'text'] = filter_sentence  #.loc attribute to access a particular cell in the given Dataframe using the index and column labels.

In [25]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,house dem aide we didnt even see comeys lette...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,ever get feeling life circle roundabout rathe...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,why truth might get you fired october 29 2016...,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,videos 15 civilians killed in single us airst...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,print an iranian woman sentenced six year pri...,1


In [26]:
# Here text column is assigned to x and label column assigned to y
x= news_dataset['text']
y= news_dataset['label']

# Applying NLP Techniques

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Splitting data in Training and Testing

In [28]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

# TF-iDF Vectorizer

In [29]:
tfvect = TfidfVectorizer(stop_words='english',max_df=0.7)
tfid_x_train = tfvect.fit_transform(x_train)
tfid_x_test = tfvect.transform(x_test)


In [30]:
print(tfid_x_train)

  (0, 137752)	0.036976566496463206
  (0, 85271)	0.044710881314325567
  (0, 97113)	0.04030211073922633
  (0, 58928)	0.014749977900084315
  (0, 45489)	0.03668860481133672
  (0, 48102)	0.03315794592722513
  (0, 80105)	0.01821718492854408
  (0, 86625)	0.028903281294887617
  (0, 60794)	0.025231842790051612
  (0, 25117)	0.03650350446878842
  (0, 148593)	0.024259607649512398
  (0, 128167)	0.029162438702899342
  (0, 111583)	0.017868789204711265
  (0, 6907)	0.01890303291187917
  (0, 131394)	0.032762013888570245
  (0, 28916)	0.0396031983562183
  (0, 53380)	0.018860967366340815
  (0, 51022)	0.02592417006796663
  (0, 40999)	0.03138339156523
  (0, 145785)	0.02179580841555102
  (0, 73253)	0.037648202749699926
  (0, 10061)	0.03205902925912119
  (0, 61749)	0.02290847773546937
  (0, 72933)	0.030866461452400077
  (0, 61678)	0.03573036766937597
  :	:
  (14627, 119329)	0.00900254162854916
  (14627, 95271)	0.016056332449802133
  (14627, 79765)	0.032622444999008425
  (14627, 49123)	0.01200159021766943
  (14

# Modelling

In [31]:
#Accuracy score on training data
pac= PassiveAggressiveClassifier(max_iter= 50)
clf = CalibratedClassifierCV(pac)
clf.fit(tfid_x_train,y_train)
X_train_prediction= clf.predict(tfid_x_train)
score = accuracy_score(y_train,X_train_prediction)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.99%


In [32]:
#Accuracy score on test data
pac= PassiveAggressiveClassifier(max_iter= 50)
clf = CalibratedClassifierCV(pac)
clf.fit(tfid_x_train,y_train)
X_test_prediction= clf.predict(tfid_x_test)
score = accuracy_score(y_test,X_test_prediction)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.58%


# Pipelining the Model

In [34]:
pipeline = Pipeline([('tfidf',TfidfVectorizer(stop_words='english')),
                    ('PacModel', CalibratedClassifierCV(pac))])

In [35]:
pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('PacModel',
                 CalibratedClassifierCV(base_estimator=PassiveAggressiveClassifier(max_iter=50)))])

In [36]:
score= pipeline.score(x_test,y_test)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.77%


In [37]:
pred= pipeline.predict(x_test)

In [41]:
print(pred[4])

0


In [42]:
y_test

16924    1
13869    0
4264     0
9200     1
14912    0
        ..
20455    1
4495     0
6712     0
4988     0
4329     1
Name: label, Length: 3657, dtype: int64

In [43]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2072
           1       0.96      0.96      0.96      1585

    accuracy                           0.97      3657
   macro avg       0.97      0.97      0.97      3657
weighted avg       0.97      0.97      0.97      3657



In [44]:
print(confusion_matrix(y_test,pred))

[[2015   57]
 [  61 1524]]


In [45]:
with open('Clfpac.pkl', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol= pickle.HIGHEST_PROTOCOL)