# <center>Fake News Analysis</center>

### Data Ingestion

In [1]:
import pandas as pd
data = pd.read_csv('Datasets/train.csv')

In [2]:
data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


### Describing the data

In [3]:
data.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


### Preprocessing the dataset

In [4]:
data.set_index('id',inplace=True)

### Checking whether the data has null values

In [5]:
for i in data.columns:
    print(f'{i} has {data[i].isnull().sum()} values')

title has 558 values
author has 1957 values
text has 39 values
label has 0 values


### Handling the missing values

In [6]:
data.dropna(axis=0,inplace=True)

In [7]:
data.shape

(18285, 4)

### Natural language processing on the text data

In [8]:
data.drop(columns=['title','author'],axis=1,inplace=True)

In [9]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [119]:
ps = PorterStemmer()
tf = TfidfVectorizer(max_features=73,ngram_range=(1,3))

In [113]:
def parser(text):
    string_manipulation = re.sub("[^a-zA-Z]+",' ',string=str(text)).lower().split()
    string_manipulation = [ps.stem(words) for words in string_manipulation if words not in stopwords.words('english')]
    string_manipulation = ' '.join(string_manipulation)
    return string_manipulation

### Independent and the dependent variable

In [82]:
data['text'] = data['text'].apply(parser)

In [83]:
X = data['text']
X = tf.fit_transform(X)
y = data['label']

### Train, Test split

In [84]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [85]:
print('Shape of the training data : ',(x_train.shape,y_train.shape))
print('Shape of the testing data : ',(x_test.shape,y_test.shape))

Shape of the training data :  ((12799, 73), (12799,))
Shape of the testing data :  ((5486, 73), (5486,))


In [86]:
x_train,x_val,y_train,y_val = train_test_split(x_train,y_train,test_size=0.3)

In [87]:
print('Shape of the training data : ',(x_train.shape,y_train.shape))
print('Shape of the testing data : ',(x_val.shape,y_val.shape))

Shape of the training data :  ((8959, 73), (8959,))
Shape of the testing data :  ((3840, 73), (3840,))


### Machine Learning Models

In [88]:
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

In [89]:
models = [LogisticRegression(),BernoulliNB(),SVC(),PassiveAggressiveClassifier()]
for model in models:
    model.fit(x_val,y_val)
    print(f'Score of {model} model : {model.score(x_val,y_val)}')

Score of LogisticRegression() model : 0.8596354166666667
Score of BernoulliNB() model : 0.6778645833333333
Score of SVC() model : 0.9424479166666667
Score of PassiveAggressiveClassifier() model : 0.8325520833333333


In [90]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [91]:
model = PassiveAggressiveClassifier()
model.fit(x_train,y_train)

In [92]:
y_pred = model.predict(x_test)

### Evaluation metrics

In [93]:
print('The accuracy of the model is : ','%.2f'%(accuracy_score(y_test,y_pred)*100),'%')

The accuracy of the model is :  81.72 %


In [94]:
print(confusion_matrix(y_test,y_pred))

[[2513  598]
 [ 405 1970]]


In [95]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.81      0.83      3111
           1       0.77      0.83      0.80      2375

    accuracy                           0.82      5486
   macro avg       0.81      0.82      0.82      5486
weighted avg       0.82      0.82      0.82      5486



### Classification based on new news

In [96]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
news = '''When the lockdown started, I felt like what I needed was to get lost in another time and place. I wanted to read something totally removed from our current context, like Jane Austen, or something epic and absorbing, like Hilary Mantel’s Wolf Hall trilogy, to take me away from everything that was happening IRL. Instead I ended up watching a bunch of Jane Austen film adaptations and the BBC mini-series of Wolf Hall.
My actual reading list ended up being, well, short. Sally Rooney’s Conversations With Friends did nothing but make me yearn for the vapid pleasures of our before-pandemic social lives (remember gossip!?) and after about 150 pages of what I found to be overly florid and weirdly ethereal prose in Ocean Vuong’s On Earth We Are Briefly Gorgeous I felt like I was going to have a panic attack.
'''
t = parser(news)
new = cv.fit_transform([t])

In [97]:
print(model.predict(new))

[0]
