# Fake news detection notebook

```This notebook illustrates an easy way to determine whether a given social or political post is fake/remotely fake/exaggerated or rightful news```

## Import necessary libraries

In [34]:
from zipfile import ZipFile

import numpy as np
import pandas as pd
from sklearn import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

## Extract news dataset from zip

In [35]:
# Unpack the dataset from the bundled archive. `ZipFile.extract` writes the
# member into the current working directory and returns the path it created;
# that path is what the loading cell passes to pandas.
with ZipFile('./datasets/news.zip', 'r') as news_zip:
    news_csv = news_zip.extract('news.csv')


### Importing CSV into pandas dataframe

In [36]:
# Load the extracted CSV into a DataFrame; `news_csv` is the path returned by
# `ZipFile.extract` in the extraction cell.
news = pd.read_csv(news_csv)

### Confirm news data was imported

In [37]:
# Show the first five rows as a quick check that the load succeeded and that
# the `title`, `text` and `label` columns are present.
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Split news into training and testing sets

In [38]:
# Hold out 20% of the articles for evaluation. The article body (`text`) is
# the feature and `label` is the target; the fixed seed keeps the split stable
# across runs.
x_train, x_test, y_train, y_test = train_test_split(
    news['text'],
    news['label'],
    test_size=0.2,
    random_state=7,
)

In [39]:
# TF-IDF features: drop English stop words and ignore terms that occur in more
# than 70% of the training documents.
vectoriser = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training text only, then apply
# that same fitted transform to the held-out text.
tfidf_train = vectoriser.fit_transform(x_train)
tfidf_test = vectoriser.transform(x_test)

In [40]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features.
# `random_state` is pinned (same seed as the train/test split) because the
# estimator shuffles the training data after each epoch; without a seed the
# accuracy and confusion matrix below change from one run to the next.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out articles and measure the fraction classified
# correctly.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)

# Last expression of the cell, so the notebook displays it as the cell output.
f'Fake news detection Accuracy: {round(score*100, 2)}%'

'Fake news detection Accuracy: 93.21%'

### Confusion matrix to provide insight into the number of false and true negatives and positives

In [41]:
# Rows are the true labels and columns the predicted labels, both ordered as
# given in `labels`: row 0 / column 0 is FAKE, row 1 / column 1 is REAL.
confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

array([[595,  43],
       [ 43, 586]])