# Fake news Detection

### Importing required library
Here I am going to importing some of the required library, if extra library is required to install It will be install later on.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

### Inserting fake and real dataset

In [47]:
df = pd.read_csv("data/task_3a_sample_data.csv")

In [48]:
df.head(5)

Unnamed: 0,public_id,title,text,our rating
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false


In [49]:
df.shape

(50, 4)

In [50]:
df['class'] = df['our rating'].apply(lambda x: 1 if 'true' in x.lower() else ( 0 if 'partially false' in x.lower() else -1))

In [64]:
df.head(10)

Unnamed: 0,public_id,title,text,our rating,class
0,f2182a54,HUGE! Attorney Sidney Powell CONFIRMS Alleged ...,Last week Rep. Louie Gohmert told Chris Salced...,FALSE,-1
1,c5175d8d,Paul Ryan’s Worst Ally - The New York Times,WHATEVER drama plays out when Republicans meet...,TRUE,1
2,213a870b,"You Can Get Jail Time Or $3,000 Fine For Not W...",Source page URL Title You Can Get Jail Time O...,FALSE,-1
3,392886ea,Antifa gearing up for false flag violence disg...,With merchants in Democrat-run cities boarding...,FALSE,-1
4,bc6d5d55,Remarks by President Biden on the Administrati...,State Dining Room 4:22 P.M. EST THE PRESIDEN...,partially false,0
5,59960d0e,Infowars Article,Keep up to date with our latest: Have an impo...,FALSE,-1
6,b8437efb,BOMBSHELL: Covid-19 infection rate may be 440%...,(Natural News) A 2012 study published in the j...,FALSE,-1
7,faf024d6,Marine Corps. Rebukes Pelosi: “WE DON’T WORK F...,Latest Breaking News: Martial Law Imminent Ge...,FALSE,-1
8,0f086930,"You Can be Fined $2,500 And Banned From Drivin...",Smoking could be considered a distraction unde...,FALSE,-1
9,daafc154,Scott Walker still owes $1 million for preside...,Gov. Scott Walker said Friday his presidential...,partially false,0


In [65]:
df.columns

Index(['public_id', 'title', 'text', 'our rating', 'class'], dtype='object')

#### "title",  "our rating" and "public_id" columns is not required for detecting the fake news, so I am going to drop the columns.

In [69]:
df_t = df.drop([ "our rating","public_id"], axis = 1)

In [71]:
df_t = df_t.drop([ "title"], axis = 1)

In [72]:
df_t.head()

Unnamed: 0,text,class
0,Last week Rep. Louie Gohmert told Chris Salced...,-1
1,WHATEVER drama plays out when Republicans meet...,1
2,Source page URL Title You Can Get Jail Time O...,-1
3,With merchants in Democrat-run cities boarding...,-1
4,State Dining Room 4:22 P.M. EST THE PRESIDEN...,0


In [73]:
df_t.isnull().sum()


text     0
class    0
dtype: int64

#### Randomly shuffling the dataframe 

In [74]:
df_t.reset_index(inplace = True)
df_t.drop(["index"], axis = 1, inplace = True)

In [75]:
df_t.columns

Index(['text', 'class'], dtype='object')

In [76]:
df_t.head()

Unnamed: 0,text,class
0,Last week Rep. Louie Gohmert told Chris Salced...,-1
1,WHATEVER drama plays out when Republicans meet...,1
2,Source page URL Title You Can Get Jail Time O...,-1
3,With merchants in Democrat-run cities boarding...,-1
4,State Dining Room 4:22 P.M. EST THE PRESIDEN...,0


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [77]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [78]:
df_t["text"] = df_t["text"].apply(wordopt)

In [62]:
df_t["title"] = df_t["title"].apply(wordopt)

KeyError: 'title'

In [79]:
df_t.head(10)

Unnamed: 0,text,class
0,last week rep louie gohmert told chris salced...,-1
1,whatever drama plays out when republicans meet...,1
2,source page url title you can get jail time o...,-1
3,with merchants in democrat run cities boarding...,-1
4,state dining room p m est the president ...,0
5,keep up to date with our latest have an impo...,-1
6,natural news a study published in the journ...,-1
7,latest breaking news martial law imminent ge...,-1
8,smoking could be considered a distraction unde...,-1
9,gov scott walker said friday his presidential...,0


#### Defining dependent and independent variable as x and y

In [80]:
x = df_t["text"]
y = df_t["class"]

#### Splitting the dataset into training set and testing set. 

In [81]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

#### Convert text to vectors

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [83]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### 1. Logistic Regression

In [84]:
from sklearn.linear_model import LogisticRegression

In [85]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [86]:
pred_lr=LR.predict(xv_test)

In [87]:
LR.score(xv_test, y_test)

0.46153846153846156

In [88]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

          -1       0.42      1.00      0.59         5
           0       1.00      0.17      0.29         6
           1       0.00      0.00      0.00         2

    accuracy                           0.46        13
   macro avg       0.47      0.39      0.29        13
weighted avg       0.62      0.46      0.36        13

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2. Decision Tree Classification

In [89]:
from sklearn.tree import DecisionTreeClassifier

In [90]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [91]:
pred_dt = DT.predict(xv_test)

In [92]:
DT.score(xv_test, y_test)

0.5384615384615384

In [93]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

          -1       0.60      0.60      0.60         5
           0       0.57      0.67      0.62         6
           1       0.00      0.00      0.00         2

    accuracy                           0.54        13
   macro avg       0.39      0.42      0.41        13
weighted avg       0.49      0.54      0.51        13



### 3. Gradient Boosting Classifier

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

In [95]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [96]:
pred_gbc = GBC.predict(xv_test)

In [97]:
GBC.score(xv_test, y_test)

0.6153846153846154

In [98]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

          -1       0.50      0.80      0.62         5
           0       0.80      0.67      0.73         6
           1       0.00      0.00      0.00         2

    accuracy                           0.62        13
   macro avg       0.43      0.49      0.45        13
weighted avg       0.56      0.62      0.57        13

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 4. Random Forest Classifier

In [99]:
from sklearn.ensemble import RandomForestClassifier

In [100]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [101]:
pred_rfc = RFC.predict(xv_test)

In [102]:
RFC.score(xv_test, y_test)

0.46153846153846156

In [103]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

          -1       0.40      0.80      0.53         5
           0       0.67      0.33      0.44         6
           1       0.00      0.00      0.00         2

    accuracy                           0.46        13
   macro avg       0.36      0.38      0.33        13
weighted avg       0.46      0.46      0.41        13

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model Testing With Manual Entry

### News

In [104]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(output_lable(pred_LR[0]), 
                                                                                                              output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_GBC[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [105]:
news = str(input())
manual_testing(news)



LR Prediction: None 
DT Prediction: None 
GBC Prediction: Fake News 
RFC Prediction: None
