In [8]:
# import the libraries
import pandas as pd
import numpy as np
import nltk 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report
from sklearn.linear_model import LogisticRegression

import re
import string

In [9]:
# Read both source CSVs (paths are relative to the working directory),
# true.csv first and fake.csv second.
true_csv, fake_csv = (
    pd.read_csv(csv_path) for csv_path in ('Data/true.csv', 'Data/fake.csv')
)

In [10]:
# Show the first three rows of true_csv as a quick sanity check of the load.
true_csv.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [11]:
# Report (rows, columns) for each source frame.
print(f'true:{true_csv.shape}, fake:{fake_csv.shape}')

true:(21417, 4), fake:(23481, 4)


In [12]:
# List the column names of true_csv as a plain Python list.
true_csv.columns.tolist()

['title', 'text', 'subject', 'date']

In [13]:
# Tag every row with its target label: 1 for true_csv rows, 0 for fake_csv rows.
for frame, label in ((true_csv, 1), (fake_csv, 0)):
    frame['class'] = label

In [14]:
# Stack both labelled frames into one dataset with a fresh index.
labelled_frames = [true_csv, fake_csv]
df = pd.concat(labelled_frames, ignore_index=True)
print(f'total {df.shape}')

total (44898, 5)


In [15]:
# Shuffle all rows with a fixed seed so the order is repeatable, then
# renumber the index so it no longer reflects the pre-shuffle positions.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

In [16]:
# Show the first three rows of the combined, shuffled frame.
df.head(3)

Unnamed: 0,title,text,subject,date,class
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0


In [17]:
# Keep only the article body and its label. Reassigning (instead of
# inplace=True) together with errors='ignore' makes this cell safe to re-run:
# on a second execution the columns are already gone and the drop is a no-op
# rather than a KeyError.
df = df.drop(columns=['title', 'subject', 'date'], errors='ignore')


In [18]:
# Count missing values per column.
df.isnull().sum()

text     0
class    0
dtype: int64

In [19]:
# Display the columns currently present in df.
df.columns

Index(['text', 'class'], dtype='object')

In [20]:
# Text normalisation: strip markup and noise, then lowercase.

def clean_text(text):
    """Normalise a raw text string.

    Steps, in order: drop bracketed spans ([..], (..), {..}), HTML tags,
    URLs, @mentions, #hashtags and digits; remove punctuation; turn newlines
    into spaces; collapse runs of whitespace; lowercase.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lowercased text ('' if nothing survives cleaning).
    """
    text = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', text)   # bracketed spans
    text = re.sub(r'<[^>]+>', '', text)                   # HTML tags
    text = re.sub(r'(https?://\S+|www\.\S+)', '', text)   # URLs
    text = re.sub(r'@\w+', '', text)                      # mentions
    text = re.sub(r'#\w+', '', text)                      # hashtags
    text = re.sub(r'\d+', '', text)                       # digits
    text = re.sub(r'[^\w\s]', '', text)                   # punctuation
    text = re.sub(r'\n', ' ', text)                       # newlines -> spaces
    text = re.sub(r'\s+', ' ', text).strip()              # extra whitespace

    # str.lower() returns a new string, so its result has to be used;
    # calling it as a bare statement leaves the text in its original case.
    return text.lower()

In [21]:
# Run clean_text over every row and overwrite the 'text' column with the result.
df['text'] = df['text'].apply(clean_text)

In [22]:
# Features are the cleaned text; the target is the class label column.
X, Y = df['text'], df['class']

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Fit the TF-IDF vectorizer on the training split only, then reuse that same
# fitted vectorizer to transform the test split.
vect = TfidfVectorizer()
xv_train = vect.fit_transform(X_train)
xv_test = vect.transform(X_test)

In [43]:
# Train a logistic-regression classifier (default settings) on the TF-IDF
# training matrix.
LR = LogisticRegression()
LR.fit(xv_train, y_train)


In [44]:
# Accuracy of the logistic-regression model on the held-out split. The
# predictions are computed once and scored directly; LR.score(xv_test, y_test)
# would call predict() again internally and leave y_pred unused.
y_pred = LR.predict(xv_test)
accuracy_score(y_test, y_pred)

0.9846325167037862

In [41]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the same TF-IDF training
# matrix used for the logistic-regression model.
NB = MultinomialNB()
NB.fit(xv_train,y_train)


In [42]:
# Predict the held-out split with the Naive Bayes model. This rebinds y_pred,
# replacing the logistic-regression predictions stored under the same name.
y_pred = NB.predict(xv_test)

# Overall accuracy followed by per-class precision / recall / F1.
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report\n{classification_report(y_test, y_pred)}")


Accuracy Score: 0.944097995545657
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      4669
           1       0.94      0.94      0.94      4311

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



In [45]:
import pandas as pd
def output_lable(n):
    """Translate a predicted class id into its display message.

    0 maps to the fake-news message and 1 to the not-fake message; any other
    value yields None.
    """
    messages = (
        (0, 'This is fake news'),
        (1, 'This is not a fake'),
    )
    for label, message in messages:
        if n == label:
            return message
    return None
def manual_testing(news):
    """Classify a single news string with both fitted models and print the results.

    The text is cleaned with clean_text, vectorized with the already-fitted
    `vect`, and passed to the `LR` and `NB` models. All three are read from
    the notebook's global namespace, so the cells that create them must have
    been run first.

    Parameters
    ----------
    news : str
        Raw article text or headline to classify.

    Returns
    -------
    None
        The two predictions are printed, not returned.
    """
    # transform() accepts any iterable of strings, so a one-element list
    # replaces the one-row DataFrame round-trip.
    features = vect.transform([clean_text(news)])
    pred_LR = LR.predict(features)
    pred_NB = NB.predict(features)

    print("LR Prediction  →", output_lable(pred_LR[0]))
    # This line reports the Naive Bayes model, so it is labelled NB rather than RF.
    print("NB Prediction  →", output_lable(pred_NB[0]))
    


In [46]:

# Smoke-test both models on a made-up headline.
manual_testing("Breaking: Scientists confirm aliens landed in New York!")


LR Prediction  → This is fake news
RF Prediction  → This is fake news


In [49]:
# Save the fitted logistic-regression model to disk.
import pickle

# NOTE: only the classifier is persisted here. manual_testing() also needs the
# fitted `vect` to turn text into features, and this cell does not save it.
# The `with` block closes the file handle (and flushes the write) when the
# dump finishes.
with open('model_LR.pkl', 'wb') as model_file:
    pickle.dump(LR, model_file)

In [50]:
# Load the model back from disk and re-run the manual check with it.
# pickle.load can execute arbitrary code while deserialising, so only load
# pickle files that come from a trusted source (here, the file written above).
with open('model_LR.pkl', 'rb') as model_file:
    LR = pickle.load(model_file)
news = "Government announces new economic policy."
manual_testing(news)


LR Prediction  → This is fake news
RF Prediction  → This is not a fake
