# Fake news classification with TF-IDF features and Random forset classifier

## Import libararies

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score


## Load data

In [2]:
real_news = pd.read_csv('./data/True.csv')
fake_news = pd.read_csv('./data/Fake.csv')

In [3]:
print(f'Real new shape:{real_news.shape}\nFake news shape:{fake_news.shape}')

Real new shape:(21417, 4)
Fake news shape:(23481, 4)


In [4]:
#take ~ 30% of both datasets
real_news = real_news[0:6000:1]
fake_news = fake_news[0:6000:1]

In [5]:
#combine title and text in one column in real news
real_news['allText'] = real_news['title'] + " " + real_news['text']
real_news.head(5)

Unnamed: 0,title,text,subject,date,allText
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017","As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Trump wants Postal Service to charge 'much mor...


In [6]:
#combine title and text in one column in fake news
fake_news['allText'] = fake_news['title'] + " " + fake_news['text']
fake_news.head(5)

Unnamed: 0,title,text,subject,date,allText
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Pope Francis Just Called Out Donald Trump Dur...


In [7]:
#add a lebal column real = 1 , fake = 0
real_news['FakeOrNot'] = 1
fake_news['FakeOrNot'] = 0

In [8]:
#combine both datasets to one dataset
News_Dataset = pd.concat([real_news, fake_news], ignore_index=True)
News_Dataset.head()

Unnamed: 0,title,text,subject,date,allText,FakeOrNot
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017","As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Trump wants Postal Service to charge 'much mor...,1


In [9]:
#drop columns we will not use
News_Dataset = News_Dataset.drop(['text', 'subject', 'date','title'], axis = 1)
News_Dataset

Unnamed: 0,allText,FakeOrNot
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1
...,...,...
11995,"John Oliver Started A Debt Collection Agency,...",0
11996,The Reason This Latino Democrat Is Voting For...,0
11997,Hispanic Rep Just Told Trump Where He Can Sho...,0
11998,NRA’s Response To The Florida Shooting Spree ...,0


In [10]:
News_Dataset.FakeOrNot.value_counts()

1    6000
0    6000
Name: FakeOrNot, dtype: int64

In [11]:
#check if null exist
News_Dataset[News_Dataset.isnull().any(axis=1)]

Unnamed: 0,allText,FakeOrNot


In [12]:
ps = PorterStemmer()
corpus = []

for i in range(0, len(News_Dataset)):
    review = re.sub('[^a-zA-Z]', ' ', News_Dataset['allText'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

## create the TF-IDF model

In [13]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
corpus = News_Dataset.allText
X = cv.fit_transform(corpus).toarray()

## save model

In [14]:
import joblib
joblib.dump(cv.vocabulary_,open("tfidf_features.pkl","wb"))

## Train data

In [15]:
X.shape

(12000, 5000)

In [16]:
y=News_Dataset['FakeOrNot']

In [25]:
## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
count_df = pd.DataFrame(X_train, columns=cv.get_feature_names())

count_df.head()



Unnamed: 0,000,10,10 years,100,11,12,13,14,15,16,...,you re,you think,you to,you ve,you want,young,your,yourself,youtube,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07163,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.039269,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.039177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152856,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.054661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Random forset model

In [26]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
RF=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
RF.fit(X_train,y_train)

RF_pred=RF.predict(X_test)


## Model evaluation


In [27]:
#classification report
print('classification report:')
print(classification_report(y_test, RF_pred))
#confusion matrix
print('confusion matrix:')
print(confusion_matrix(y_test, RF_pred))

RF_f1 = round(f1_score(y_test, RF_pred, average='weighted'), 3)
RF_accuracy = round((accuracy_score(y_test, RF_pred)*100),2)

print("Accuracy : " , RF_accuracy , " %")
print("f1_score : " , RF_f1)


classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1217
           1       1.00      1.00      1.00      1183

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400

confusion matrix:
[[1216    1]
 [   0 1183]]
Accuracy :  99.96  %
f1_score :  1.0


## save model

In [28]:
fname = 'RF_classifier.pkl'
joblib.dump(RF, fname)


['RF_classifier.pkl']

# test model

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import joblib
import re


# load models
model_RF = joblib.load('RF_classifier.pkl')
model_tfidf = joblib.load('tfidf_features.pkl')

#get data
temp = 'Just when you might have thought we d get a break from watching people kiss Donald Trump s ass and stroke his ego ad nauseam, a pro-Trump group creates an ad that s nothing but people doing even more of those exact things. America First Policies is set to release this ad, called  Thank You, President Trump,  on Christmas Day and, well, we threw up a little in our mouths trying to watch this.Basically, the spot is nothing but people fawning all over Trump for all the stuff he hasn t actually done. The ad includes a scene with a little girl thanking Trump for bringing back  Merry Christmas,  which never went away (there are even videos of President Obama saying  Merry Christmas  himself). A man thanks him for cutting his taxes. And America First says that everyday Americans everywhere are thanking Trump for being such a great and awesome president.The best president.Nobody s ever done what he s done. He s breaking all kinds of records every day.Believe us.Anyway, the word  propaganda  comes to mind when watching this. That s what it is   literal propaganda promoting someone who shouldn t need this kind of promotion anymore. Watch this ad bullshit below:The way the MAGAs are kowtowing to Orange Hitler is both disgusting and frightening. The man has done nothing, and his policies will harm the very same Americans who are thanking him. Unfortunately, it will take an obscene amount of pain before they ll open their eyes and see they ve been duped by a con man with a bad hairdo.And his ongoing need for this kind of adoration is, at best, unbecoming of his office. This ad is vile.Featured image via Al Drago-Pool/Getty Images'

#pre-procesing
def preprocessor(text):

    ps = PorterStemmer()
    corpus = []

    review = re.sub('[^a-zA-Z]', ' ', temp)
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    return corpus

#tfidf     
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,3),vocabulary=joblib.load(open("tfidf_features.pkl","rb")))
tfidf = vectorizer.fit_transform(preprocessor(temp))
#rf classifiation
model_RF.predict(tfidf)[0]

0