In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re
import joblib
import string


In [98]:
# Load the two source CSV files into separate frames, then preview the first.
fake, true = (pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv'))

fake.head(n=5)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [124]:
# Preview the first rows of the frame loaded from True.csv.
# NOTE(review): the recorded output of this cell already shows a `class`
# column, which this notebook only adds in a later cell -- this looks like
# out-of-order execution; confirm with Restart Kernel -> Run All.
true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [None]:
# Tag every row with its target label: 0 for the fake frame, 1 for the true
# frame. `assign` needs the dict form because `class` is a Python keyword.
fake = fake.assign(**{'class': 0})
true = true.assign(**{'class': 1})



In [None]:
# Stack the labelled frames row-wise (fake rows first, then true rows).
# Each frame keeps its own row labels, so labels repeat in the result.
data = pd.concat(objs=[fake, true])

In [127]:
# Peek at 10 random rows of the combined frame. The fixed seed (the same value
# the train/test split uses) makes this preview identical on every run.
data.sample(10, random_state=42)

Unnamed: 0,title,text,subject,date,class
2312,"Trump Just Got Some Really, REALLY Bad News A...",Donald Trump has been playing games with his t...,News,"March 3, 2017",0
6464,Cruz Campaign Blames Rubio For Loss: A Cruz/R...,Now that the side-show is essentially over and...,News,"May 9, 2016",0
3804,Trump did not ask former FBI Director Comey to...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"May 12, 2017",1
14121,"""The people have spoken,"" Zimbabwe's Mnangagwa...",HARARE (Reuters) - Zimbabwe s former vice pres...,worldnews,"November 22, 2017",1
22990,The Changing Face of Mainstream Media?,21st Century Wire says One of the biggest tren...,Middle-east,"February 17, 2017",0
20280,BIKERS FOR TRUMP: “Not Going To Put Up With” V...,Veterans are the backbone of the biker commun...,left-news,"Jul 14, 2016",0
2852,The Numbers Are In: Trump Is The Most Hated N...,It s been less than a week since Donald Trump ...,News,"January 25, 2017",0
18452,Pakistani activist targeted by blast vows to m...,"DERA ISMAIL KHAN, Pakistan (Reuters) - A Pakis...",worldnews,"October 3, 2017",1
8072,Kansas argues against boost in school funding,(Reuters) - Kansas sought on Wednesday to avoi...,politicsNews,"September 21, 2016",1
4583,Trump Is Pretending He Doesn’t Want To Sue Th...,"Donald Trump reverted to his usual grouchy, un...",News,"September 17, 2016",0


In [128]:
# List every column label of the combined frame.
print(data.columns)  # See all available columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')


In [129]:
# Keep only the article body and its label. Filtering against the frame's
# current columns first means re-running this cell never raises on a column
# that has already been removed.
columns_to_drop = ["title", "subject", "date"]
existing_cols = [name for name in columns_to_drop if name in data.columns]
data = data.drop(columns=existing_cols)


In [130]:
# Show which column labels the frame has at this point.
print(data.columns)


Index(['text', 'class'], dtype='object')


In [131]:
# Peek at 5 random rows; seeded (same value as the train/test split) so the
# preview is reproducible across runs.
data.sample(5, random_state=42)

Unnamed: 0,text,class
17018,"SOCHI, Russia (Reuters) - Russian President Vl...",1
21696,Megyn Kelly interviews The Blaze s Dana Loesch...,0
5538,GOP Senator Ben Sasse of Nebraska has been one...,0
5206,WASHINGTON (Reuters) - A bipartisan group of U...,1
14668,"ANKARA (Reuters) - Turkey, Russia and Iran wil...",1


In [132]:
# Replace the repeated row labels with a fresh 0..n-1 range. The previous
# labels are kept as a regular column named 'index'.
data = data.reset_index()




In [133]:
# Discard the leftover 'index' column holding the old row labels.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
data = data.drop(columns=['index'], errors='ignore')



In [134]:
# Peek at 5 random rows; seeded (same value as the train/test split) so the
# preview is reproducible across runs.
data.sample(5, random_state=42)

Unnamed: 0,text,class
18978,,0
30839,(Reuters) - Former Los Angeles Mayor and forme...,1
6584,Almost a decade after targeting the black comm...,0
36892,WASHINGTON (Reuters) - The U.S. State Departme...,1
25090,WASHINGTON (Reuters) - U.S. Health Secretary T...,1


In [135]:
def clean_text(text):
    """Normalize one article body before vectorization.

    Removes URLs (any run of non-whitespace characters starting with
    ``http``), deletes every character that is not an ASCII letter or
    whitespace, and lowercases what is left.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (for example ``None`` or the
        float NaN that pandas produces for an empty CSV cell) is treated as
        missing text.

    Returns
    -------
    str
        The cleaned, lowercased text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None; map missing text to an empty
        # document so a single bad row cannot abort the whole column.
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Characters are deleted, not replaced by a space, so tokens joined by
    # punctuation merge (e.g. "texas-based" -> "texasbased").
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation/numbers
    return text.lower()

In [136]:
# Run the cleaner over every article body, overwriting the raw text column.
data["text"] = data["text"].map(clean_text)


In [137]:
# First five cleaned articles, as a quick visual check of the cleaning step.
print(data["text"].head(n=5))
# Number of missing values in the text column after cleaning.
print(data["text"].isna().sum())


0    donald trump just couldn t wish all americans ...
1    house intelligence committee chairman devin nu...
2    on friday it was revealed that former milwauke...
3    on christmas day donald trump announced that h...
4    pope francis used his annual christmas day mes...
Name: text, dtype: object
0


In [139]:
# Peek at 5 random cleaned rows; seeded (same value as the train/test split)
# so the preview is reproducible across runs.
data.sample(5, random_state=42)

Unnamed: 0,text,class
35956,gaza reuters two palestinian islamic jihad mi...,1
43452,wellington reuters support for new zealand s ...,1
19449,politico has terminated its contract with maga...,0
24368,reuters goldman sachs group inc gsn chief exe...,1
32948,austin texas reuters a texasbased lgbt advoca...,1


In [140]:
# Features are the cleaned article bodies; targets are the 0/1 class labels.
x, y = data["text"], data["class"]
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)


In [141]:
# First five training texts ...
print(xtrain.head(n=5))
# ... and their labels (0 = fake, 1 = real); both share the same row labels.
print(ytrain.head(n=5))


34830    seoul reuters  south korea predicted on tuesda...
6018     beebe arkansas mayor mike robertson loves jesu...
42549    new york reuters  eighty percent of the power ...
8670     when the black lives matter movement started g...
27243    washington reuters  the trump administrations ...
Name: text, dtype: object
34830    1
6018     0
42549    1
8670     0
27243    1
Name: class, dtype: int64


In [142]:
# TF-IDF vectorization. The vocabulary and weights are learned from the
# training texts only; the held-out texts are then projected with that same
# fitted vectorizer.
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(raw_documents=xtrain)
xv_test = vectorizer.transform(raw_documents=xtest)


In [143]:
# Fit a logistic-regression classifier with default hyperparameters on the
# TF-IDF training matrix. `fit` returns the estimator itself, so the trailing
# bare name displays the same fitted-model summary.
lr = LogisticRegression().fit(xv_train, ytrain)
lr


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [144]:
# Predicted labels for the held-out texts (reused by the report cell).
prediction = lr.predict(X=xv_test)
# Mean accuracy on the held-out set, shown as the cell output.
# NOTE(review): the sampled class-1 texts shown earlier begin with a
# "(Reuters)" dateline, so a very high score may partly reflect source
# markers rather than content -- worth verifying.
lr.score(X=xv_test, y=ytest)


0.9898440979955456

In [145]:
# Per-class precision, recall and F1 on the held-out set.
# `classification_report` is already imported in the notebook's import cell,
# so it is not re-imported here.
print(classification_report(ytest, prediction))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5895
           1       0.99      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [146]:
# Persist the fitted vectorizer and classifier together: the model is only
# usable with the exact vocabulary it was trained on. `joblib` is already
# imported in the notebook's import cell.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(vectorizer, "vectorizer.joblib")
joblib.dump(lr, "lr_model.joblib")


['lr_model.joblib']