# Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import string

# Prepare data

In [2]:
fake_data = pd.read_csv('Fake.csv')
true_data = pd.read_csv('True.csv')

In [3]:
fake_data.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


In [4]:
true_data.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


In [5]:
# fake data - 0, true_data - 1
fake_data['is_fake'] = 0
true_data['is_fake'] = 1

In [6]:
fake_data.shape, true_data.shape

((23481, 5), (21417, 5))

In [7]:
#manual tasting 10 rows
manual_testing_fake = fake_data.tail(10)
fake_data = fake_data.iloc[0:fake_data.shape[0]-10].copy()

manual_testing_true = true_data.tail(10)
true_data = true_data.iloc[0:true_data.shape[0]-10].copy()

In [8]:
fake_data.shape, true_data.shape

((23471, 5), (21407, 5))

In [9]:
manual_testing_fake

Unnamed: 0,title,text,subject,date,is_fake
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0


In [10]:
full_data = pd.concat([fake_data, true_data])

In [11]:
full_data

Unnamed: 0,title,text,subject,date,is_fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21402,Exclusive: Trump's Afghan decision may increas...,ON BOARD A U.S. MILITARY AIRCRAFT (Reuters) - ...,worldnews,"August 22, 2017",1
21403,U.S. puts more pressure on Pakistan to help wi...,WASHINGTON (Reuters) - The United States sugge...,worldnews,"August 21, 2017",1
21404,Exclusive: U.S. to withhold up to $290 million...,WASHINGTON (Reuters) - The United States has d...,worldnews,"August 22, 2017",1
21405,Trump talks tough on Pakistan's 'terrorist' ha...,ISLAMABAD (Reuters) - Outlining a new strategy...,worldnews,"August 22, 2017",1


In [12]:
data = full_data.drop(['title', 'subject', 'date'], axis = 1)

In [13]:
data.isnull().sum()

text       0
is_fake    0
dtype: int64

In [14]:
#shuffle dataset
data = data.sample(frac = 1)

In [15]:
data.reset_index(inplace=True, drop=True)

In [16]:
data

Unnamed: 0,text,is_fake
0,"Whenever Hillary cackles, you can be pretty su...",0
1,"After the Paris Accord, the United States unde...",0
2,DUBLIN (Reuters) - As the public face of the I...,1
3,BEIJING (Reuters) - Cultural exchanges between...,1
4,MSNBC anchor Joe Scarborough was caught on a h...,0
...,...,...
44873,NEW YORK/WASHINGTON (Reuters) - Longtime Donal...,1
44874,You won t believe who was spotted in the speci...,0
44875,SANTIAGO (Reuters) - Chile s stock market has ...,1
44876,I ve been a lifelong Republican all of my lif...,0


In [17]:
def clear(text):
    
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

In [18]:
data['text'] = data['text'].apply(clear)

In [19]:
x = data['text']
y = data['is_fake']

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

# TF-IDF vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorization = TfidfVectorizer()

In [23]:
xvect_train = vectorization.fit_transform(x_train)
xvect_test = vectorization.transform(x_test)

# Option 1. Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()

LR.fit(xvect_train, y_train)

In [25]:
LR.predict(xvect_train)

array([1, 1, 1, ..., 1, 0, 1])

In [26]:
LR.predict(xvect_test)

array([0, 0, 0, ..., 1, 0, 1])

In [27]:
print('accuracy -', LR.score(xvect_test, y_test))

accuracy - 0.9867424242424242


In [28]:
from sklearn.metrics import classification_report

In [29]:
print(classification_report(y_test, LR.predict(xvect_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4698
           1       0.98      0.99      0.99      4278

    accuracy                           0.99      8976
   macro avg       0.99      0.99      0.99      8976
weighted avg       0.99      0.99      0.99      8976



# Option 2. Gradient Boosting

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier(n_estimators=10, random_state=7, verbose=1)
GB.fit(xvect_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.1982           40.43s
         2           1.0459           33.58s
         3           0.9191           28.16s
         4           0.8119           23.51s
         5           0.7201           19.22s
         6           0.6415           15.13s
         7           0.5729           11.24s
         8           0.5132            7.67s
         9           0.4609            4.40s
        10           0.4148            0.00s


In [31]:
GB.predict(xvect_test)

array([0, 0, 0, ..., 1, 0, 1])

In [32]:
print('accuracy -', GB.score(xvect_test, y_test))

accuracy - 0.9942067736185384


In [33]:
print(classification_report(y_test, GB.predict(xvect_test)))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4698
           1       0.99      1.00      0.99      4278

    accuracy                           0.99      8976
   macro avg       0.99      0.99      0.99      8976
weighted avg       0.99      0.99      0.99      8976



# Option 3. Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=10)

RF.fit(xvect_train, y_train)

In [35]:
RF.predict(xvect_test)

array([0, 0, 0, ..., 1, 0, 1])

In [36]:
print('accuracy -', RF.score(xvect_test, y_test))

accuracy - 0.9587789661319073


In [37]:
print(classification_report(y_test, RF.predict(xvect_test)))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      4698
           1       0.97      0.94      0.96      4278

    accuracy                           0.96      8976
   macro avg       0.96      0.96      0.96      8976
weighted avg       0.96      0.96      0.96      8976



# Manual testing

In [38]:
def isfake_label(n):
    if n ==0:
        return "Fake News"
    else:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(clear)
    new_x_test = new_def_test['text']
    new_xvect_test = vectorization.transform(new_x_test)
    
    print(f"LR Prediction - {isfake_label(LR.predict(new_xvect_test))}") 
    print(f"GB Prediction - {isfake_label(GB.predict(new_xvect_test))}") 
    print(f"RF Prediction - {isfake_label(RF.predict(new_xvect_test))}") 
    

In [39]:
manual_testing_fake.head(1)

Unnamed: 0,title,text,subject,date,is_fake
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0


In [40]:
manual_testing_true.head(1)

Unnamed: 0,title,text,subject,date,is_fake
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1


# Manual testing Fake News

In [41]:
text_ = manual_testing_fake.head(1)['text'].values[0]

In [42]:
text_

'21st Century Wire says This week, the historic international Iranian Nuclear Deal was punctuated by a two-way prisoner swap between Washington and Tehran, but it didn t end quite the way everyone expected. On the Iranian side, one of the U.S. citizens who was detained in Iran, Nosratollah Khosravi-Roodsari, has stayed in Iran, but on the U.S. side   all 7 of the Iranians held in U.S. prisons DID NOT show up to their flight to Geneva for the prisoner exchange   with at least 3 electing to stay in the U.S  TEHRAN SIDE: In Iran, 5 U.S. prisoners were released, with 4 of them making their way to Germany via Switzerland.Will Robinson Daily MailNone of the Iranians freed in the prisoner swap have returned home and could still be in the United States, it has been reported.The seven former inmates, who were released as part of a deal with the Islamic republic, did not show up to get a flight to Geneva, Switzerland, where the exchange was set to take place on Sunday.Three of the Iranians have 

In [43]:
manual_testing(text_)

LR Prediction - Fake News
GB Prediction - Fake News
RF Prediction - Fake News


# Manual testing Not a Fake News

In [44]:
text_ = manual_testing_true.head(1)['text'].values[0]

In [45]:
text_

'SAO PAULO (Reuters) - Cesar Mata Pires, the owner and co-founder of Brazilian engineering conglomerate OAS SA, one of the largest companies involved in Brazil s corruption scandal, died on Tuesday. He was 68. Mata Pires died of a heart attack while taking a morning walk in an upscale district of S o Paulo, where OAS is based, a person with direct knowledge of the matter said. Efforts to contact his family were unsuccessful. OAS declined to comment. The son of a wealthy cattle rancher in the northeastern state of Bahia, Mata Pires  links to politicians were central to the expansion of OAS, which became Brazil s No. 4 builder earlier this decade, people familiar with his career told Reuters last year. His big break came when he befriended Antonio Carlos Magalh es, a popular politician who was Bahia governor several times, and eventually married his daughter Tereza. Brazilians joked that OAS stood for  Obras Arranjadas pelo Sogro  - or  Work Arranged by the Father-In-Law.   After years o

In [46]:
manual_testing(text_)

LR Prediction - Not A Fake News
GB Prediction - Not A Fake News
RF Prediction - Not A Fake News
