In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
real_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv')
real_data.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
fake_data.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
real_data['Real or Fake'] = 'real'

In [5]:
fake_data['Real or Fake'] = 'fake'

In [6]:
real_data.drop(columns=['title', 'subject', 'date'])

Unnamed: 0,text,Real or Fake
0,WASHINGTON (Reuters) - The head of a conservat...,real
1,WASHINGTON (Reuters) - Transgender people will...,real
2,WASHINGTON (Reuters) - The special counsel inv...,real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,real
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,real
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,real
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",real
21414,MINSK (Reuters) - In the shadow of disused Sov...,real
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,real


In [7]:
fake_data.drop(columns=['title', 'subject', 'date'])

Unnamed: 0,text,Real or Fake
0,Donald Trump just couldn t wish all Americans ...,fake
1,House Intelligence Committee Chairman Devin Nu...,fake
2,"On Friday, it was revealed that former Milwauk...",fake
3,"On Christmas day, Donald Trump announced that ...",fake
4,Pope Francis used his annual Christmas Day mes...,fake
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,fake
23477,21st Century Wire says It s a familiar theme. ...,fake
23478,Patrick Henningsen 21st Century WireRemember ...,fake
23479,21st Century Wire says Al Jazeera America will...,fake


In [8]:
real_data.shape

(21417, 5)

In [9]:
fake_data.shape

(23481, 5)

In [10]:
data = pd.concat([real_data, fake_data])
data.head()

Unnamed: 0,title,text,subject,date,Real or Fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",real


In [11]:
data.shape

(44898, 5)

In [12]:
data['Real or Fake'].value_counts()

fake    23481
real    21417
Name: Real or Fake, dtype: int64

In [13]:
data.drop(columns=['title', 'subject', 'date'])

Unnamed: 0,text,Real or Fake
0,WASHINGTON (Reuters) - The head of a conservat...,real
1,WASHINGTON (Reuters) - Transgender people will...,real
2,WASHINGTON (Reuters) - The special counsel inv...,real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,real
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,real
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,fake
23477,21st Century Wire says It s a familiar theme. ...,fake
23478,Patrick Henningsen 21st Century WireRemember ...,fake
23479,21st Century Wire says Al Jazeera America will...,fake


In [14]:
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords_set = set(stopwords)
punctuation_set = set(string.punctuation)

In [15]:
data['text_cleaned'] = data.text.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_set \
                                                          and word not in punctuation_set]))

In [16]:
data.head()

Unnamed: 0,title,text,subject,date,Real or Fake,text_cleaned
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",real,WASHINGTON (Reuters) The head conservative Rep...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",real,WASHINGTON (Reuters) Transgender people allowe...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",real,WASHINGTON (Reuters) The special counsel inves...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",real,WASHINGTON (Reuters) Trump campaign adviser Ge...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",real,SEATTLE/WASHINGTON (Reuters) President Donald ...


In [17]:
data = data[['text', 'Real or Fake', 'text_cleaned']]

In [18]:
data.head()

Unnamed: 0,text,Real or Fake,text_cleaned
0,WASHINGTON (Reuters) - The head of a conservat...,real,WASHINGTON (Reuters) The head conservative Rep...
1,WASHINGTON (Reuters) - Transgender people will...,real,WASHINGTON (Reuters) Transgender people allowe...
2,WASHINGTON (Reuters) - The special counsel inv...,real,WASHINGTON (Reuters) The special counsel inves...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,real,WASHINGTON (Reuters) Trump campaign adviser Ge...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,real,SEATTLE/WASHINGTON (Reuters) President Donald ...


In [19]:
data['text_cleaned'] = data['text_cleaned'].str.lower()

In [20]:
count_vect = CountVectorizer()

In [21]:
X = count_vect.fit_transform(data.text_cleaned)

In [22]:
X.shape

(44898, 122000)

In [23]:
# each word will be a column for X

In [24]:
y=data['Real or Fake']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [26]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)
lg.score(X_test, y_test)



0.9966146993318485

In [27]:
confusion_matrix(y_test, y_pred)

array([[5857,   19],
       [  19, 5330]], dtype=int64)

#### Tfidfvectorizer algorithm 

In [28]:
tfidf = TfidfVectorizer()

In [29]:
X = tfidf.fit_transform(data.text_cleaned)
y = data['Real or Fake']
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [30]:
# Random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf.score(X_test, y_test)



0.960890868596882

In [31]:
confusion_matrix(y_test, y_pred)

array([[5791,  115],
       [ 324, 4995]], dtype=int64)

##### When I try to use Gradient Boosting Classifier my PC just freeze and don't want to run the code.

#### So I use only Logistic Regression and Random Forest 

Gradient Boost

gb=GradientBoostingClassifier()

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

gb.score(X_test, y_test)

confusion_matrix(y_test, y_pred)

## CONCLUSION

##### We can see from above that Logistic Regression has accuracy of 0.9966, slightly bigger than that from RandomForestClassifier
##### with 0.9608. Both are good for prediction.

The result of confusion_matrix(y_test, y_pred) shows us that:
    
    array([[5857,   19],
       [  19, 5330]], dtype=int64)
       
       
  Likely Real news are 5330, Likely Fake News are 5857 . 
  
  Predicted Fake news as Real are 19, predicted Real news as fake are 19.
  