In [35]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df_fake = pd.read_csv(r"C:\ML Projects\Fake News Detection\Fake.csv")
print(df_fake.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [3]:
df_true = pd.read_csv(r"C:\ML Projects\Fake News Detection\True.csv")
print(df_true.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   


In [4]:
df_fake["target"] = 0
df_true["target"] = 1

In [5]:
df_merged = pd.concat([df_fake,df_true],ignore_index=True)
print(df_merged)

                                                   title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
44893  'Fully committed' NATO backs new U.S. approach...   
44894  LexisNexis withdrew two products from Chinese ...   
44895  Minsk cultural hub becomes haven from authorities   
44896  Vatican upbeat on possibility of Pope Francis ...   
44897  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
0      Donald Trump just couldn t wish all Americans ...       News   
1      House Intelligence Committee Chairman Devin Nu...       News   
2      On Friday, it was revealed that former Milwauk...       New

In [7]:
#merging the title and text coloumn to a single coloumn "content"
df_merged["content"] = df_merged["title"] + " " + df_merged["text"] 

In [8]:
# suffling the rows so that not all fake and real are at one time
shuffled_df = df_merged.sample(frac=1).reset_index(drop=True)
shuffled_df

Unnamed: 0,title,text,subject,date,target,content
0,Powerful Koch brothers rebuff big donors' call...,"COLORADO SPRINGS, Colo. (Reuters) - The billio...",politicsNews,"August 1, 2016",1,Powerful Koch brothers rebuff big donors' call...
1,Trump Gave A Green Light To Sexist ‘Lewinsky’...,Donald Trump accused Hillary Clinton of playin...,News,"April 29, 2016",0,Trump Gave A Green Light To Sexist ‘Lewinsky’...
2,"FBI FINALLY Does Its Job, Starts Monitoring O...",Some of America s pretend patriots have taken ...,News,"January 4, 2016",0,"FBI FINALLY Does Its Job, Starts Monitoring O..."
3,WOW! Christian Author Gives UNEXPECTED And BRI...,Brigitte Gabriel was born in the Marjeyoun Dis...,left-news,"Jan 13, 2017",0,WOW! Christian Author Gives UNEXPECTED And BRI...
4,Michael Jordan ‘Can No Longer Stay Silent’ On...,Michael Jordan is quite possibly the best bask...,News,"July 25, 2016",0,Michael Jordan ‘Can No Longer Stay Silent’ On...
...,...,...,...,...,...,...
44893,Trump's son says 'happy' to work with Senate i...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"July 10, 2017",1,Trump's son says 'happy' to work with Senate i...
44894,Trump declines to say if he will visit Korean ...,WASHINGTON (Reuters) - U.S. President Donald T...,worldnews,"October 25, 2017",1,Trump declines to say if he will visit Korean ...
44895,OH THE IRONY! NANCY PELOSI: “Trump’s family sh...,"Appearing on MSNBC s Morning Joe, House Mino...",politics,"Jun 9, 2017",0,OH THE IRONY! NANCY PELOSI: “Trump’s family sh...
44896,It’s Not Over Yet: Jill Stein Files Federal L...,Green Party presidential nominee Jill Stein ha...,News,"December 5, 2016",0,It’s Not Over Yet: Jill Stein Files Federal L...


In [10]:
# data prepocessing (cleaning) to input coloumn (content)
# 1. converting to lower case
shuffled_df["content"] = shuffled_df["content"].str.lower()
shuffled_df["content"]

0        powerful koch brothers rebuff big donors' call...
1         trump gave a green light to sexist ‘lewinsky’...
2         fbi finally does its job, starts monitoring o...
3        wow! christian author gives unexpected and bri...
4         michael jordan ‘can no longer stay silent’ on...
                               ...                        
44893    trump's son says 'happy' to work with senate i...
44894    trump declines to say if he will visit korean ...
44895    oh the irony! nancy pelosi: “trump’s family sh...
44896     it’s not over yet: jill stein files federal l...
44897    “deal with facts!”: tom brokaw calls out polit...
Name: content, Length: 44898, dtype: object

In [16]:
# 2. remove punctuation, digits and special characters
shuffled_df["content"] = shuffled_df["content"].apply(lambda x:re.sub(r'[^a-zA-Z\s]'," ",str(x)))
shuffled_df["content"]

0        powerful koch brothers rebuff big donors  call...
1         trump gave a green light to sexist  lewinsky ...
2         fbi finally does its job  starts monitoring o...
3        wow  christian author gives unexpected and bri...
4         michael jordan  can no longer stay silent  on...
                               ...                        
44893    trump s son says  happy  to work with senate i...
44894    trump declines to say if he will visit korean ...
44895    oh the irony  nancy pelosi   trump s family sh...
44896     it s not over yet  jill stein files federal l...
44897     deal with facts    tom brokaw calls out polit...
Name: content, Length: 44898, dtype: object

In [21]:
# 3. remove stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
shuffled_df["content"] = shuffled_df['content'].apply(lambda x: " ".join([word for word in str(x).split() if word.lower() not in stop_words]))
shuffled_df["content"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        powerful koch brothers rebuff big donors calls...
1        trump gave green light sexist lewinsky attack ...
2        fbi finally job starts monitoring oregon milit...
3        wow christian author gives unexpected brillian...
4        michael jordan longer stay silent police shoot...
                               ...                        
44893    trump son says happy work senate intelligence ...
44894    trump declines say visit korean dmz asia trip ...
44895    oh irony nancy pelosi trump family concerned h...
44896    yet jill stein files federal lawsuit pennsylva...
44897    deal facts tom brokaw calls political hack and...
Name: content, Length: 44898, dtype: object

In [24]:
# 4.remove extra white spaces
shuffled_df["content"] = shuffled_df["content"].apply(lambda x: " ".join(str(x).split()))
shuffled_df["content"]

0        powerful koch brothers rebuff big donors calls...
1        trump gave green light sexist lewinsky attack ...
2        fbi finally job starts monitoring oregon milit...
3        wow christian author gives unexpected brillian...
4        michael jordan longer stay silent police shoot...
                               ...                        
44893    trump son says happy work senate intelligence ...
44894    trump declines say visit korean dmz asia trip ...
44895    oh irony nancy pelosi trump family concerned h...
44896    yet jill stein files federal lawsuit pennsylva...
44897    deal facts tom brokaw calls political hack and...
Name: content, Length: 44898, dtype: object

In [32]:
# feature extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(shuffled_df["content"])

In [38]:
y = shuffled_df.iloc[:,-2]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("Accuracy:",accuracy_score(y_pred,y_test))

Accuracy: 0.995879732739421


In [36]:
print(confusion_matrix(y_pred,y_test))

[[4672   21]
 [  16 4271]]


In [37]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4693
           1       1.00      1.00      1.00      4287

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

