# Fake News Detection

In [77]:
import pandas as pd

In [78]:
!pip install spacy



In [107]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
     ------------------------------------ 587.7/587.7 MB 515.7 kB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.6.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [79]:
df_fake=pd.read_csv(r'C:\Users\soumi\Personal_projects\NLP\Fake_news_detection\archive\Fake.csv')

In [80]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [81]:
df_true=pd.read_csv(r'C:\Users\soumi\Personal_projects\NLP\Fake_news_detection\archive\True.csv')

In [82]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [83]:
df_fake.shape, df_true.shape

((23481, 4), (21417, 4))

In [84]:
df_true['class']=1
df_fake['class']=0

In [85]:
df_true.head(2)

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1


In [86]:
# join the two datasets
df=pd.concat([df_true,df_fake])
df.head(5)

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [87]:
df=df.drop(['subject','date','title'],axis=1)
df.head()

Unnamed: 0,text,class
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [88]:
# check for data imbalance
df['class'].value_counts()

0    23481
1    21417
Name: class, dtype: int64

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
x=df['text']
y=df['class']

In [91]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2022)

In [92]:
x_train.shape, x_test.shape

((35918,), (8980,))

In [93]:
# create a classification pipeline using CountVectorizer and kNN classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [94]:
#  CountVectorizer with unigram
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (1, 1))),
    ('knn', KNeighborsClassifier(n_neighbors=10,metric='euclidean'))
])

In [95]:
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)

In [96]:
# get the report
from sklearn.metrics import classification_report
print('Report without text pre-processing and CountVectorizer with unigram:','\n',classification_report(y_test,y_pred))

Report without text pre-processing and CountVectorizer with unigram: 
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4774
           1       0.87      0.89      0.88      4206

    accuracy                           0.89      8980
   macro avg       0.89      0.89      0.89      8980
weighted avg       0.89      0.89      0.89      8980



In [97]:
#  CountVectorizer with bigram
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (2, 2))),
    ('knn', KNeighborsClassifier(n_neighbors=10,metric='euclidean'))
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print('Report without text pre-processing and CountVectorizer with bigram:','\n',classification_report(y_test,y_pred))

Report without text pre-processing and CountVectorizer with bigram: 
               precision    recall  f1-score   support

           0       0.55      0.99      0.71      4774
           1       0.89      0.09      0.16      4206

    accuracy                           0.57      8980
   macro avg       0.72      0.54      0.44      8980
weighted avg       0.71      0.57      0.45      8980



In [98]:
#  CountVectorizer with trigram
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (3, 3))),
    ('knn', KNeighborsClassifier(n_neighbors=10,metric='euclidean'))
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print('Report without text pre-processing and CountVectorizer with trigram:','\n',classification_report(y_test,y_pred))

Report without text pre-processing and CountVectorizer with trigram: 
               precision    recall  f1-score   support

           0       0.53      1.00      0.69      4774
           1       0.76      0.00      0.01      4206

    accuracy                           0.53      8980
   macro avg       0.65      0.50      0.35      8980
weighted avg       0.64      0.53      0.37      8980



In [99]:
# Countvectorizer with using CountVectorizer with unigram, bigram. use KNN as the classifier with n_neighbors of 10 and metric as 'cosine' distance.
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (1, 2))),
    ('knn', KNeighborsClassifier(n_neighbors=10,metric='cosine'))
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print("Report without text pre-processing and CountVectorizer with unigram, bigram and KNN with 'cosine' distance :",'\n',classification_report(y_test,y_pred))

Report without text pre-processing and CountVectorizer with unigram, bigram and KNN with 'cosine' distance : 
               precision    recall  f1-score   support

           0       0.73      0.97      0.83      4774
           1       0.94      0.60      0.73      4206

    accuracy                           0.79      8980
   macro avg       0.83      0.78      0.78      8980
weighted avg       0.83      0.79      0.78      8980



In [100]:
# CountVectorizer with only trigrams and using RandomForest as the classifier.
from sklearn.ensemble import RandomForestClassifier
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (3, 3))),
    ('rf',RandomForestClassifier())
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print("Report without text pre-processing, CountVectorizer with trigram and Random forest classifier :",'\n',classification_report(y_test,y_pred))

Report without text pre-processing, CountVectorizer with trigram and Random forest classifier : 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      4774
           1       0.97      0.97      0.97      4206

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [104]:
# CountVectorizer with both unigram and bigrams.Multinomial Naive Bayes as the classifier with an alpha value of 0.75.
from sklearn.naive_bayes import MultinomialNB
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (1, 2))),
    ('nb', MultinomialNB())
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print("Report without text pre-processing, CountVectorizer with both unigram and bigrams and naive bayes classifier :",'\n',classification_report(y_test,y_pred))

Report without text pre-processing, CountVectorizer with both unigram and bigrams and naive bayes classifier : 
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      4774
           1       0.97      0.97      0.97      4206

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [108]:
# Remove strop words, punctuation
import spacy
nlp=spacy.load('en_core_web_lg')
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [110]:
df['cleaned_text']=df['text'].apply(preprocess) 

In [111]:
df.head()

Unnamed: 0,text,class,cleaned_text
0,WASHINGTON (Reuters) - The head of a conservat...,1,WASHINGTON Reuters head conservative republica...
1,WASHINGTON (Reuters) - Transgender people will...,1,WASHINGTON Reuters Transgender people allow ti...
2,WASHINGTON (Reuters) - The special counsel inv...,1,WASHINGTON Reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1,WASHINGTON Reuters trump campaign adviser Geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1,SEATTLE WASHINGTON Reuters President Donald Tr...


In [114]:
x_train,x_test,y_train,y_test=train_test_split(df['cleaned_text'],df['class'],test_size=0.2,random_state=2022)

In [115]:
# CountVectorizer with only trigrams and using RandomForest as the classifier.
Model=Pipeline([
    ('v',CountVectorizer(ngram_range = (3, 3))),
    ('rf',RandomForestClassifier())
])
Model.fit(x_train,y_train)
y_pred=Model.predict(x_test)
print("Report with text pre-processing, CountVectorizer with trigram and Random forest classifier :",'\n',classification_report(y_test,y_pred))

Report with text pre-processing, CountVectorizer with trigram and Random forest classifier : 
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      4774
           1       0.97      0.92      0.94      4206

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



In [116]:
# print the confusion matrix
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(y_test, y_pred)
c_m

array([[4668,  106],
       [ 350, 3856]], dtype=int64)