In [None]:
import pandas as pd
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [None]:
# The preprocessing below only reads stop-word/punctuation flags and lemmas,
# so the dependency parser and named-entity recogniser are switched off to
# make processing the corpus faster. Re-enable them if later code needs
# sentence boundaries, dependencies or entities.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def preprocess(text):
  """Return `text` reduced to the space-joined lemmas of its content tokens.

  The text is run through the module-level spaCy pipeline `nlp`; tokens
  flagged as stop words or punctuation are dropped and every remaining token
  is replaced by its lemma.

  Args:
    text: Raw document string. Any non-string value (for example ``None`` or
      the float ``NaN`` pandas uses for empty cells) is treated as an empty
      document.

  Returns:
    One string with the kept lemmas separated by single spaces; ``""`` when
    the input is missing.
  """
  # Missing cells arrive as None/NaN, which the spaCy pipeline rejects; map
  # them to an empty document instead of failing in the middle of `apply`.
  if not isinstance(text, str):
    return ""
  return " ".join(
      token.lemma_
      for token in nlp(text)
      if not (token.is_stop or token.is_punct)
  )

In [None]:
# Load the labelled news dataset from CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/Fake_Real_Data.csv')
df.head(n=5)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


# **Build a model with preprocessed text**

---



In [None]:
# Encode the string labels as integers for the classifiers (Fake -> 0, Real -> 1).
df['label_num'] = df['label'].map({'Fake': 0, 'Real': 1})
# `Series.map` turns any label that is missing from the dict into NaN without
# complaint; fail here, next to the cause, rather than later in model fitting.
assert df['label_num'].notna().all(), "unexpected value in 'label' column"
print(df.shape)
df.head()

(9900, 3)


Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [None]:
# Run every article through `preprocess` and keep the result as a new column.
df['processed_text'] = df['Text'].map(preprocess)

In [None]:
# Preview the frame, now including the `processed_text` column.
df.head(n=5)

Unnamed: 0,Text,label,label_num,processed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,Trump Surrogate BRUTALLY Stabs Pathetic vide...
1,U.S. conservative leader optimistic of common ...,Real,1,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,Court Forces Ohio allow million illegally pu...
4,Democrats say Trump agrees to work on immigrat...,Real,1,Democrats Trump agree work immigration bill wa...


In [None]:
# Naive Bayes on bag-of-n-gram counts: trigram-only features versus
# unigram + bigram + trigram features.
# The split is seeded so the printed metrics can be reproduced on re-run, and
# stratified so both splits keep the Fake/Real ratio of the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_num'],
)

# Trigrams only.
clf_tri = Pipeline([
    ('Count Vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('Multinomial NB', MultinomialNB()),
])

# Unigrams, bigrams and trigrams together.
clf_uni_bi_tri = Pipeline([
    ('Count Vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('Multinomial NB', MultinomialNB()),
])

clf_tri.fit(x_train, y_train)
clf_uni_bi_tri.fit(x_train, y_train)

y1_pred = clf_tri.predict(x_test)
y2_pred = clf_uni_bi_tri.predict(x_test)

print('for trigrams:')
print(classification_report(y_test, y1_pred))
print('for unigram, Bigram, and trigrams:')
print(classification_report(y_test, y2_pred))

for trigrams:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       970
           1       0.97      0.99      0.98      1010

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

for unigram, Bigram, and trigrams:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       970
           1       0.99      0.98      0.99      1010

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980



In [None]:
# Random forest on bag-of-n-gram counts: trigram-only features versus
# unigram + bigram + trigram features.
# NOTE: this cell rebinds x_train/x_test/y_train/y_test, clf_tri and
# clf_uni_bi_tri, replacing the Naive Bayes objects created earlier.
# Both the split and the forests are seeded so the printed metrics can be
# reproduced on re-run; the split is stratified to keep the Fake/Real ratio.
x_train, x_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_num'],
)

# Trigrams only.
clf_tri = Pipeline([
    ('Count Vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('Random Forest', RandomForestClassifier(random_state=42)),
])

# Unigrams, bigrams and trigrams together.
clf_uni_bi_tri = Pipeline([
    ('Count Vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('Random Forest', RandomForestClassifier(random_state=42)),
])

clf_tri.fit(x_train, y_train)
clf_uni_bi_tri.fit(x_train, y_train)

y3_pred = clf_tri.predict(x_test)
y4_pred = clf_uni_bi_tri.predict(x_test)

print('for trigrams:')
print(classification_report(y_test, y3_pred))
print('for unigram, Bigram, and trigrams:')
print(classification_report(y_test, y4_pred))

for trigrams:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96       974
           1       0.99      0.92      0.95      1006

    accuracy                           0.96      1980
   macro avg       0.96      0.96      0.96      1980
weighted avg       0.96      0.96      0.96      1980

for unigram, Bigram, and trigrams:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       974
           1       1.00      0.99      1.00      1006

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

