In [9]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(ngram_range = (1,3))
v.fit(['Thor Hathodawala is looking for a job'])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [10]:
corpus = ['Thor ate pizza','Loki is tall','Loki is eating pizza']

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(text)
    filtered_tokens=[]
    for token in doc: 
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return ' '.join(filtered_tokens)
preprocess('Thor ate pizza')

'Thor eat pizza'

In [14]:
corpus_process = [preprocess(sent) for sent in corpus]
corpus_process

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [15]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_process)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [19]:
v.transform(['Thor eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [22]:
import pandas as pd
df = pd.read_json('./Data/new_dataset.json')
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [24]:
df['category'].value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [25]:
min_samples = 1381
df_business = df[df.category=='BUSINESS'].sample(min_samples,random_state=2022)
df_sports = df[df.category=='SPORTS'].sample(min_samples,random_state=2022)
df_crime = df[df.category=='CRIME'].sample(min_samples,random_state=2022)
df_science =df[df.category=='SCIENCE']

In [26]:
df_balance = pd.concat([df_business,df_sports,df_crime,df_science],axis = 0)
df_balance.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [28]:
target = {'BUSINESS':0,'SPORTS':1,'CRIME':2,"SCIENCE":3}
df_balance['category_num'] = df_balance.category.map({'BUSINESS':0,'SPORTS':1,'CRIME':2,"SCIENCE":3})

In [30]:
df_balance.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [42]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df_balance.text,df_balance.category_num,test_size  =.2,random_state=2022,
                                                 stratify=df_balance.category_num)

In [43]:
y_test.value_counts()

1    277
0    276
3    276
2    276
Name: category_num, dtype: int64

In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('Multi NB',MultinomialNB())
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       276
           1       0.92      0.81      0.86       277
           2       0.82      0.90      0.86       276
           3       0.92      0.82      0.87       276

    accuracy                           0.85      1105
   macro avg       0.86      0.85      0.85      1105
weighted avg       0.86      0.85      0.85      1105



In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range = (1,2))),
    ('Multi NB',MultinomialNB())
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79       276
           1       0.93      0.75      0.83       277
           2       0.81      0.88      0.84       276
           3       0.94      0.76      0.84       276

    accuracy                           0.82      1105
   macro avg       0.84      0.82      0.83      1105
weighted avg       0.84      0.82      0.83      1105



In [48]:
df_balance['preprocessed_text'] = df_balance.text.apply(preprocess)

In [49]:
df_balance.sample(4)

Unnamed: 0,text,category,category_num,preprocessed_text
4844,Louisiana Theater Shooter Identified As John R...,CRIME,2,Louisiana Theater Shooter identify John Russel...
6060,Records Show Numerous Complaints Against Offic...,CRIME,2,Records Numerous Complaints Officer stage suic...
10811,Will Amelia Earhart Help Unlock Secrets of Cli...,SCIENCE,3,Amelia Earhart help Unlock secret Climate Chan...
7688,Super Bowl Ad Uses Footage Of 9/11 To Sell His...,SPORTS,1,Super Bowl ad use footage 9/11 sell Historical...


In [50]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df_balance.preprocessed_text,df_balance.category_num,test_size  =.2,random_state=2022,
                                                 stratify=df_balance.category_num)

In [55]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range = (1,1))),
    ('Multi NB',MultinomialNB())
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       276
           1       0.93      0.82      0.87       277
           2       0.82      0.91      0.86       276
           3       0.91      0.86      0.89       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105



# Exercise

In [None]:
import pandas as pd

In [59]:
df = pd.read_csv('./Data/Fake_Real_Data.csv')
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [60]:
df.label.value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [61]:
df['label_num'] = df['label'].map({'Fake':0,'Real':1})

In [67]:
X_train,X_test,y_train,y_test = train_test_split(df.Text,df.label_num,test_size = .2,random_state=0)

In [68]:
y_train.value_counts()

0    3983
1    3937
Name: label_num, dtype: int64

In [80]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [81]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('Vectorize',CountVectorizer(ngram_range=(1,3))),
    ('KNN',RandomForestClassifier())
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1017
           1       1.00      0.99      0.99       963

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [83]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('Vectorize',CountVectorizer(ngram_range=(1,3))),
    ('NB',MultinomialNB(alpha=.75))
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1017
           1       0.98      0.99      0.99       963

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



## Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [86]:
import spacy
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

In [90]:
df['preprocessed_text'] = df['Text'].apply(preprocess)

In [92]:
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,Trump Surrogate BRUTALLY stab pathetic video...
1,U.S. conservative leader optimistic of common ...,Real,1,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,Court Forces Ohio allow Millions illegally p...
4,Democrats say Trump agrees to work on immigrat...,Real,1,democrat trump agree work immigration bill wal...


In [91]:
clf = Pipeline([
    ('vectorize',CountVectorizer(ngram_range=(1,3))),
    ('RF',RandomForestClassifier())
])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1017
           1       1.00      1.00      1.00       963

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

