In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2)) # (x,y) size of the window = x and y.

In [7]:
text = ['Hi who is there']
cv.fit(text)
cv.vocabulary_

{'hi who': 0, 'who is': 2, 'is there': 1}

In [9]:
# converting this corpus into vectorizer.
corpus = ["thor ate pizza", "loki is tall", "loki is eating pizza"]
# first remove the stop words:
cv.fit(corpus)
cv.vocabulary_

{'thor ate': 5,
 'ate pizza': 0,
 'loki is': 4,
 'is tall': 3,
 'is eating': 2,
 'eating pizza': 1}

In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm")

In [24]:
# pre-processing the text: 
def preprocess(text):
    doc = nlp(text)
    clean_str = list()
    for token in doc:
        if not token.is_stop:
            clean_str.append(token.lemma_) # to get the base words
    return " ".join(clean_str)

In [26]:
modi_corpus = [preprocess(cp) for cp in corpus ] # using list comprehension
modi_corpus

['thor eat pizza', 'loki tall', 'loki eat pizza']

In [27]:
# now applying the countvectorizer:
cv=CountVectorizer(ngram_range = (1,2))
cv.fit(modi_corpus)
cv.vocabulary_ # to get all the tokens.

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

* Now we have created the dictionary of n-grams:
* we are ready to get the corresponding vectors for the individual corpus:


In [37]:
cv.transform(modi_corpus).toarray()  # getting the count of the n-grams in each doc

array([[1, 1, 0, 0, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 1, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [32]:
cv.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [40]:
import pandas as pd
df = pd.read_json('news_dataset.json')

In [41]:
df.head(4)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS


In [46]:
df.category.value_counts() # unbalanced target variables.

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

## Handling the unbalanced target 

In [47]:
min_sample = 1381 # lowest class count.
df_business = df[df['category'] == 'BUSINESS'].sample(min_sample, random_state =2024)
df_sports = df[df['category']=='SPORTS'].sample(min_sample, random_state=2024)
df_crime = df[df['category']== 'CRIME'].sample(min_sample, random_state=2024)
df_science = df[df['category']=='SCIENCE'].sample(min_sample, random_state=2024)

In [51]:
df_balanced = pd.concat([df_business, df_sports, df_crime, df_science], axis = 0)


In [60]:
df_balanced.category.value_counts() # now target is balanced.

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [64]:
## Converting the labels in to numbers:
target_map = {'BUSINESS':0, 'SPORTS':1, 'CRIME':2, 'SCIENCE':3}
df_balanced['mapped_category'] = df_balanced['category'].map(target_map)

In [66]:
df_balanced.sample(5)

Unnamed: 0,text,category,mapped_category
9569,What Sets Buddy Hield’s March Madness Run Apar...,SPORTS,1
1921,Golden State Warriors Remain Undefeated Their ...,SPORTS,1
10837,Sheriff: Dakota Access Construction Equipment ...,CRIME,2
480,Georgia Children Shot Dead By Home Invader No ...,CRIME,2
5911,"A Day Among the Giants Upon approach, offshore...",SCIENCE,3


In [79]:
# train test split:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'], df_balanced['mapped_category'], test_size=0.2,stratify = df_balanced.mapped_category, random_state=2024)
# stratify selects equal no of labels in target and test :

In [82]:
y_train.value_counts()

mapped_category
3    1105
1    1105
2    1105
0    1104
Name: count, dtype: int64

# model application

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf = Pipeline([
    ('countvectorizer', CountVectorizer()), 
    ('nb', MultinomialNB())
])

In [84]:
clf.fit(X_train, y_train)

In [85]:
y_pred = clf.predict(X_test)

In [87]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       277
           1       0.91      0.84      0.87       276
           2       0.89      0.91      0.90       276
           3       0.89      0.79      0.84       276

    accuracy                           0.85      1105
   macro avg       0.86      0.85      0.85      1105
weighted avg       0.86      0.85      0.85      1105



# now using `n-grams`

In [91]:
clf = Pipeline([
    ('countvectorizer', CountVectorizer(ngram_range=(1,2))), # uni + bi-grams.
    ('nb', MultinomialNB())
])
clf.fit(X_train,y_train)
print(classification_report(y_test,clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.71      0.92      0.80       277
           1       0.95      0.82      0.88       276
           2       0.90      0.90      0.90       276
           3       0.93      0.78      0.85       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.85      1105
weighted avg       0.87      0.85      0.85      1105



In [92]:
# not any improvement: but for larger data it will be fine.

In [106]:
X_test[:5]

1660     12-Year-Old LeBron James Jr.'s New Highlight R...
11780    Unlocking Big Data's Value Potential Through D...
3100     NASA Launches GOES-R Weather Satellite Images ...
12156    NHL Team Makes Stirring Gesture To Honor Paris...
11912    Women in Business: Teresa Scott, Founder, Kenn...
Name: text, dtype: object

In [107]:
y_pred[:5],target_map # we can see that the prediction is working fine 

(array([1, 0, 3, 1, 0], dtype=int64),
 {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3})

# after pre-processing: 

In [108]:
nlp = spacy.load("en_core_web_sm")

In [132]:
# let's remove the stopwords and see the changes:
def pre_process(text):
    doc = nlp(text)
    free_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(free_text)

In [133]:
pre_process("Hi I'm raushan. My age is 21 years. Where are you going?")

'hi raushan age 21 year go'

In [134]:
df.head(3)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS


In [135]:
df_balanced['new_text'] = df_balanced['text'].apply(pre_process)

In [136]:
df_balanced.head(4)

Unnamed: 0,text,category,mapped_category,new_text
2487,Amazon To Challenge Alibaba In Global Delivery...,BUSINESS,0,Amazon challenge Alibaba Global Delivery Marke...
12534,"A Tale of Two Investors Like many investors, P...",BUSINESS,0,Tale Investors like investor Paul Mary trust W...
10014,"Quit Working So Hard At the end of the day, ho...",BUSINESS,0,Quit Working hard end day time purely truly pr...
4605,The Rating Game It's hard to find a restaurant...,BUSINESS,0,Rating Game hard find restaurant place little ...


In [139]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced['new_text'], df_balanced['mapped_category'], test_size=0.2,stratify = df_balanced.mapped_category, random_state=2024)
# stratify selects equal no of labels in target and test :

In [140]:
# applying again the model:
clf = Pipeline([
    ('countvectorizer', CountVectorizer(ngram_range=(1,2))), # uni + bi-grams.
    ('nb', MultinomialNB())
])
clf.fit(X_train,y_train)
print(classification_report(y_test,clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       277
           1       0.92      0.87      0.89       276
           2       0.85      0.93      0.89       276
           3       0.94      0.82      0.88       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105



### we see that the accuracy and the performance has increased..

## Exercise:

About Data: Fake News Detection
Credits: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

This data consists of two columns. - Text - label

Text is the statements or messages regarding a particular event/situation.

label feature tells whether the given Text is Fake or Real.

As there are only 2 classes, this problem comes under the Binary Classification.

In [1]:
import pandas as pd
df  = pd.read_csv('news.csv')

In [10]:
print(df.shape)
df.sample(3)

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
4682,1581,Clinton’s 2016 makeover the latest in long lin...,"Another campaign, another reset for Hillary Cl...",REAL
5790,6520,Breaking Down the Crooked Clintons with Specia...,"In this News Brief, Joe Joseph is joined by go...",FAKE
3153,9884,Contaminated food from China now entering the ...,Contaminated food from China now entering the ...,FAKE


In [7]:
df.label.value_counts() # labelled dataset.

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [12]:
# giving number to the label:
target_map = {'REAL': 1, 'FAKE': 0}
df['label_num'] = df['label'].map(target_map) # using .map function

In [14]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,title,text,label,label_num
3122,3478,Supreme Court Throws Out Ruling On Obamacare C...,"WASHINGTON, March 9 (Reuters) - The U.S. Supre...",REAL,1
376,3103,Pope Francis met Kim Davis: why it matters in ...,The decision by Pope Francis to give his perso...,REAL,1
4852,5989,POLL: Who will win Florida? - USAPoliticsNow,Comments \nDonald Trump has a 2 percentage poi...,FAKE,0


### Modelling without Pre-processing Text data

In [21]:
# train test split:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_num'], test_size=0.2,stratify=df.label_num, random_state=2024)

In [22]:
X_train.shape, X_test.shape

((5068,), (1267,))

## Attempt 1 :

1. using sklearn pipeline module create a classification pipeline to classify the Data.



### Note:
* using CountVectorizer with unigram, bigram, and trigrams.
* use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean' distance.
* print the classification report.

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [34]:
clf = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsClassifier(metric='euclidean',n_neighbors=10))
])


In [35]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.83      0.79       633
           1       0.81      0.73      0.77       634

    accuracy                           0.78      1267
   macro avg       0.78      0.78      0.78      1267
weighted avg       0.78      0.78      0.78      1267



## Attempt 2 :

### Note:

* using CountVectorizer with unigram, bigram, and trigrams.
* use KNN as the classifier with n_neighbors of 10 and metric as 'cosine' distance.
* print the classification report.

In [40]:
clf2 = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,3))),
    ('knn', KNeighborsClassifier(n_neighbors=10))
])

In [41]:
clf2.fit(X_train, y_train)

In [42]:
y_pred = clf2.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       633
           1       0.77      0.63      0.69       634

    accuracy                           0.72      1267
   macro avg       0.73      0.72      0.72      1267
weighted avg       0.73      0.72      0.72      1267



In [None]:
# N-grams didn't increase the performance of the model:
# let's see with other model:


## Attempt 3 :

## Note:

* using CountVectorizer with only trigrams.
* use RandomForest as the classifier.
* print the classification report.


In [45]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
clf3 = Pipeline([
    ('cv', CountVectorizer(ngram_range=(3,3))),
    ('rf', RandomForestClassifier())
])


In [51]:
# clf3.fit(X_train,y_train)
# y_pred = clf3.predict(X_test)
# print(classification_report(y_test,y_pred))

## Attempt 4:

## Note:
* using CountVectorizer with both unigram and bigrams.
* use Multinomial Naive Bayes as the classifier with an alpha value of 0.75.
* print the classification report.

In [53]:
from sklearn.naive_bayes import MultinomialNB

In [58]:
clf4  = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2))),
    ('rf', MultinomialNB(alpha=0.5))
])


In [59]:
clf4.fit(X_train,y_train)

In [61]:
y_pred = clf4.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.86      0.90       633
           1       0.87      0.96      0.91       634

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267



* We can see that the accuracy and F-scores improved using Naive bayes

## Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [65]:
df.sample(2)

Unnamed: 0.1,Unnamed: 0,title,text,label,label_num
5272,9049,"Syrian War Report – November 2, 2016: ISIS and...",SouthFront Syrian War Report Leave a Reply 1 C...,FAKE,0
1558,5235,What history tells us about the health of pres...,Hillary Clinton's recent pneumonia diagnosis r...,REAL,1


In [64]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [66]:
def pre_process(text):
    doc = nlp(text)
    no_stop = list()
    for token in doc:
        if not token.is_stop or not token.is_punct:
            no_stop.append(token.lemma_)
    return " ".join(no_stop)

### taking too much time

In [68]:
# df['process_text'] = df['text'].apply(pre_process) 

In [None]:
## Build a model with pre processed text

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['process_text'], df['label_num'], test_size=0.2,stratify=df.label_num, random_state=2024)

In [None]:
# using previous models we'll check the model performance: