In [4]:
# Generate n-grams with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(min_n, max_n) selects which n-gram sizes enter the vocabulary:
#   (1, 1)  default - single tokens only (plain bag of words)
#   (2, 2)  bi-grams only - every pair of adjacent tokens is one vocabulary entry
#   (1, 2)  single tokens and bi-grams
#   (1, 3)  single tokens, bi-grams and tri-grams
# v = CountVectorizer()
# v = CountVectorizer(ngram_range=(2,2))
# v = CountVectorizer(ngram_range=(1,2))
v = CountVectorizer(ngram_range=(1, 3))

# NOTE: the default token pattern only keeps tokens of two or more characters,
# which is why "a" never shows up in the learned vocabulary.
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [5]:
# Tiny toy corpus used to demonstrate preprocessing and n-gram vectorization.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [6]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize what is left.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens flagged
    as stop words or punctuation are skipped; the lemmas of the remaining tokens
    are returned as one string, separated by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [7]:
preprocess("Thor ate pizza")

'thor eat pizza'

In [8]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [9]:
# Run every document of the toy corpus through the same clean-up step.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [11]:
# Learn a vocabulary of single tokens and bi-grams from the preprocessed corpus.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
# Each n-gram maps to its column index in the vectors produced by transform().
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [13]:
# convert text to array
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [14]:
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [16]:
# ================ News Category Classification Problem ===================
# Now that we know the basics of the bag of n-grams vectorizer 😎, it is time to work on a real problem.
# Here we want to do news category classification. We will use bag of n-grams and train a machine learning model
# that can categorize any news item into one of the following categories:

# 1. BUSINESS
# 2. SPORTS
# 3. CRIME
# 4. SCIENCE


# Dataset
# Dataset Credits: https://www.kaggle.com/code/hengzheng/news-category-classifier-val-acc-0-65

# - This data consists of two columns: text and category
# - text is a news article
# - category can be one of these 4: 'BUSINESS', 'SPORTS', 'CRIME', 'SCIENCE'. To keep things simple,
#   I trimmed additional categories from the original dataset.

import pandas as pd

# Read the JSON file (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_json("news_dataset.json")

# (rows, columns) of the loaded data
print(df.shape)

# Preview the first five rows.
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [17]:
df.category.value_counts()
# => some imbalance in this dataset

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [20]:
# As you can see above, the SCIENCE category has almost 1/3rd of the data samples compared to the BUSINESS and SPORTS categories. 
# I initially trained a model without handling the imbalance and saw a lower f1-score for the SCIENCE category. 
# Hence we need to address this imbalance.

# There are various ways of handling class imbalance which I have discussed in this video: 
# https://www.youtube.com/watch?v=JnlM4yLFNuo

# Out of those techniques, I will use undersampling technique here.

# In undersampling, we take the minority class and draw that many samples from each of the other classes. 
# This means we are not utilizing all the data samples for training, and in the ML world 
# not using all the data for training is considered a SIN! 

# 😵 In real life, you are advised to use a technique such as SMOTE so that you can utilize all of your dataset 
# for training, but since this tutorial is more about bag of n-grams than class imbalance itself, I'll go with 
# the simple technique of undersampling.

# ============== Handle class imbalance ==============
# Size of the smallest class (SCIENCE, 1381 articles in this dataset). It is derived
# from the data rather than hard-coded so the cell stays correct if the dataset changes.
min_samples = int(df.category.value_counts().min())

# Draw `min_samples` random rows from every category. The fixed random_state makes
# the draw reproducible across runs.
df_business, df_sports, df_crime, df_science = (
    df[df.category == label].sample(min_samples, random_state=2022)
    for label in ("BUSINESS", "SPORTS", "CRIME", "SCIENCE")
)


In [21]:
# Stack the four equally sized samples on top of each other (axis=0 appends rows, not columns).
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)
# Every category should now appear the same number of times.
df_balanced.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [22]:
# ========= Convert text category to a number ===========

# Numeric label assigned to each category name.
target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

# Convert category from string (BUSINESS) to number (0), reusing `target` so the
# mapping is defined in exactly one place.
df_balanced['category_num'] = df_balanced['category'].map(target)

In [23]:
df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [24]:
# ========== Build a model with original text (no pre processing) =========
from sklearn.model_selection import train_test_split

# Split the balanced data into train and test sets: the raw text is the feature,
# the numeric category is the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.text,
    df_balanced.category_num,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    # stratify keeps the class proportions of category_num the same in the train and test splits
    stratify=df_balanced.category_num
)

In [25]:
print(X_train.shape)
X_train.head()

(4419,)


7589     Ovulating Women Prefer Images of Penetration O...
10442    Scientists Discover Spooky Influence On Baby N...
8792     Olympic Race Walker Steps Up To Propose To His...
1733     Beloved Bipedal Bear Named Pedals Believed Kil...
2526     Elizabeth Smart Gave Birth To Baby Girl, Fathe...
Name: text, dtype: object

In [26]:
y_train.value_counts()

3    1105
2    1105
0    1105
1    1104
Name: category_num, dtype: int64

In [27]:
y_test.value_counts()

1    277
0    276
3    276
2    276
Name: category_num, dtype: int64