In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])

In [5]:
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [6]:
corpus = [
    "Thor ate pizza",
    "Loci is tail",
    "Loci eating pizza"
]

In [7]:
import spacy

In [8]:
nlp = spacy.load("en_core_web_sm")

In [9]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [10]:
preprocess("thor ate pizza")

'thor eat pizza'

In [11]:
corpus_processed = [preprocess(text) for text in corpus]
corpus_processed

['thor eat pizza', 'loci tail', 'loci eat pizza']

In [12]:
v = CountVectorizer(ngram_range=(1,2))

v.fit(corpus_processed)
v.vocabulary_ #word with index number

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loci': 2,
 'tail': 6,
 'loci tail': 4,
 'loci eat': 3}

In [16]:
corpus_trans = v.transform(["Thor ate pizza"])
corpus_trans

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2 stored elements and shape (1, 9)>

In [17]:
corpus_trans.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [18]:
import pandas as pd

In [20]:
df = pd.read_json('news_cat.json', lines=True)

In [21]:
print(df.shape)

(209527, 6)


In [22]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [57]:
new_df = df[['short_description' , 'category']].head(10000)

In [58]:
new_df.head()

Unnamed: 0,short_description,category
0,Health experts said it is too early to predict...,U.S. NEWS
1,He was subdued by passengers and crew when he ...,U.S. NEWS
2,"""Until you have a dog you don't understand wha...",COMEDY
3,"""Accidentally put grown-up toothpaste on my to...",PARENTING
4,Amy Cooper accused investment firm Franklin Te...,U.S. NEWS


In [59]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   short_description  10000 non-null  object
 1   category           10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [60]:
new_df.category.value_counts()

category
POLITICS          3361
ENTERTAINMENT     1597
U.S. NEWS         1377
WORLD NEWS        1194
COMEDY             319
SPORTS             209
CRIME              193
STYLE & BEAUTY     165
MEDIA              163
WEIRD NEWS         142
BLACK VOICES       128
HOME & LIVING      125
ENVIRONMENT        121
WELLNESS           118
FOOD & DRINK       114
PARENTING          114
WOMEN              110
QUEER VOICES       102
BUSINESS            65
MONEY               49
CULTURE & ARTS      44
IMPACT              41
TECH                39
SCIENCE             34
RELIGION            33
TRAVEL              24
EDUCATION           12
LATINO VOICES        5
WEDDINGS             2
Name: count, dtype: int64

In [55]:
new_df.category[0]

'U.S. NEWS'

In [66]:
min_sample = 1000

df_polictics = new_df[new_df.category == "POLITICS" ].sample(min_sample, random_state=2022)
df_ENTERTAINMENT = new_df[new_df.category == "ENTERTAINMENT" ].sample(min_sample, random_state=2022)
df_U_S_NEWS = new_df[new_df.category == "U.S. NEWS" ].sample(min_sample, random_state=2022)
df_WORLD_NEWS = new_df[new_df.category == "WORLD NEWS" ].sample(min_sample, random_state=2022)

# df_sampled = df.sample(min_sample, random_state=2022,axis=0)
# df_sampled = df['category'].apply(lambda x: x.sample(min_sample, random_state=2022))


In [67]:
df_balanced = pd.concat([df_polictics,df_ENTERTAINMENT,df_U_S_NEWS,df_WORLD_NEWS],axis=0)

In [68]:
df_balanced.category.value_counts()

category
POLITICS         1000
ENTERTAINMENT    1000
U.S. NEWS        1000
WORLD NEWS       1000
Name: count, dtype: int64

In [69]:
df_balanced.head()

Unnamed: 0,short_description,category
2607,Biden had vowed during the campaign to “revers...,POLITICS
1364,"The Alabama Republican, who talked of ""blood"" ...",POLITICS
2208,The former first lady's team said Michael Besc...,POLITICS
9352,Much of the U.S. public appears not to have an...,POLITICS
9228,"It's only a temporary victory, however.",POLITICS


In [70]:
df_balanced['category_num'] = df_balanced.category.map({
    'POLITICS': 0,
    'ENTERTAINMENT': 1,
    'U.S. NEWS': 2,
    'WORLD NEWS': 3
})

In [71]:
df_balanced.head()

Unnamed: 0,short_description,category,category_num
2607,Biden had vowed during the campaign to “revers...,POLITICS,0
1364,"The Alabama Republican, who talked of ""blood"" ...",POLITICS,0
2208,The former first lady's team said Michael Besc...,POLITICS,0
9352,Much of the U.S. public appears not to have an...,POLITICS,0
9228,"It's only a temporary victory, however.",POLITICS,0


In [72]:
from sklearn.model_selection import train_test_split

In [73]:
X_train,X_test,y_train,y_test = train_test_split(
    df_balanced.short_description,
    df_balanced.category_num,
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.category_num #it create equal number of train and test sample from all the classes
    )

In [74]:
print(X_train.shape)

(3200,)


In [75]:
X_train.head()

5880    Christopher Anderson, a State Department offic...
3233    The deal follows months of negotiations with p...
5619    The Oscar- and Grammy-winning performer said s...
5005    The 18-year-old singer is hitting back at trol...
5199    Jerome Adams complimented Trump's health but o...
Name: short_description, dtype: object

In [78]:
X_train.value_counts()

short_description
Christopher Anderson, a State Department official, will testify that he was warned of Rudy Giuliani’s back-channel involvement in Ukraine policy.       1
Officers were accused of using false information to obtain the search warrant that led to Taylor's killing.                                             1
“The Drew Barrymore Show” host offered her account of what “really happened” between the “Charlie’s Angels” co-stars during an on-set confrontation.    1
The Georgia Republican was given the boot after repeatedly violating Twitter's COVID-19 misinformation policy, the company said.                        1
She knows how many Americans remain wary of big changes, and is betting she can guide them there.                                                       1
                                                                                                                                                       ..
The social media giant was fined for violations of data pr

In [79]:
y_train.value_counts()

category_num
0    800
2    800
1    800
3    800
Name: count, dtype: int64

In [81]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [82]:

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('multinb', MultinomialNB()),
])

In [83]:
clf.fit(X_train,y_train)

In [84]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.68      0.63       200
           1       0.80      0.70      0.75       200
           2       0.52      0.54      0.53       200
           3       0.68      0.62      0.65       200

    accuracy                           0.64       800
   macro avg       0.64      0.64      0.64       800
weighted avg       0.64      0.64      0.64       800



In [87]:
clf2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('multinb', MultinomialNB()),
])

In [88]:
clf2.fit(X_train,y_train)

In [89]:
y_pred = clf2.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.71      0.63       200
           1       0.80      0.68      0.74       200
           2       0.56      0.53      0.54       200
           3       0.69      0.64      0.66       200

    accuracy                           0.64       800
   macro avg       0.65      0.64      0.64       800
weighted avg       0.65      0.64      0.64       800



In [None]:
# 31:09