In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json


In [2]:
import json
with open("/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json", "r") as file:
    # Read each line and append it to a list
    data = [json.loads(line) for line in file]

In [3]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


# Downsampling data

In [4]:
min_sample= df.category.value_counts().min()
balanced_df = pd.DataFrame(columns=df.columns)
for cat in df.category.unique():
    category_data = df[df.category==cat].sample(min_sample, random_state=2022)
    balanced_df = pd.concat([balanced_df, category_data])
    
balanced_df.head()


Unnamed: 0,link,headline,category,short_description,authors,date
6895,https://www.huffpost.com/entry/topeka-zoo-tige...,"Zookeeper Mauled By Topeka Zoo Tiger, Rushed T...",U.S. NEWS,The incident occurred just after the zoo opene...,,2019-04-20
1016,https://www.huffpost.com/entry/wall-street-sto...,Stocks Sway On Wall Street As Oil Slides To $1...,U.S. NEWS,Markets have careened in recent weeks amid unc...,"STAN CHOE and DAMIAN J. TROISE, AP",2022-03-14
629,https://www.huffpost.com/entry/toddler-cheeseb...,Toddler Who Pulled Off Cheeseburger 'Heist' Ge...,U.S. NEWS,"""It hurts to see other people living your drea...",Josephine Harvey,2022-05-24
8670,https://www.huffpost.com/entry/subtropical-sto...,Subtropical Storm Alberto Barrels Up Gulf Coas...,U.S. NEWS,"Florida, Alabama and Mississippi declared stat...",,2018-05-27
6573,https://www.huffpost.com/entry/bodies-found-ri...,"Woman, 3 Children Died Of Heat Exposure After ...",U.S. NEWS,Their bodies were reportedly found in or near ...,,2019-06-24


In [5]:
category_map = {category:index for index, category in enumerate(balanced_df.category.unique())}
balanced_df['category_num'] = balanced_df.category.map(category_map)
balanced_df.category_num

6895       0
1016       0
629        0
8670       0
6573       0
          ..
209355    41
160235    41
185978    41
160171    41
165852    41
Name: category_num, Length: 42588, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(balanced_df.short_description, balanced_df.category_num, test_size=0.2, random_state=42, stratify = balanced_df.category_num, shuffle=True)
X_train.shape, X_test.shape

((34070,), (8518,))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.20      0.33      0.25       203
           1       0.31      0.06      0.11       203
           2       0.13      0.38      0.19       203
           3       0.51      0.25      0.33       203
           4       0.07      0.43      0.12       202
           5       0.65      0.17      0.27       203
           6       0.57      0.18      0.27       203
           7       0.18      0.03      0.05       203
           8       0.24      0.16      0.19       202
           9       0.70      0.03      0.07       203
          10       0.29      0.38      0.33       203
          11       0.22      0.46      0.30       203
          12       0.46      0.20      0.28       203
          13       0.72      0.20      0.32       202
          14       0.15      0.52      0.23       203
          15       0.28      0.13      0.18       203
          16       0.42      0.36      0.39       203
          17       0.43    

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct ])

In [9]:
balanced_df['pro_describe'] = balanced_df.short_description.apply(preprocess)
balanced_df.pro_describe

6895                            incident occur zoo open day
1016      market careen recent week amid uncertainty eco...
629                       hurt people live dream user write
8670      Florida Alabama Mississippi declare state emer...
6573         body reportedly find near Anzalduas Park Texas
                                ...                        
209355    legal freedom come divorce decree necessarily ...
160235    day ago think entirely possible good friend ex...
185978    Maria Shriver estranged husband Arnold Schwarz...
160171    accord recent datum National Opinion Research ...
165852    divorce coach Mandy Walker founder divorce sup...
Name: pro_describe, Length: 42588, dtype: object

In [10]:
X_train, X_test, y_train, y_test = train_test_split(balanced_df.pro_describe, balanced_df.category_num, test_size=0.2, random_state=42, stratify = balanced_df.category_num, shuffle=True)

clf2 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

clf2.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.22      0.26      0.24       203
           1       0.22      0.10      0.14       203
           2       0.15      0.29      0.20       203
           3       0.44      0.27      0.33       203
           4       0.07      0.40      0.11       202
           5       0.45      0.29      0.35       203
           6       0.51      0.28      0.36       203
           7       0.22      0.10      0.14       203
           8       0.25      0.26      0.26       202
           9       0.35      0.06      0.11       203
          10       0.32      0.37      0.35       203
          11       0.26      0.44      0.33       203
          12       0.29      0.28      0.28       203
          13       0.60      0.29      0.39       202
          14       0.19      0.39      0.26       203
          15       0.27      0.21      0.24       203
          16       0.34      0.52      0.41       203
          17       0.34    