## Data Preprocessing and Feature Engineering on News Category Dataset

In [33]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from sklearn.preprocessing import LabelEncoder
import joblib

In [34]:
df=pd.read_json('../data/News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [36]:
df.category.unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [23]:
# We only need headline, description and category

df['text'] = df['headline'] + ' ' + df['short_description']
df = df[['text', 'category']]

In [24]:
df.isnull().sum()

text        0
category    0
dtype: int64

In [25]:
df.isna().sum()

text        0
category    0
dtype: int64

In [26]:
# There are no null values in the dataset nor any missing values
# Let's clean the text data
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove Punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stem the words
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

# Apply the clean_text function to the text column
df['text'] = df['text'].apply(clean_text)


In [27]:
df.head()

Unnamed: 0,text,category
0,million american roll sleev omicrontarget covi...,U.S. NEWS
1,american airlin flyer charg ban life punch fli...,U.S. NEWS
2,funniest tweet cat dog week sept dog dont unde...,COMEDY
3,funniest tweet parent week sept accident put g...,PARENTING
4,woman call cop black birdwatch lose lawsuit ex...,U.S. NEWS


In [28]:
# Label Encoding for category
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])
df['category'].value_counts()
df.head()

Unnamed: 0,text,category
0,million american roll sleev omicrontarget covi...,35
1,american airlin flyer charg ban life punch fli...,35
2,funniest tweet cat dog week sept dog dont unde...,5
3,funniest tweet parent week sept accident put g...,22
4,woman call cop black birdwatch lose lawsuit ex...,35


In [37]:
# saving the cleaned data
df.to_csv('../data/cleaned_data.csv', index=False)

# saving the label encoder
joblib.dump(le, '../outputs/models/label_encoder.pkl')

['../outputs/models/label_encoder.pkl']