# 1. Data preprocessing

### 1.1 Data reading

In [1]:
import pandas as pd

In [2]:
# The file is read in JSON Lines mode: every line holds one JSON record.
data = pd.read_json(path_or_buf='News_Category_Dataset_IS_course.json', lines=True)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
1,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
2,https://www.huffpost.com/entry/dodgers-basebal...,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,"Maury Wills, who helped the Los Angeles Dodger...","Beth Harris, AP",2022-09-20
3,https://www.huffpost.com/entry/golden-globes-r...,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20
4,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19


### 1.2 Understanding the data

All categories

In [3]:
# Alphabetically sorted list of the distinct (non-null) category labels.
categories = sorted(data['category'].dropna().unique())
categories

['BLACK VOICES',
 'BUSINESS',
 'COMEDY',
 'ENTERTAINMENT',
 'FOOD & DRINK',
 'HEALTHY LIVING',
 'HOME & LIVING',
 'PARENTING',
 'PARENTS',
 'POLITICS',
 'QUEER VOICES',
 'SPORTS',
 'STYLE & BEAUTY',
 'TRAVEL',
 'WELLNESS']

Are there null elements?

In [4]:
# Number of missing values in each column.
data.isna().sum()

link                   0
headline             731
category               0
short_description    736
authors                0
date                   0
dtype: int64

In [5]:
# How many rows are missing both the headline and the short_description?
int(data[['headline', 'short_description']].isnull().all(axis=1).sum())

0

In [6]:
# How many rows have an empty string as `authors`?
int((data['authors'] == '').sum())

24416

### 1.3 Cleaning the data

We don't need the `link`, `authors`, and `date` columns, so we drop them.

In [7]:
# Keep only the columns used later on; `drop` returns a new frame.
data = data.drop(columns=['link', 'authors', 'date'])

Combine `headline` and `short_description` into a single `text` column.

In [8]:
# Build `text` as "<headline> <short_description>". A missing value in either
# column is treated as an empty string, so a row that only has one of the two
# fields falls back to the field that is present. The final strip() removes the
# separator space left over when one side is empty.
# (Previously only `headline` was kept whenever it existed, which silently
# discarded `short_description` for those rows.)
text_parts = data[['headline', 'short_description']].fillna('').astype(str)
data['text'] = (
    text_parts['headline']
    .str.cat(text_parts['short_description'], sep=' ')
    .str.strip()
)
data = data.drop(['headline', 'short_description'], axis=1)

In [9]:
# Display the frame after the column drops and the creation of `text`.
data

Unnamed: 0,category,text
0,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
1,PARENTING,The Funniest Tweets From Parents This Week (Se...
2,SPORTS,"Maury Wills, Base-Stealing Shortstop For Dodge..."
3,ENTERTAINMENT,Golden Globes Returning To NBC In January Afte...
4,POLITICS,Biden Says U.S. Forces Would Defend Taiwan If ...
...,...,...
148117,ENTERTAINMENT,'Girl With the Dragon Tattoo' India Release Ca...
148118,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...
148119,SPORTS,"Giants Over Patriots, Jets Over Colts Among M..."
148120,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...


Imports

In [10]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from IPython.display import clear_output
import inflect

# Fetch the NLTK data packages requested by this notebook.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helper objects: the WordNet lemmatizer and the inflect engine.
lemmatizer = WordNetLemmatizer()
p = inflect.engine()

# Clear this cell's output (the download log) once setup is done.
clear_output()

The `clean_text` function converts the text to lower case, splits it into tokens, removes punctuation and stopwords, and applies lemmatization.

In [11]:
# Lazily-built cache of the English stopword list, so the set is constructed
# once instead of on every call to `clean_text`.
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stopwords as a frozenset, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalize a piece of raw text into a space-separated string of lemmas.

    Processing steps, in order:
      1. lower-case the text;
      2. tokenize it with `word_tokenize`;
      3. keep only tokens made up entirely of letters;
      4. drop English stopwords;
      5. lemmatize every remaining token with the shared `lemmatizer`;
      6. join the lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean. Non-string values (e.g. None or NaN coming from a
        missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives the filters
        or when `text` is not a string.
    """
    # Missing cells reach this function as None/NaN; return an empty result
    # instead of failing on `.lower()`.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())

    # `isalpha()` rejects every token containing a non-letter character
    # (digits, punctuation marks, apostrophes, hyphens), so no separate
    # punctuation-stripping pass is needed afterwards.
    stop_words = _get_stop_words()
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]

    # TODO: number-to-word conversion and stemming are possible extensions;
    # neither is applied here.
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

In [12]:
# Run `clean_text` on every entry of `text` and store the result alongside it.
data['clean_text'] = data['text'].map(clean_text)