In [1]:
# IMPORTS #
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [17]:
data = pd.read_json("data/original_data.json", lines=True)

In [3]:
print(f"There are {len(data)} rows in the original dataset.")
# Check for duplicate data values

total_duplicate_titles = sum(data["headline"].duplicated())
print(f"There are {total_duplicate_titles} duplicate rows (headline).")
total_duplicate_titles = sum(data["short_description"].duplicated())
print(f"There are {total_duplicate_titles} duplicate rows (short_description).")


# Drop duplicate values
data = data[~data["headline"].duplicated()]
data = data[~data["short_description"].duplicated()]
print(f"There are {len(data)} rows in the deduplicated dataset.")

# How many unique terms?
print(data["short_description"].nunique())

There are 209527 rows in the original dataset.
There are 1531 duplicate rows (headline).
There are 22505 duplicate rows (short_description).
There are 186024 rows in the deduplicated dataset.
186024


In [4]:
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
# Statistics on "short_description" row contents pre processing
data["short_description"].apply(lambda x: len(x.split(" "))).describe()

count    186024.000000
mean         21.782340
std          13.021497
min           1.000000
25%          13.000000
50%          20.000000
75%          26.000000
max         241.000000
Name: short_description, dtype: float64

In [6]:
# Total word count pre processing
text = ' '.join(data['short_description'])
pre_word_count = len(text.split())
print(f'Total word count: {pre_word_count}')

Total word count: 4048442


In [7]:
# First five documents before text processing
for i in range(5):
    print(data["short_description"][i])

Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.
"Until you have a dog you don't understand what could be eaten."
"Accidentally put grown-up toothpaste on my toddler’s toothbrush and he screamed like I was cleaning his teeth with a Carolina Reaper dipped in Tabasco sauce."
Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.


In [8]:
# Load NLTK English stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/snelson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/snelson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
# Lemmatize and remove stop words

data['short_description'] = data['short_description'].apply(
    lambda x: ' '.join(
        [lemmatizer.lemmatize(word.lower()) for word in x.split() if word.lower() not in stop_words]
    )
)

In [10]:
# Statistics on "short_description" row contents post processing
data["short_description"].apply(lambda x: len(x.split(" "))).describe()

count    186024.000000
mean         12.493823
std           6.903307
min           1.000000
25%           8.000000
50%          12.000000
75%          15.000000
max         139.000000
Name: short_description, dtype: float64

In [22]:
data["short_description"]

0         Health experts said it is too early to predict...
1         He was subdued by passengers and crew when he ...
2         "Until you have a dog you don't understand wha...
3         "Accidentally put grown-up toothpaste on my to...
4         Amy Cooper accused investment firm Franklin Te...
                                ...                        
209522    Verizon Wireless and AT&T are already promotin...
209523    Afterward, Azarenka, more effusive with the pr...
209524    Leading up to Super Bowl XLVI, the most talked...
209525    CORRECTION: An earlier version of this story i...
209526    The five-time all-star center tore into his te...
Name: short_description, Length: 209527, dtype: object

In [11]:
# First five documents before text processing
for i in range(5):
    print(data["short_description"][i])

health expert said early predict whether demand would match 171 million dos new booster u.s. ordered fall.
subdued passenger crew fled back aircraft confrontation, according u.s. attorney's office los angeles.
"until dog understand could eaten."
"accidentally put grown-up toothpaste toddler’s toothbrush screamed like cleaning teeth carolina reaper dipped tabasco sauce."
amy cooper accused investment firm franklin templeton unfairly firing branding racist video central park encounter went viral.


In [16]:
# Total word count post processing
text = ' '.join(data['short_description'])
word_count = len(text.split())
print(f'Total word count: {word_count}')
print(f'The difference between pre and post processing is {(pre_word_count - word_count)} words')

Total word count: 2324149
The difference between pre and post processing is 1724293 words


In [24]:
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,health expert said early predict whether deman...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,subdued passenger crew fled back aircraft conf...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""until dog understand could eaten.""",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""accidentally put grown-up toothpaste toddler’...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,amy cooper accused investment firm franklin te...,Nina Golgowski,2022-09-22


In [26]:
# Write processed text to file
data = data.loc[:, ["category","short_description"]]
data.to_csv('data/cleaned_data.csv')