# Objective

Goal:
Prepare raw text so it can be used for:

- Document Classification

- Named Entity Recognition (NER)

- Summarization

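We will NOT train models here.
We will ONLY clean & prepare text.
Note: the cleaning below lowercases the text and strips digits and punctuation, which suits bag-of-words classification; NER and summarization usually need the original casing and punctuation, so the untouched `full_text` column is kept for those tasks.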

# Imports (Code cell)

In [9]:
import pandas as pd
import re
import string
from typing import List
import sklearn

# Load Dataset (Code cell)

In [2]:
# Load the tab-separated BBC news dump; read_table uses sep="\t" by default.
bbc_df = pd.read_table("../data/bbc-news-data.csv", engine="python")

# Peek at the first rows to confirm the columns parsed as expected.
bbc_df.head()


Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


# Combine title + content (Business logic)

In [3]:
# One text field per article, shaped as "<title>. <content>".
bbc_df["full_text"] = bbc_df["title"].add(". ").add(bbc_df["content"])

bbc_df.loc[:, ["category", "full_text"]].head()


Unnamed: 0,category,full_text
0,business,Ad sales boost Time Warner profit. Quarterly ...
1,business,Dollar gains on Greenspan speech. The dollar ...
2,business,Yukos unit buyer faces loan claim. The owners...
3,business,High fuel prices hit BA's profits. British Ai...
4,business,Pernod takeover talk lifts Domecq. Shares in ...


# Basic Text Cleaning Function

In [5]:
# The patterns and the punctuation table never change, so build them once
# instead of on every call (clean_text runs once per article).
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_DIGIT_PATTERN = re.compile(r"\d+")
_WHITESPACE_PATTERN = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Normalize raw text into lowercase, single-space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. delete everything from ``http`` or ``www`` up to the next whitespace;
      3. delete runs of digits;
      4. delete ASCII punctuation (``string.punctuation``);
      5. collapse whitespace runs to one space and trim both ends.

    Punctuation is deleted, not replaced by a space, so "BA's" becomes
    "bas" and hyphenated words are fused together.

    Args:
        text: Raw text. Non-string values (``None``, or the float NaN that
            pandas uses for missing cells) are treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` for empty or non-string input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # DataFrame column; there is nothing to clean in that case.
    if not isinstance(text, str):
        return ""

    # Lowercase first so the URL pattern also catches "HTTP://..." / "WWW...".
    text = text.lower()
    text = _URL_PATTERN.sub("", text)
    text = _DIGIT_PATTERN.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    # The deletions above leave gaps; normalize them last.
    return _WHITESPACE_PATTERN.sub(" ", text).strip()
    

# Apply Cleaning

In [6]:
# Run the cleaner over every article; Series.map calls clean_text per row.
bbc_df["clean_text"] = bbc_df["full_text"].map(clean_text)

# Show raw and cleaned text side by side for the first two rows.
bbc_df.loc[:, ["full_text", "clean_text"]].head(2)


Unnamed: 0,full_text,clean_text
0,Ad sales boost Time Warner profit. Quarterly ...,ad sales boost time warner profit quarterly pr...
1,Dollar gains on Greenspan speech. The dollar ...,dollar gains on greenspan speech the dollar ha...


# Text Length Analysis (Sanity check)

In [7]:
# Word count per article: split on whitespace, then count the tokens.
bbc_df["text_length"] = bbc_df["clean_text"].str.split().map(len)

# Summary statistics of the word counts, used as a sanity check.
bbc_df.loc[:, "text_length"].describe()


count    2225.000000
mean      377.326292
std       235.333573
min        88.000000
25%       240.000000
50%       327.000000
75%       465.000000
max      4396.000000
Name: text_length, dtype: float64

# Prepare Final Dataset for Modeling

In [8]:
# Keep only the model input and the target; the copy makes final_df
# independent of bbc_df, so columns added to it later stay out of bbc_df.
final_df = bbc_df.loc[:, ["clean_text", "category"]].copy()
final_df.head()

Unnamed: 0,clean_text,category
0,ad sales boost time warner profit quarterly pr...,business
1,dollar gains on greenspan speech the dollar ha...,business
2,yukos unit buyer faces loan claim the owners o...,business
3,high fuel prices hit bas profits british airwa...,business
4,pernod takeover talk lifts domecq shares in uk...,business


# Encode Labels (for ML models)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Turn the string category of each row into an integer code stored in "label".
# fit_transform learns the set of categories and encodes the column in one call.
# The fitted encoder stays bound to `label_encoder` so the codes can be mapped
# back to category names later (e.g. with label_encoder.inverse_transform).
label_encoder = LabelEncoder()
final_df["label"] = label_encoder.fit_transform(final_df["category"])

# Preview: the rows shown in the output below are all "business" and get code 0.
final_df.head()


Unnamed: 0,clean_text,category,label
0,ad sales boost time warner profit quarterly pr...,business,0
1,dollar gains on greenspan speech the dollar ha...,business,0
2,yukos unit buyer faces loan claim the owners o...,business,0
3,high fuel prices hit bas profits british airwa...,business,0
4,pernod takeover talk lifts domecq shares in uk...,business,0
