# Data Preprocessing

## Text Cleaning

### Import Libraries

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models, the stopword lists, and the WordNet corpus.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thabi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thabi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thabi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Raw Data

In [2]:
# Locations of the raw splits, relative to the notebook's working directory.
raw_train_path = '../data/raw_train.csv'
raw_test_path = '../data/raw_test.csv'

# Read each split into its own DataFrame.
train_df, test_df = pd.read_csv(raw_train_path), pd.read_csv(raw_test_path)

### Define Preprocessing Function

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Compiled once so every call to preprocess_text reuses the same pattern objects.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
# 'http\S+' already matches 'https...' links, so no separate alternative is needed.
_URL_RE = re.compile(r'http\S+|www\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    Steps, in order: lowercase, strip HTML tags, strip URLs, drop every
    character that is not an ASCII letter or whitespace, tokenize, remove
    English stopwords, and lemmatize each remaining token.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or '' when the
        input is not a string or nothing survives cleaning.
    """
    # A missing cell arrives as a float NaN, which has no .lower(); treat it
    # as an empty document instead of aborting the whole .apply().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Substitute a space (not '') so words separated only by markup, e.g.
    # "great.<br /><br />the", do not fuse into a single token.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub('', text)
    # Punctuation and digits are deleted outright, so "over-acting" becomes
    # "overacting".
    text = _NON_LETTER_RE.sub('', text)

    tokens = word_tokenize(text)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)

### Apply Preprocessing

In [4]:
# Add a 'clean_text' column to the training and testing frames by passing
# every entry of their 'text' column through preprocess_text.
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['text'].apply(preprocess_text)

### Verify Preprocessing

In [8]:
# Show the first rows of the original and cleaned text side by side as a
# quick sanity check of the cleaning step.
preview = train_df[['text', 'clean_text']].head()
display(preview)

Unnamed: 0,text,clean_text
0,Bromwell High is a cartoon comedy. It ran at t...,bromwell high cartoon comedy ran time program ...
1,Homelessness (or Houselessness as George Carli...,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,brilliant overacting lesley ann warren best dr...
3,This is easily the most underrated film inn th...,easily underrated film inn brook cannon sure f...
4,This is not the typical Mel Brooks film. It wa...,typical mel brook film much less slapstick mov...


## Saving Preprocessed Data

### Save Preprocessed Data

In [9]:
# Write both frames to CSV, omitting the DataFrame index from the files.
for frame, out_path in (
    (train_df, '../data/preprocessed_train.csv'),
    (test_df, '../data/preprocessed_test.csv'),
):
    frame.to_csv(out_path, index=False)