# 02_preprocessing.ipynb

# Step 1: Import Libraries

In [3]:
import functools
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (reported as up-to-date when already present)
nltk.download('stopwords')

# Instantiate the spaCy pipeline that preprocess_text uses for tokens and lemmas
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 2: Load Dataset


In [4]:
# Read train.csv from the sibling data directory into a DataFrame
train_csv_path = "../data/train.csv"
df_train = pd.read_csv(train_csv_path)

# Step 3: Drop Rows with Missing Text


In [5]:
# Keep only the rows whose 'text' value is present
rows_with_text = df_train.dropna(subset=['text'])
df_train = rows_with_text

# Step 4: Define Text Preprocessing Function

In [6]:
@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a raw string into a space-joined sequence of lemmas.

    Steps, in order:
      1. lowercase the input;
      2. delete every character that is neither an ASCII letter nor whitespace
         (digits, punctuation and non-ASCII characters are removed);
      3. collapse each run of whitespace into a single space;
      4. run the spaCy pipeline ``nlp`` over the result;
      5. keep the lemma of every token whose surface text is not an NLTK
         English stop word and that spaCy does not flag as punctuation or
         whitespace.

    Args:
        text: The string to clean. Must be a ``str``; the first step calls
            ``text.lower()``, so other types (e.g. NaN floats) raise
            ``AttributeError``.

    Returns:
        The retained lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)

    # Look the stop words up once per call in a cached frozenset. Calling
    # stopwords.words('english') inside the filter built a new list and
    # scanned it linearly for every single token.
    stop_words = _english_stopwords()

    # The stop-word test is on the token's surface form, not its lemma.
    tokens = [
        token.lemma_
        for token in doc
        if token.text not in stop_words
        and not token.is_punct
        and not token.is_space
    ]
    return " ".join(tokens)

# Step 5: Apply Preprocessing


In [9]:
# Run every 'text' entry through preprocess_text and store the result as a new column
cleaned_series = df_train['text'].apply(preprocess_text)
df_train['clean_text'] = cleaned_series

# Step 6: Verify Cleaned Data

In [10]:
# Show the first rows of the raw and cleaned text side by side
preview_columns = ['text', 'clean_text']
print(df_train[preview_columns].head())

                                                text  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  Ever get the feeling your life circles the rou...   
2  Why the Truth Might Get You Fired October 29, ...   
3  Videos 15 Civilians Killed In Single US Airstr...   
4  Print \nAn Iranian woman has been sentenced to...   

                                          clean_text  
0  house dem aide not even see comey letter jason...  
1  ever get feeling life circle roundabout rather...  
2  truth might get fire october tension intellige...  
3  videos civilian kill single us airstrike ident...  
4  print iranian woman sentence six year prison i...  


# Step 7: Save Cleaned Data

In [11]:
# Write the frame, clean_text column included, to CSV without the index column
cleaned_csv_path = "../data/cleaned_train.csv"
df_train.to_csv(cleaned_csv_path, index=False)