In [1]:
import pandas as pd
import re
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# NLTK setup
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load data
train_df = pd.read_csv(r"../data/train.csv")
test_df = pd.read_csv(r"../data/test.csv")

In [3]:
# Step 1: Quick cleaning using pandas
def fast_pandas_clean(text):
    text = contractions.fix(text)                          # Expand contractions
    text = text.lower()                                    # Lowercase
    text = re.sub(r"http\S+|www\S+", "", text)             # Remove URLs
    text = re.sub(r"@\w+", "", text)                       # Remove @mentions
    text = text.replace("#", "")                           # Remove '#' but keep the word
    text = re.sub(r"[^\w\s]", "", text)                    # Remove other punctuation
    text = re.sub(r"\s+", " ", text).strip()               # Remove extra spaces
    return text

for df in [train_df, test_df]:
    df['text_clean'] = df['text'].astype(str).apply(fast_pandas_clean)

In [4]:
# Step 2: Discover important stopwords from the dataset
all_stopwords = set(stopwords.words('english'))

# Disaster vs Non-disaster split (only train set has target)
disaster_texts = train_df[train_df['target'] == 1]['text_clean'].dropna().tolist()
non_disaster_texts = train_df[train_df['target'] == 0]['text_clean'].dropna().tolist()

def count_stopwords(texts, stopword_list):
    counter = Counter()
    for text in texts:
        tokens = word_tokenize(text)
        for token in tokens:
            if token in stopword_list:
                counter[token] += 1
    return counter

disaster_counts = count_stopwords(disaster_texts, all_stopwords)
non_disaster_counts = count_stopwords(non_disaster_texts, all_stopwords)

# Important stopwords discovery
important_stopwords = []
for word in all_stopwords:
    disaster_freq = disaster_counts[word]
    non_disaster_freq = non_disaster_counts[word]
    total = disaster_freq + non_disaster_freq
    if total > 5:  # minimum appearances
        if disaster_freq / (total + 1e-6) > 0.6:  # heavily disaster-weighted
            important_stopwords.append(word)

print(f"Important stopwords discovered: {important_stopwords}")

Important stopwords discovered: ['after', 'than', 'under', 's', 'during', 'over', 'those', 'were', 'm', 't']


In [5]:
# Step 3: Update stopwords set
updated_stopwords = all_stopwords - set(important_stopwords)


In [6]:
# Step 4: Deeper cleaning
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    try:
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word.lower() not in updated_stopwords]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    except LookupError as e:
        print("NLTK resource missing:", e)
        return text

# Apply deep cleaning to train and test
for df in [train_df, test_df]:
    df['text_final'] = df['text_clean'].apply(preprocess_text)

In [7]:
# Step 5: Save cleaned datasets
train_df.to_csv("cleaned_train.csv", index=False)
test_df.to_csv("cleaned_test.csv", index=False)

print("\nCleaned train and test sets saved!")


Cleaned train and test sets saved!
