In [21]:
import pandas as pd
import re

def preprocess_track_a(df):
    """
    Preprocesses the Track-A dataset:
    - Lowercases text
    - Removes punctuation (except intra-word apostrophes)
    - Removes extra whitespace
    - Ensures label columns are integers (0/1)
    Returns a cleaned DataFrame.
    """
    # label columns as per given dataset
    label_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']
    
    # Lowercase and clean text
    def clean_text(text):
        if pd.isnull(text):
            return ""
        text = text.lower()
        # to retain alphanum
        text = re.sub(r"[^a-z0-9\s']", ' ', text)
        # to normalize whitespace
        text = re.sub(r"\s+", ' ', text)
        return text.strip()
    
    # Initially, cleaning only the 'text'
    df['text'] = df['text'].astype(str).apply(clean_text)
    
    # then, to ensure labels are integers (0/1)
    for col in label_cols:
        df[col] = df[col].fillna(0).astype(int)
    
    
    return df

if __name__ == "__main__":
    # location to the track-a.csv
    df = pd.read_csv('track-a.csv')
    # print(df.head())
    print(df.loc[df['id'] == "eng_train_track_a_01304"])
    # df.info()

                           id                   text  anger  fear  joy  \
1303  eng_train_track_a_01304  &lt;/crazy-nutter&gt;      0     1    0   

      sadness  surprise  
1303        0         1  


In [22]:
# df_clean = preprocess_track_a(df)
# # print(df_clean.head())

# df_clean.head()

if __name__ == "__main__":
    # location to the track-a.csv
    df = pd.read_csv('track-a.csv')
    df_clean = preprocess_track_a(df)

    print(df_clean.head())

    print(df.loc[df['id'] == "eng_train_track_a_01304"])
    # df.info()




                        id                                               text  \
0  eng_train_track_a_00001                         colorado middle of nowhere   
1  eng_train_track_a_00002  this involved swimming a pretty large lake tha...   
2  eng_train_track_a_00003         it was one of my most shameful experiences   
3  eng_train_track_a_00004  after all i had vegetables coming out my ears ...   
4  eng_train_track_a_00005                         then the screaming started   

   anger  fear  joy  sadness  surprise  
0      0     1    0        0         1  
1      0     1    0        0         0  
2      0     1    0        1         0  
3      0     0    0        0         0  
4      0     1    0        1         1  
                           id                text  anger  fear  joy  sadness  \
1303  eng_train_track_a_01304  lt crazy nutter gt      0     1    0        0   

      surprise  
1303         1  


# temp

In [None]:
import re
from bs4 import BeautifulSoup
import spacy
from emot.emo_unicode import EMOTICONS_EMO  # For emoji handling (install: pip install emot)

# Initialize spaCy without parser/ner for efficiency
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(
    text, 
    custom_emotion_map=None, 
    remove_stopwords=False, 
    lemmatize=True,
    preserve_punct=True
):
    """
    Generic text cleaner for emotion detection.
    
    Args:
        text (str): Raw input text.
        custom_emotion_map (dict): Optional {symbol: replacement} for emotional cues.
        remove_stopwords (bool): Remove stopwords if True.
        lemmatize (bool): Lemmatize tokens if True.
        preserve_punct (bool): Keep [!?] if True (critical for emotion detection).
        
    Returns:
        str: Cleaned text.
    """
    # 1. Remove HTML/XML tags (handles malformed tags)
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # 2. Dynamic emotion symbol handling
    emotion_map = {
        **EMOTICONS_EMO,  # Auto-loads common emojis/emoticons (e.g., <3 → 'love')
        **{k: f' {v} ' for k, v in (custom_emotion_map or {}).items()}  # User overrides
    }
    
    # Replace symbols/emojis (regex-safe)
    for symbol, replacement in emotion_map.items():
        text = re.sub(re.escape(symbol), replacement, text)
    
    # 3. Preserve or strip punctuation
    punct_pattern = r'[^\w\s]' if not preserve_punct else r'[^\w\s!?]'
    text = re.sub(punct_pattern, ' ', text)
    
    # 4. Normalize whitespace and lowercase
    text = re.sub(r'\s+', ' ', text).strip().lower()
    
    # 5. Advanced tokenization (optional)
    doc = nlp(text)
    tokens = [
        token.lemma_ if lemmatize and token.lemma_ != '-PRON-' else token.text
        for token in doc
        if not (remove_stopwords and token.is_stop)
    ]
    
    return ' '.join(tokens)

# Example usage with custom symbols
custom_map = {
    '!!!': ' high_intensity ',  # Custom intensity marker
    '?': ' uncertainty '       # Label questions
}

df['cleaned_text'] = df['text'].apply(
    preprocess_text, 
    custom_emotion_map=custom_map,
    preserve_punct=True
)