In [21]:
import pandas as pd
import re

def preprocess_track_a(df_dataset):
    """
    # Function takes i/p - dataframe
    Preprocesses the Track-A dataset:
    - Lowercases text
    - Removes punctuation (except intra-word apostrophes)
    - Removes extra whitespace
    - Ensures label columns are integers (0/1)
    Returns a cleaned DataFrame.
    """
    # label columns as per given dataset
    label_cols = ['anger', 'fear', 'joy', 'sadness', 'surprise']
    
    # Lowercase and clean text
    def clean_text(text):
        if pd.isnull(text):
            return ""
        text = text.lower()
        # to retain alphanum
        text = re.sub(r"[^a-z0-9\s']", ' ', text)
        # to normalize whitespace
        text = re.sub(r"\s+", ' ', text)
        return text.strip()
    
    # Initially, cleaning only the 'text'
    df_dataset['text'] = df_dataset['text'].astype(str).apply(clean_text)
    
    # then, to ensure labels are integers (0/1)
    for col in label_cols:
        df_dataset[col] = df_dataset[col].fillna(0).astype(int)
    
    
    return df_dataset



                           id                   text  anger  fear  joy  \
1303  eng_train_track_a_01304  &lt;/crazy-nutter&gt;      0     1    0   

      sadness  surprise  
1303        0         1  


In [22]:
def main():
    # location to the track-a.csv
    df_dataset = pd.read_csv('track-a.csv')
    df_clean = preprocess_track_a(df_dataset)
    
    # To view the dataset after preprocessing
  
    # print(df_clean.head())  # uncomment

    """
    # may be here, one can invoke, further functions
    # after defining them
    """

if __name__ == "__main__":
    main()




                        id                                               text  \
0  eng_train_track_a_00001                         colorado middle of nowhere   
1  eng_train_track_a_00002  this involved swimming a pretty large lake tha...   
2  eng_train_track_a_00003         it was one of my most shameful experiences   
3  eng_train_track_a_00004  after all i had vegetables coming out my ears ...   
4  eng_train_track_a_00005                         then the screaming started   

   anger  fear  joy  sadness  surprise  
0      0     1    0        0         1  
1      0     1    0        0         0  
2      0     1    0        1         0  
3      0     0    0        0         0  
4      0     1    0        1         1  
                           id                text  anger  fear  joy  sadness  \
1303  eng_train_track_a_01304  lt crazy nutter gt      0     1    0        0   

      surprise  
1303         1  
