In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
df = pd.read_csv('event_level_data_dirty.csv')
df.head()

Unnamed: 0,record_id,timestamp,day_of_week,hour_of_day,is_weekend,is_public_holiday,lat,long,weather,temperature,humidity,location_id,location_name,timezone_info
0,1,2025-01-01 07:01:00,2.0,7.0,False,True,1.280884,{}103.869885{},rainy,24.8,93.5,0,SEMBAWANG EATING HOUSE,
1,2,2025-01-01 07:16:00,2.0,7.0,\r\nFalse\r\n,\nTrue\n,1.280884,103.869885,rainy,60.0,91.6,0,SEMBAWANG EATING HOUSE,
2,3,,2.0,7.0,False,,1.280884,103.869885,cloudy,()23.7,86.9,0,SEMBAWANG EATING HOUSE,
3,4,2025-01-01 07:38:00,2.0,7.0,@False,True,1.280884,103.869885,cloudy,24.2,85.7,0,SEMBAWANG EATING HOUSE,
4,5,2025-01-01 07:39:00,,7.0,\r\nFalse\r\n,True,\r\n1.280884\r\n,103.869885,rainy,24.7,91().2,0,SEMBAWANG EATING HOUSE,


In [27]:
def clean_data(input_path, output_path):
    # Load data
    df = pd.read_csv(input_path)

    # Drop duplicates and unnecessary columns
    df = df.drop_duplicates(subset=['record_id'], keep='first')
    if 'timezone_info' in df.columns:
        df = df.drop(columns=['timezone_info'])

    # Clean String Noise
    def basic_clean(text):
        if pd.isna(text): return text
        text = str(text).strip().lower()
        text = re.sub(r'[{}()\[\]@*&|°\\ø$%//#/]', '', text)
        text = text.replace('á', 'a').replace('ë', 'e').replace('ï', 'i').replace('ü', 'u').replace('û', 'ou')
        return text

    for col in ['weather', 'is_weekend', 'is_public_holiday', 'location_name']:
        df[col] = df[col].apply(basic_clean)

    def clean_weather(value):
        if pd.isna(value):
            return 'unknown'
        elif 'night_clear' in value:
            return 'night_clear'
        elif 'rain' in value:
            return 'rainy'
        elif 'cloud' in value:
            return 'cloudy'
        elif 'clear' in value:
            return 'clear'
        else:
            return 'other'
    df['weather'] = df['weather'].apply(clean_weather)

    # Handle Timestamps and Date Features
    # Fill missing timestamps based on sequential record_ids
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['timestamp'] = df['timestamp'].interpolate(method='linear')

    # Recalculate time features to ensure consistency
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['is_weekend'] = df['day_of_week'].isin([5, 6])

    # Clean Numeric Columns (Lat, Long, Temp, Humidity)
    def clean_numeric(val):
        if pd.isna(val): return np.nan
        # Extract digits, dots, and minus signs only
        clean_val = re.sub(r'[^0-9.\-]', '', str(val))
        try:
            return float(clean_val)
        except:
            return np.nan

    num_cols = ['lat', 'long', 'temperature', 'humidity']
    for col in num_cols:
        df[col] = df[col].apply(clean_numeric)

    # Impute Location Data using Location ID
    # Use location_id to fix incorrect Lat/Long/Name (e.g., -100.0 or 999.0)
    for col in ['lat', 'long', 'location_name']:
        # Create a mapping of location_id to the most common (mode) valid value
        mapping = df[df[col].notna() & (df[col] != 0) & (df[col] != -100) & (df[col] != 999)] \
                    .groupby('location_id')[col].agg(lambda x: x.value_counts().index[0])
        df[col] = df['location_id'].map(mapping)

    # Handle Outliers in Environment Data
    # Cap temperatures to reasonable ranges (e.g., 20-40) or interpolate
    df.loc[(df['temperature'] < 15) | (df['temperature'] > 45), 'temperature'] = np.nan
    df['temperature'] = df.groupby('location_id')['temperature'].transform(lambda x: x.interpolate().ffill().bfill())
    df['humidity'] = df.groupby('location_id')['humidity'].transform(lambda x: x.interpolate().ffill().bfill())

    # Final Formatting
    df['is_public_holiday'] = df['is_public_holiday'].map({'true': True, 'false': False}).fillna(False).astype(bool)
    df['location_id'] = df['location_id'].astype(int)

    # Sort and Save
    df = df.sort_values('record_id')
    df.to_csv(output_path, index=False)
    return df

In [28]:
# Run the cleaning
clean_df_result = clean_data('event_level_data_dirty.csv', 'restored_clean_data.csv')

  df['is_public_holiday'] = df['is_public_holiday'].map({'true': True, 'false': False}).fillna(False).astype(bool)


In [31]:
clean_df_result.head()

Unnamed: 0,record_id,timestamp,day_of_week,hour_of_day,is_weekend,is_public_holiday,lat,long,weather,temperature,humidity,location_id,location_name
0,1,2025-01-01 07:01:00,2,7,0,1,1.280884,103.869885,rainy,24.8,93.5,0,sembawang eating house
1,2,2025-01-01 07:16:00,2,7,0,1,1.280884,103.869885,rainy,24.25,91.6,0,sembawang eating house
2,3,2025-01-01 07:27:00,2,7,0,0,1.280884,103.869885,cloudy,23.7,86.9,0,sembawang eating house
3,4,2025-01-01 07:38:00,2,7,0,1,1.280884,103.869885,cloudy,24.2,85.7,0,sembawang eating house
4,5,2025-01-01 07:39:00,2,7,0,1,1.280884,103.869885,rainy,24.7,91.2,0,sembawang eating house


In [21]:
clean_df_result['is_weekend'].dtype
clean_df_result['is_public_holiday'].dtype

dtype('bool')

In [30]:
# Changing True/False values to numeric 1/0
clean_df_result['is_weekend'] = clean_df_result['is_weekend'].astype(int)
clean_df_result['is_public_holiday'] = clean_df_result['is_public_holiday'].astype(int)

In [32]:
clean_df_result['weather'].unique()

array(['rainy', 'cloudy', 'unknown', 'other', 'clear', 'night_clear'],
      dtype=object)

In [35]:
# Drop unkown(nan) values from weather
clean_df_result = clean_df_result[clean_df_result['weather'] != 'unknown']
clean_df_result['weather'].unique()

array(['rainy', 'cloudy', 'other', 'clear', 'night_clear'], dtype=object)