In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# -----------------------------
# 1. Safe log transform
# -----------------------------
def safe_log1p(x):
    """Return ``log(1 + x)`` element-wise after flooring negative entries at 0.

    The input is first copied into a NumPy array, so the caller's data is
    never modified and the result is a NumPy value rather than, for example,
    a pandas Series. NaN entries pass through unchanged.
    """
    non_negative = np.clip(np.array(x), 0, None)  # lower bound 0, no upper bound
    return np.log1p(non_negative)

# -----------------------------
# 2. Load data
# -----------------------------
# Read the training set, then discard every row that has a missing or
# infinite value in any column.
df = pd.read_csv(r'./../Data Given for Challenge/data/train.csv')
df = df.replace([np.inf, -np.inf], np.nan)  # fold +/-inf into NaN so dropna catches them
df.dropna(inplace=True)

# -----------------------------
# 3. Define feature groups
# -----------------------------
# Each group below names the columns that receive one scaling strategy in the
# normalization step. Names are listed as whitespace-separated text and split
# into plain lists of strings.

# Standardized as-is (StandardScaler).
zscore_cols = """
    t2m d2m sh2 t pres pres_1 pres_2 sp pt
    gh gh_1 gh_2 gh_3 gh_4 gh_5 gh_6 gh_7
    u v u10 v10 gust max_10si vucsh vvcsh
    ustm vstm wz wz_1 lftx lftx4 orog vis
    blh fsr gflux veril tcolw tcoli plpl mstav
    sdwe sdwe_1 layth bgrun ssrun mslma pwat
    refc refd refd_1
""".split()

# Passed through safe_log1p first, then standardized.
log_cols = """
    cape cape_1 cin sdswrf sdlwrf suswrf sulwrf
    prate tp crain cfrzr cicep csnow
""".split()

# Scaled with RobustScaler.
robust_cols = "hail hail_1 hail_2 ltng".split()

# Scaled to a min-max range.
minmax_cols = """
    r2 r r_1 tcc tcc_1 hcc mcc lcc
    cpofp aod veg lai
""".split()

# Columns that are never scaled. This name is reassigned without 'timestamp'
# after the datetime features have been extracted.
non_feature_cols = "out location timestamp".split()

# -----------------------------
# 4. Extract datetime features (only if timestamp exists)
# -----------------------------
# Derive year/month/day/hour from the raw timestamp, then drop the raw column.
# The column is parsed once and the result reused: converting the whole column
# with pd.to_datetime is the costly step, and all four fields come from the
# same parsed values.
if 'timestamp' in df.columns:
    parsed_ts = pd.to_datetime(df['timestamp'])
    df['year'] = parsed_ts.dt.year
    df['month'] = parsed_ts.dt.month
    df['day'] = parsed_ts.dt.day
    df['hour'] = parsed_ts.dt.hour
    df.drop(columns=['timestamp'], inplace=True)

# After this step only these two columns are excluded from scaling.
non_feature_cols = ['out','location']  # drop timestamp here


# -----------------------------
# 5. Normalize by group
# -----------------------------
# Work on a copy so `df` keeps the unscaled values. Every column is fitted and
# transformed on its own, so no column's statistics influence another.
# NOTE(review): the fitted scalers are not retained (only `scaler`, holding the
# last z-score fit, survives), so this exact transform cannot be re-applied to
# other data later — persist them if that is needed.
df_norm = df.copy()

# Z-score
scaler = StandardScaler()
for col in zscore_cols:
    if col in df_norm.columns:
        # fit_transform refits on each call, so sharing one instance is safe.
        df_norm[col] = scaler.fit_transform(df_norm[[col]])

# Log + z-score
# NOTE(review): safe_log1p clips negatives to 0 before the log, so a column
# whose values are all <= 0 collapses to a constant here. Presumably `cin`
# can be non-positive — TODO confirm against the data.
for col in log_cols:
    if col in df_norm.columns:
        df_norm[col] = safe_log1p(df_norm[col])
        df_norm[col] = StandardScaler().fit_transform(df_norm[[col]])

# Robust scaling
for col in robust_cols:
    if col in df_norm.columns:
        df_norm[col] = RobustScaler().fit_transform(df_norm[[col]])

# Min-max scaling (specified)
for col in minmax_cols:
    if col in df_norm.columns:
        df_norm[col] = MinMaxScaler().fit_transform(df_norm[[col]])

# Min-max scaling (rest): every column not claimed by a group above and not
# listed in non_feature_cols.
assigned = set(zscore_cols + log_cols + robust_cols + minmax_cols + non_feature_cols)
rest_cols = [c for c in df_norm.columns if c not in assigned]
for col in rest_cols:
    # Scale numeric columns only. Checking `dtype != 'object'` alone would let
    # other non-numeric dtypes (e.g. pandas string dtype) reach MinMaxScaler,
    # which cannot convert text to float.
    if pd.api.types.is_numeric_dtype(df_norm[col]):
        df_norm[col] = MinMaxScaler().fit_transform(df_norm[[col]])

# -----------------------------
# 6. Save result
# -----------------------------
# Write the scaled frame into the same directory as the training data (row
# index omitted) and report the resulting (rows, columns) shape.
df_norm.to_csv(
    "./../Data Given for Challenge/data/normalized_features.csv",
    index=False,
)
print(
    "✅ Normalized file saved as normalized_features.csv with same shape:",
    df_norm.shape,
)


✅ Normalized file saved as normalized_features.csv with same shape: (179363, 112)
