In [3]:
import os
import pandas as pd

# Dynamic path handling
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "combined_disaster_data.csv")
OUTPUT_PATH = os.path.join(BASE_DIR, "data", "processed", "cleaned_disaster_data.csv")

# Load dataset
print(f"Loading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# -------------------------------
# 1. Create binary disaster columns
# -------------------------------

# Flood -> based on "disaster_type" column
df['is_flood'] = df['disaster_type'].apply(lambda x: 1 if str(x).lower() == "flood" else 0)

# Landslide -> assuming column 'landslide' exists with "Yes"/"No"
if 'landslide' in df.columns:
    df['is_landslide'] = df['landslide'].apply(lambda x: 1 if str(x).lower() == "yes" else 0)
else:
    df['is_landslide'] = 0  # default if not present

# Earthquake -> assuming column 'earthquake' exists with "Yes"/"No"
if 'earthquake' in df.columns:
    df['is_earthquake'] = df['earthquake'].apply(lambda x: 1 if str(x).lower() == "yes" else 0)
else:
    df['is_earthquake'] = 0

# Cyclone -> no direct column, so infer from windspeed & pressure
if 'windspeed' in df.columns and 'pressure' in df.columns:
    df['is_cyclone'] = df.apply(
        lambda row: 1 if (row['windspeed'] >= 120 and row['pressure'] < 990) else 0,
        axis=1
    )
else:
    df['is_cyclone'] = 0

# -------------------------------
# 2. Clean and preprocess
# -------------------------------

# Drop duplicates
df = df.drop_duplicates()

# Fill missing numeric values with column mean
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].mean())

# Fill missing categorical values with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# -------------------------------
# 3. Save cleaned dataset
# -------------------------------
df.to_csv(OUTPUT_PATH, index=False)
print(f"Cleaned dataset saved to: {OUTPUT_PATH}")

print("Preview of cleaned dataset:")
print(df.head())


NameError: name '__file__' is not defined