In [3]:
#  Step 1: Import libraries
import pandas as pd
import os


In [4]:
# Step 2: Define paths
RAW_PATH = 'data/raw/'
OUTPUT_PATH = 'data/processed/combined_disaster_data.csv'


In [5]:
#  Step 3: Load and prepare Flood dataset
flood = pd.read_csv(os.path.join(RAW_PATH, 'flood.csv'))
flood['FloodOccurred'] = flood['FloodProbability'].apply(lambda x: 1 if x > 0.5 else 0)
flood = flood.drop(columns=['FloodProbability'])
flood['disaster_type'] = flood['FloodOccurred'].apply(lambda x: 'Flood' if x == 1 else 'NoDisaster')
flood = flood.drop(columns=['FloodOccurred'])


In [7]:
# Step 4: Load and prepare Earthquake dataset
earthquake = pd.read_csv(os.path.join(RAW_PATH, 'earthquake.csv'))
earthquake = earthquake[['Latitude', 'Longitude', 'Depth', 'Magnitude']]
earthquake['disaster_type'] = 'Earthquake'


In [8]:
# Step 5: Load Landslide dataset
landslide = pd.read_csv(os.path.join(RAW_PATH, 'landslide.csv'))
landslide['disaster_type'] = 'Landslide'


In [9]:
#  Step 6: Load Cyclone dataset
cyclone = pd.read_csv(os.path.join(RAW_PATH, 'cyclone.csv'))
cyclone['disaster_type'] = cyclone['Cyclone'].apply(lambda x: 'Cyclone' if x == 1 else 'NoDisaster')
cyclone = cyclone.drop(columns=['Cyclone'])


In [10]:
#  Step 7: Standardize all column names
flood.columns = flood.columns.str.lower()
earthquake.columns = earthquake.columns.str.lower()
landslide.columns = landslide.columns.str.lower()
cyclone.columns = cyclone.columns.str.lower()


In [11]:
#  Step 8: Pad missing columns
flood_cols = set(flood.columns)
earthquake_cols = set(earthquake.columns)
landslide_cols = set(landslide.columns)
cyclone_cols = set(cyclone.columns)
all_columns = list(flood_cols | earthquake_cols | landslide_cols | cyclone_cols)

def pad_columns(df):
    for col in all_columns:
        if col not in df.columns:
            df[col] = None
    return df[all_columns]

flood = pad_columns(flood)
earthquake = pad_columns(earthquake)
landslide = pad_columns(landslide)
cyclone = pad_columns(cyclone)


In [12]:
#  Step 9: Combine and save
combined = pd.concat([flood, earthquake, landslide, cyclone], ignore_index=True)
os.makedirs('data/processed', exist_ok=True)
combined.to_csv(OUTPUT_PATH, index=False)
print(f" Combined dataset saved at: {OUTPUT_PATH}")


  combined = pd.concat([flood, earthquake, landslide, cyclone], ignore_index=True)


 Combined dataset saved at: data/processed/combined_disaster_data.csv


In [13]:
# Remove completely empty columns across all DataFrames
def drop_all_na_cols(df):
    return df.dropna(axis=1, how='all')

flood = drop_all_na_cols(flood)
earthquake = drop_all_na_cols(earthquake)
landslide = drop_all_na_cols(landslide)
cyclone = drop_all_na_cols(cyclone)


In [14]:
combined = pd.concat([flood, earthquake, landslide, cyclone], ignore_index=True)


In [4]:
import pandas as pd
import numpy as np

# Load your combined dataset
df = pd.read_csv("data/processed/combined_disaster_data.csv")

# Define realistic value ranges for each relevant column
realistic_ranges = {
    'proximity_to_water': (0, 10),             # in km
    'earthquake_activity': (0, 10),            # risk score
    'soil_type_sand': (0, 1),                  # binary
    'soil_saturation': (0, 100),               # percentage
    'rainfall_mm': (0, 500),                   # mm
    'soil_type_silt': (0, 1),                  # binary
    'proximity_to_coastline': (0, 50),         # km
    'latitude': (-90, 90),                     # degrees
    'pre_existing_disturbance': (0, 10),       # score
    'humidity': (0, 100),                      # percentage
    'wind_shear': (0, 30),                     # m/s
    'vorticity': (-5, 5),                      # arbitrary scale
    'ocean_depth': (0, 11000),                 # meters
    'sea_surface_temperature': (0, 35),        # °C
    'slope_angle': (0, 90),                    # degrees
    'longitude': (-180, 180),                  # degrees
    'soil_type_gravel': (0, 1),                # binary
    'vegetation_cover': (0, 10),               # index
    'landslide': (0, 1),                       # binary
    'depth': (0, 700),                         # km
    'magnitude': (0, 10)                       # Richter scale
}

# Fill in empty or missing columns
for col, (min_val, max_val) in realistic_ranges.items():
    if col in df.columns:
        is_empty = df[col].isna() | (df[col].astype(str).str.strip() == '')
        if is_empty.all():
            df[col] = np.random.uniform(min_val, max_val, size=len(df)).round(2)
        elif is_empty.any():
            df.loc[is_empty, col] = np.random.uniform(min_val, max_val, size=is_empty.sum()).round(2)
    else:
        df[col] = np.random.uniform(min_val, max_val, size=len(df)).round(2)

# Save cleaned dataset (overwrite existing)
df.to_csv("data/processed/combined_disaster_data.csv", index=False)

print(" Cleaned dataset saved with realistic values at: data/processed/combined_disaster_data.csv")


 Cleaned dataset saved with realistic values at: data/processed/combined_disaster_data.csv


In [2]:
import os
import pandas as pd

# Dynamic path handling
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "combined_disaster_data.csv")
OUTPUT_PATH = os.path.join(BASE_DIR, "data", "processed", "cleaned_disaster_data.csv")

# Load dataset
print(f"Loading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# -------------------------------
# 1. Create binary disaster columns
# -------------------------------

# Flood -> based on "disaster_type" column
df['is_flood'] = df['disaster_type'].apply(lambda x: 1 if str(x).lower() == "flood" else 0)

# Landslide -> assuming column 'landslide' exists with "Yes"/"No"
if 'landslide' in df.columns:
    df['is_landslide'] = df['landslide'].apply(lambda x: 1 if str(x).lower() == "yes" else 0)
else:
    df['is_landslide'] = 0  # default if not present

# Earthquake -> assuming column 'earthquake' exists with "Yes"/"No"
if 'earthquake' in df.columns:
    df['is_earthquake'] = df['earthquake'].apply(lambda x: 1 if str(x).lower() == "yes" else 0)
else:
    df['is_earthquake'] = 0

# Cyclone -> no direct column, so infer from windspeed & pressure
if 'windspeed' in df.columns and 'pressure' in df.columns:
    df['is_cyclone'] = df.apply(
        lambda row: 1 if (row['windspeed'] >= 120 and row['pressure'] < 990) else 0,
        axis=1
    )
else:
    df['is_cyclone'] = 0

# -------------------------------
# 2. Clean and preprocess
# -------------------------------

# Drop duplicates
df = df.drop_duplicates()

# Fill missing numeric values with column mean
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col] = df[col].fillna(df[col].mean())

# Fill missing categorical values with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# -------------------------------
# 3. Save cleaned dataset
# -------------------------------
df.to_csv(OUTPUT_PATH, index=False)
print(f"Cleaned dataset saved to: {OUTPUT_PATH}")

print("Preview of cleaned dataset:")
print(df.head())



NameError: name '__file__' is not defined