<a href="https://colab.research.google.com/github/Shelly10-10/flood_prediction/blob/main/disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📌 Step 1: Import libraries and upload the dataset
import pandas as pd
from google.colab import files
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

uploaded = files.upload()

# 📌 Step 2: Load the dataset
df = pd.read_csv('merged_dataset_final_renamed.csv', encoding='ISO-8859-1')  # Try changing encoding if needed

# 📌 Step 3: Basic Info
print("Shape of dataset:", df.shape)
print("Column names:", df.columns.tolist())
print("\nData Types:\n", df.dtypes)
print("\nMissing values per column:\n", df.isnull().sum())
print("\nFirst few rows:\n", df.head())

# 📌 Step 4: Handle missing values (you can modify this)
df.fillna(method='ffill', inplace=True)  # Forward fill for missing values

# 📌 Step 5: Check duplicates but DON'T DROP
duplicate_rows = df.duplicated().sum()
print(f"\nNumber of duplicate rows (not dropped): {duplicate_rows}")

# 📌 Step 6: Convert datetime column (if present)
if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'], errors='coerce')

# 📌 Step 7: Normalize numeric features
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# 📌 Step 8: Encode categorical columns (if any)
for col in df.select_dtypes(include='object').columns:
    if col != 'time':  # Skip datetime
        df[col] = df[col].astype('category').cat.codes

# 📌 Step 9: Split into features and label (Replace 'target_column' with actual target)
# Example: if you're predicting 'flood_occurred' column
# df['flood_occurred'] = df['flood_occurred'].astype(int)
# X = df.drop('flood_occurred', axis=1)
# y = df['flood_occurred']

# Temporary placeholder to avoid crash
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 📌 Step 10: Save cleaned data
df.to_csv('cleaned_dataset.csv', index=False)
print("\n✅ Data preprocessing complete. Cleaned data saved as 'cleaned_dataset.csv'")
from google.colab import files
files.download('cleaned_dataset.csv')



Saving merged_dataset_final_renamed.csv to merged_dataset_final_renamed.csv
Shape of dataset: (1048575, 17)
Column names: ['valid_time', 'latitude', 'longitude', 'number', 'Experiment Version', 'Total Precipitation (mm)', 'Evaporation (mm)', 'Surface Runoff (mm)', 'Mean Wave Direction (°)', '10m U-Component of Wind (m/s)', '10m V-Component of Wind (m/s)', '2m Temperature', 'Mean Sea Level (°C)', 'Sea Surface Temperature (°C)', 'Soil Temperature Level 1 (°C)', 'Soil Type', 'Soil Water Volume Level 1 (m³/m³)']

Data Types:
 valid_time                            object
latitude                             float64
longitude                            float64
number                                 int64
Experiment Version                     int64
Total Precipitation (mm)             float64
Evaporation (mm)                     float64
Surface Runoff (mm)                  float64
Mean Wave Direction (°)              float64
10m U-Component of Wind (m/s)        float64
10m V-Component of Win

  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values



Number of duplicate rows (not dropped): 0

✅ Data preprocessing complete. Cleaned data saved as 'cleaned_dataset.csv'


In [None]:
from google.colab import files
files.download('cleaned_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import pandas as pd
df = pd.read_csv('merged_dataset_final_renamed.csv', encoding='ISO-8859-1')


In [15]:
df['Total Precipitation (mm)'].describe()


Unnamed: 0,Total Precipitation (mm)
count,26491.0
mean,0.229548
std,0.84615
min,0.0
25%,0.0
50%,0.0
75%,0.0477
max,27.039528


In [34]:
df['valid_time'] = pd.to_datetime(df['valid_time'], dayfirst=True)


In [35]:
df_daily = df.groupby([
    df['valid_time'].dt.date,
    'latitude',
    'longitude'
]).agg({
    'Total Precipitation (mm)': 'sum',
    'Evaporation (mm)': 'mean',
    'Surface Runoff (mm)': 'sum',
    'Mean Wave Direction (°)': 'mean',
    '10m U-Component of Wind (m/s)': 'mean',
    '10m V-Component of Wind (m/s)': 'mean',
    '2m Temperature': 'mean',
    'Mean Sea Level (°C)': 'mean',
    'Sea Surface Temperature (°C)': 'mean',
    'Soil Temperature Level 1 (°C)': 'mean',
    'Soil Type': 'first',  # assuming fixed per location
    'Soil Water Volume Level 1 (m³/m³)': 'mean',
    'number': 'first',
    'Experiment Version': 'first'
}).reset_index()


In [53]:
df_daily.fillna({
    'Total Precipitation (mm)': 0,
    'Evaporation (mm)': df_daily['Evaporation (mm)'].mean(),
    'Surface Runoff (mm)': 0,
    'Mean Wave Direction (°)': df_daily['Mean Wave Direction (°)'].mean(),
    '10m U-Component of Wind (m/s)': df_daily['10m U-Component of Wind (m/s)'].mean(),
    '10m V-Component of Wind (m/s)': df_daily['10m V-Component of Wind (m/s)'].mean(),
    '2m Temperature': df_daily['2m Temperature'].mean(),
    'Mean Sea Level (°C)': df_daily['Mean Sea Level (°C)'].mean(),
    'Sea Surface Temperature (°C)': df_daily['Sea Surface Temperature (°C)'].mean(),
    'Soil Temperature Level 1 (°C)': df_daily['Soil Temperature Level 1 (°C)'].mean(),
    'Soil Type': df_daily['Soil Type'].mode()[0],
    'Soil Water Volume Level 1 (m³/m³)': df_daily['Soil Water Volume Level 1 (m³/m³)'].mean()
}, inplace=True)


In [54]:
precip_high = df_daily['Total Precipitation (mm)'] > df['Total Precipitation (mm)'].quantile(0.75)
runoff_high = df_daily['Surface Runoff (mm)'] > df['Surface Runoff (mm)'].quantile(0.75)
evap_low = df_daily['Evaporation (mm)'] < df['Evaporation (mm)'].quantile(0.25)
soil_saturated = df_daily['Soil Water Volume Level 1 (m³/m³)'] > 0.22
bad_soil = df_daily['Soil Type'].isin([3, 4, 5])  # modify based on your soil legend
wind_active = (abs(df_daily['10m U-Component of Wind (m/s)']) > 2) | (abs(df['10m V-Component of Wind (m/s)']) > 2)
sea_warm = df_daily['Sea Surface Temperature (°C)'] > df['Sea Surface Temperature (°C)'].quantile(0.75)



In [55]:
df_daily['Flood_Occurred'] = (
    (
        precip_high & runoff_high & soil_saturated
    ) |
    (
        precip_high & evap_low & bad_soil
    ) |
    (
        runoff_high & wind_active & sea_warm
    )
).astype(int)


In [56]:
print(df_daily['Flood_Occurred'].value_counts())

Flood_Occurred
0    10382
1     3059
Name: count, dtype: int64


In [57]:
df_daily.head(10)

Unnamed: 0,valid_time,latitude,longitude,Total Precipitation (mm),Evaporation (mm),Surface Runoff (mm),Mean Wave Direction (°),10m U-Component of Wind (m/s),10m V-Component of Wind (m/s),2m Temperature,Mean Sea Level (°C),Sea Surface Temperature (°C),Soil Temperature Level 1 (°C),Soil Type,Soil Water Volume Level 1 (m³/m³),number,Experiment Version,Flood_Occurred
0,2019-06-01,17.0,80.0,0.754356,-7.7e-05,0.00381,191.530016,-0.432724,0.391968,306.9552,100267.03,302.927927,310.63391,3.0,0.141076,0,1,0
1,2019-06-01,17.0,80.25,0.0,-8e-05,0.0,191.530016,-0.372177,0.942261,307.1183,100264.845,302.927927,310.50305,3.0,0.155663,0,1,0
2,2019-06-01,17.0,80.5,0.00143,-8.3e-05,0.0,191.530016,-0.260361,1.416382,306.94153,100271.28,302.927927,309.77747,4.0,0.194566,0,1,0
3,2019-06-01,17.0,80.75,0.00572,-7.4e-05,0.0,191.530016,-0.365341,1.589722,306.19055,100287.905,302.927927,308.5802,4.0,0.264169,0,1,0
4,2019-06-01,17.0,81.0,0.0,-8e-05,0.0,191.530016,-0.611435,1.629272,306.07239,100296.03,302.927927,307.95227,4.0,0.270448,0,1,0
5,2019-06-01,17.0,81.25,0.00954,-8.4e-05,0.0,191.530016,-0.615829,1.21228,306.314575,100289.28,302.927927,308.391725,4.0,0.252602,0,1,0
6,2019-06-01,17.0,81.5,0.00572,-8.5e-05,0.0,191.530016,-0.370224,1.239136,306.44445,100279.78,302.927927,308.63879,4.0,0.258607,0,1,0
7,2019-06-01,17.0,81.75,0.00191,-8.6e-05,0.0,191.530016,-0.34581,1.630249,307.045035,100276.345,302.927927,309.271605,3.0,0.170922,0,1,0
8,2019-06-01,17.0,82.0,0.000477,-6.3e-05,0.0,191.530016,-0.78331,2.411499,306.698365,100291.405,302.927927,309.20032,4.0,0.199319,0,1,0
9,2019-06-01,17.0,82.25,0.00381,-4.2e-05,0.0,191.530016,-1.113388,3.781616,304.482545,100312.22,302.927927,305.94641,4.0,0.143288,0,1,0


In [60]:
df_daily.to_csv('labelled_dataset_flood1.csv', index=False)


In [61]:
from google.colab import files
files.download('labelled_dataset_flood1.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>