Kaggle Competition
UNT HydroInsight 2024

Group: 
Rendi King
Namita Victor
Lakshmi Triveni Muthyala

In [1]:
#Libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
#Load the competition files & review their structure
train_file = 'floodprediction-comp.csv'
test_file = 'floodprediction-submission.csv'

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() appends a stray "None" line.
print("Training Data Info:")
train_data.info()
print("\nTest Data Info:")
test_data.info()


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   MonsoonIntensity                 35000 non-null  int64  
 1   TopographyDrainage               35000 non-null  int64  
 2   RiverManagement                  35000 non-null  int64  
 3   Deforestation                    35000 non-null  int64  
 4   Urbanization                     35000 non-null  int64  
 5   ClimateChange                    35000 non-null  int64  
 6   DamsQuality                      35000 non-null  int64  
 7   Siltation                        35000 non-null  int64  
 8   AgriculturalPractices            35000 non-null  int64  
 9   Encroachments                    35000 non-null  int64  
 10  IneffectiveDisasterPreparedness  35000 non-null  int64  
 11  DrainageSystems                  35000 non-null  int64  
 12

In [3]:
#Numeric vs non-numeric columns
# The target is excluded from the numeric feature list so it is never imputed.
numeric_columns = train_data.select_dtypes(include=['number']).columns.drop('FloodProbability')
non_numeric_columns = train_data.select_dtypes(exclude=['number']).columns

#Replace missing values in numeric columns with the median
# The medians are computed once from the training data and reused for the test
# data, so both frames are filled with the same values (the scaler later in this
# notebook is likewise fitted on training data only).
numeric_medians = train_data[numeric_columns].median()
train_data[numeric_columns] = train_data[numeric_columns].fillna(numeric_medians)
test_data[numeric_columns] = test_data[numeric_columns].fillna(numeric_medians)

#Replace missing values in non-numeric columns with the mode
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on train_data[column]: that chained in-place form acts on
# a temporary object and does not update the frame under pandas Copy-on-Write.
for column in non_numeric_columns:
    column_modes = train_data[column].mode()
    if column_modes.empty:
        # Every training value is missing, so there is no mode to fill with.
        continue
    most_frequent = column_modes.iloc[0]
    train_data[column] = train_data[column].fillna(most_frequent)
    if column in test_data.columns:
        test_data[column] = test_data[column].fillna(most_frequent)


In [4]:
#One-hot encode the non-numeric columns
# Both frames are encoded against the category levels observed in the training
# data. Encoding each frame independently with drop_first=True would drop
# whichever level sorts first in *that* frame, so train and test could end up
# with different reference levels and mis-encoded rows after alignment.
for column in non_numeric_columns:
    train_levels = pd.Categorical(train_data[column]).categories
    train_data[column] = pd.Categorical(train_data[column], categories=train_levels)
    if column in test_data.columns:
        # Test values never seen in training become missing and encode as all zeros.
        test_data[column] = pd.Categorical(test_data[column], categories=train_levels)

# Only encode columns the test frame actually has; a column that is absent is
# filled in by the alignment below.
test_non_numeric = [column for column in non_numeric_columns if column in test_data.columns]

train_data = pd.get_dummies(train_data, columns=list(non_numeric_columns), drop_first=True)
test_data = pd.get_dummies(test_data, columns=test_non_numeric, drop_first=True)

# Give the test frame exactly the training columns: missing ones are added as 0,
# extra ones are dropped.
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)


In [8]:
#Split the training frame into features (X) and target (y)
target_column = 'FloodProbability'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

#Keep every test column except the target (a no-op when the target is absent)
test_data = test_data.loc[:, test_data.columns != target_column]

#Give the test set exactly the training feature columns; absent ones are filled with 0
X, test_data = X.align(test_data, axis=1, join='left', fill_value=0)

#Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

#Standardize all features; the scaler is fitted on the training split only
#and then reused unchanged for the validation and test features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, test_data_scaled = scaler.transform(X_val), scaler.transform(test_data)

In [None]:
#Fit a Random Forest regressor on the scaled training split
# All hyperparameters are left at their defaults; only the seed is pinned so
# the fitted forest is reproducible between runs.
forest_params = {'random_state': 42}
rf = RandomForestRegressor(**forest_params)
rf.fit(X=X_train, y=y_train)

In [None]:
#Score the forest on the held-out validation split
# RMSE is the square root of the mean squared error between the validation
# targets and the forest's predictions for the validation features.
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Random Forest RMSE: {rmse}")

In [None]:
# Predict on test data
# test_data_scaled comes from the same scaler that was fitted on the training
# split, so the forest receives test features on the scale it was trained on.
test_predictions = rf.predict(test_data_scaled)