# Flood Prediction Baseline

In [45]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [56]:
# Locations of the competition files (train split, test split, and the
# submission template) on the Kaggle input mount.
train_path, test_path, submission_path = (
    '/kaggle/input/playground-series-s4e5/train.csv',
    '/kaggle/input/playground-series-s4e5/test.csv',
    '/kaggle/input/playground-series-s4e5/sample_submission.csv',
)

In [26]:
def read_data(path, index_col=None):
    """Read a CSV into a DataFrame, optionally moving column(s) into the index.

    Parameters
    ----------
    path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``.
    index_col : label, list of labels or array-like, optional
        Forwarded to ``DataFrame.set_index``. When ``None`` (the default) the
        frame is returned with the index ``pd.read_csv`` produced.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, re-indexed when ``index_col`` is given.
    """
    df = pd.read_csv(path)
    # Identity test rather than `!= None`: an equality comparison is evaluated
    # element-wise for array arguments, which makes the `if` itself raise.
    if index_col is not None:
        df = df.set_index(index_col)
    return df

# Load the train split first, then the test split, each indexed by its `id` column.
train_df, test_df = (
    read_data(csv_path, 'id') for csv_path in (train_path, test_path)
)

In [27]:
# Show (rows, columns) for the training frame, then the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

(1117957, 21)
(745305, 20)


In [37]:
label = 'FloodProbability'
X = train_df.drop(columns=[label])
y = train_df[[label]]

# Hold out 20% of the labelled rows as a validation set. The fixed
# random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [50]:
def find_best_n_estimator(X_train, y_train, X_val, y_val, max_range=100, random_state=42):
    """Sweep ``n_estimators`` from 1 to ``max_range`` and plot the validation score.

    A single forest is grown incrementally: each step adds one tree, refits,
    and records ``score(X_val, y_val)``.

    Parameters
    ----------
    X_train, y_train
        Training features and target. ``y_train.values.ravel()`` is used as the
        fit target, so ``y_train`` must expose ``.values``.
    X_val, y_val
        Validation features and target passed to ``score``.
    max_range : int, default 100
        Largest number of trees to evaluate (inclusive).
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestRegressor`` so repeated sweeps give the
        same curve.

    Returns
    -------
    int or None
        The smallest ``n_estimators`` that reached the highest validation
        score, or ``None`` when ``max_range < 1`` and nothing was evaluated.
    """
    estimators = list(range(1, max_range + 1))
    scores = []

    # warm_start=True keeps the trees from the previous fit and only trains the
    # newly requested ones, so the sweep fits max_range trees in total instead
    # of 1 + 2 + ... + max_range.
    rf_regressor = RandomForestRegressor(warm_start=True, random_state=random_state)
    y_train_flat = y_train.values.ravel()  # hoisted: identical on every iteration
    for n_trees in estimators:
        rf_regressor.set_params(n_estimators=n_trees)
        rf_regressor.fit(X_train, y_train_flat)
        scores.append(rf_regressor.score(X_val, y_val))

    # The swept variable goes on the x-axis, the measured score on the y-axis.
    fig, ax = plt.subplots()
    ax.plot(estimators, scores, marker='o', linestyle='-')
    ax.set_xlabel('Num estimators')
    ax.set_ylabel('r2 scores')
    ax.grid(True)
    plt.show()

    if not scores:
        return None
    # list.index returns the first match, i.e. the smallest forest with the top score.
    return estimators[scores.index(max(scores))]

In [53]:
# Baseline model: a 20-tree random forest. It is seeded so that the validation
# score printed below, and the predictions made from this model later, are the
# same on every fresh run.
rf_regressor = RandomForestRegressor(n_estimators=20, random_state=42)
rf_regressor.fit(X_train, y_train.values.ravel())
score = rf_regressor.score(X_val, y_val)  # score on the held-out validation rows
print(score)

### Submission

In [61]:
# `test_df` was loaded earlier with exactly read_data(test_path, 'id'); reuse
# it instead of parsing the test CSV a second time. Note that `test_data` is
# the same object as `test_df`, not a copy.
test_data = test_df
# Submission template provided with the competition data.
submission = pd.read_csv(submission_path)

In [62]:
# Predict the target for every row of the test features with the fitted forest.
preds = rf_regressor.predict(X=test_data)

array([0.5585, 0.4703, 0.467 , 0.4727, 0.4885, 0.501 , 0.5466, 0.5151,
       0.4844, 0.5239, 0.503 ])

In [64]:
# `preds` follows the row order of `test_data`, and the assignment below is
# purely positional. Check that the template lists the same ids in the same
# order first, so that a mismatch fails loudly instead of silently attaching
# predictions to the wrong rows.
assert np.array_equal(submission['id'].to_numpy(), test_data.index.to_numpy()), \
    "submission ids are not aligned with the rows of test_data"
submission['FloodProbability'] = preds

Unnamed: 0,id,FloodProbability
0,1117957,0.5585
1,1117958,0.4703
2,1117959,0.467
3,1117960,0.4727
4,1117961,0.4885


In [65]:
submission.to_csv('submission.csv', index=False)
!head submission.csv

id,FloodProbability
1117957,0.5585000000000001
1117958,0.4702999999999999
1117959,0.46699999999999986
1117960,0.4726999999999999
1117961,0.4885000000000001
1117962,0.501
1117963,0.5466
1117964,0.5151
1117965,0.48440000000000005
