# HackerEarth Water Day Hackathon Submission

Submitted by - Sparsh Saxena

In [44]:
# Importing the libraries for use

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
filterwarnings('ignore')

In [45]:
# Connection with the google drive for dataset files

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [46]:
# Importing the train and test datasets into the notebook

train_df = pd.read_csv('/content/gdrive/My Drive/train.csv')
test_df = pd.read_csv('/content/gdrive/My Drive/test.csv')
submission_ids = test_df['Timestamp']

In [47]:
train_df.head(5)

Unnamed: 0,Timestamp,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage,Water_Consumption
0,01/01/2002 00,1,Studio,15.31,46.61,1.06,0.97,Low,0,Swimming Pool,0.0,64.85
1,01/01/2002 08,4,,21.01,66.11,2.98,0.91,Upper Middle,1,Swimming Pool,1.0,192.5
2,01/01/2002 16,2,Cottage,12.86,60.86,1.44,1.43,Middle,0,,1.0,116.62
3,02/01/2002 00,2,1BHK,20.16,50.58,1.48,0.91,Middle,-1,Garden,0.0,76.96
4,02/01/2002 08,2,Cottage,16.23,52.25,1.14,1.11,Middle,0,Fountain,0.0,104.7


In [48]:
test_df.head(5)

Unnamed: 0,Timestamp,Residents,Apartment_Type,Temperature,Humidity,Water_Price,Period_Consumption_Index,Income_Level,Guests,Amenities,Appliance_Usage
0,11/10/2014 16,5,Bungalow,11.89,57.88,2.77,1.48,Upper Middle,1,Jacuzzi,0.0
1,12/10/2014 00,4,Bungalow,29.22,61.41,2.85,1.3,Upper Middle,0,Garden,
2,12/10/2014 08,3,Cottage,10.27,64.9,1.66,0.97,Middle,0,,0.0
3,12/10/2014 16,3,1BHK,27.03,52.67,1.48,1.31,Low,0,,0.0
4,13/10/2014 00,2,Cottage,12.32,55.23,1.19,1.3,Middle,0,Swimming Pool,0.0


In [49]:
col_names = train_df.columns
col_names

Index(['Timestamp', 'Residents', 'Apartment_Type', 'Temperature', 'Humidity',
       'Water_Price', 'Period_Consumption_Index', 'Income_Level', 'Guests',
       'Amenities', 'Appliance_Usage', 'Water_Consumption'],
      dtype='object')

In [50]:
# Checking for null values in train.csv

train_df.isnull().sum()

Unnamed: 0,0
Timestamp,0
Residents,0
Apartment_Type,426
Temperature,441
Humidity,0
Water_Price,0
Period_Consumption_Index,0
Income_Level,426
Guests,0
Amenities,5997


In [51]:
# Checking for null values in test.csv

test_df.isnull().sum()

Unnamed: 0,0
Timestamp,0
Residents,0
Apartment_Type,166
Temperature,150
Humidity,0
Water_Price,0
Period_Consumption_Index,0
Income_Level,165
Guests,0
Amenities,2513


In [52]:
# Fill missing values in train_df
train_df.fillna({
    'Temperature': train_df['Temperature'].mean(),  # Direct mean calculation
    'Appliance_Usage': train_df['Appliance_Usage'].mean(),
    'Apartment_Type': 'non',
    'Income_Level': 'non',
    'Amenities': 'non'
}, inplace=True)

# Fill missing values in test_df using train_df statistics
test_df.fillna({
    'Temperature': train_df['Temperature'].mean(),
    'Appliance_Usage': train_df['Appliance_Usage'].mean(),  # Fixed column name
    'Apartment_Type': 'non',
    'Income_Level': 'non',
    'Amenities': 'non'
}, inplace=True)

In [53]:
# Dropping the Timestamp column as it is irrelevant here

train_df.drop('Timestamp', axis=1, inplace=True)
test_df.drop('Timestamp', axis=1, inplace=True)

In [54]:
# Checking out the types of columns

print(train_df.dtypes)

Residents                     int64
Apartment_Type               object
Temperature                 float64
Humidity                     object
Water_Price                 float64
Period_Consumption_Index    float64
Income_Level                 object
Guests                        int64
Amenities                    object
Appliance_Usage             float64
Water_Consumption           float64
dtype: object


In [55]:
# List of numerical columns to convert to float64
numerical_cols = ['Residents', 'Guests']

# List of object columns to convert to float64
float_conversion_cols = ['Temperature', 'Humidity', 'Appliance_Usage']

# Convert numerical columns to float64
train_df[numerical_cols] = train_df[numerical_cols].astype('float64')
test_df[numerical_cols] = test_df[numerical_cols].astype('float64')

# Convert 'Humidity' to numeric, replacing invalid values with NaN
train_df['Humidity'] = pd.to_numeric(train_df['Humidity'], errors='coerce')
test_df['Humidity'] = pd.to_numeric(test_df['Humidity'], errors='coerce')

# Fill NaN values in 'Humidity' with column mean
humidity_mean = train_df['Humidity'].mean()
train_df['Humidity'].fillna(humidity_mean, inplace=True)
test_df['Humidity'].fillna(humidity_mean, inplace=True)

# Convert other object columns to float64 safely
for col in ['Temperature', 'Appliance_Usage']:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Fill NaN values in numeric columns with their respective means
for col in float_conversion_cols:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(train_df[col].mean(), inplace=True)  # Using train mean for consistency

# Fill categorical missing values
train_df.fillna({'Apartment_Type': 'non', 'Income_Level': 'non', 'Amenities': 'non'}, inplace=True)
test_df.fillna({'Apartment_Type': 'non', 'Income_Level': 'non', 'Amenities': 'non'}, inplace=True)

# Convert Apartment_Type to object
train_df['Apartment_Type'] = train_df['Apartment_Type'].astype('object')
test_df['Apartment_Type'] = test_df['Apartment_Type'].astype('object')

# Display updated types
print("Train Data Types for columns : \n")
print(train_df.dtypes)

print()

print("Test Data Types for columns : \n")
print(test_df.dtypes)

Train Data Types for columns : 

Residents                   float64
Apartment_Type               object
Temperature                 float64
Humidity                    float64
Water_Price                 float64
Period_Consumption_Index    float64
Income_Level                 object
Guests                      float64
Amenities                    object
Appliance_Usage             float64
Water_Consumption           float64
dtype: object

Test Data Types for columns : 

Residents                   float64
Apartment_Type               object
Temperature                 float64
Humidity                    float64
Water_Price                 float64
Period_Consumption_Index    float64
Income_Level                 object
Guests                      float64
Amenities                    object
Appliance_Usage             float64
dtype: object


In [56]:
# Identifying categorical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# Applying LabelEncoder to each categorical column
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])  # Transforming the column in train_df
    test_df[col] = le.fit_transform(test_df[col])  # Transforming the column in test_df
    label_encoders[col] = le  # Storing encoder for future inverse transform if needed

In [57]:
# Converting the int64 columns to float64

int64_cols = train_df.select_dtypes(include=['int64']).columns
for col in int64_cols:
    train_df[col] = train_df[col].astype('float64')
    test_df[col] = test_df[col].astype('float64')

In [58]:
# Splitting the dataset into feature matrix and the target column

target = 'Water_Consumption'

X = train_df.drop(target, axis=1)
y = train_df[target]

In [59]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [61]:
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
score = max(0, 100 - np.sqrt(mse))
print("Training Score : ", score)

Training Score :  80.64434995374616


In [62]:
!pip install optuna



In [69]:
# Using Bayesian Optimization to find the best hyperparams
import optuna

# Split train data further into internal train and validation sets
X_train_opt, X_val, y_train_opt, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

def objective(trial):
    # Define a thorough search space
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=50)
    max_depth = trial.suggest_int('max_depth', 5, 100, step=5)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'absolute_error'])

    # Train the Random Forest model
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        criterion=criterion,
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train_opt, y_train_opt)  # Train on smaller training set
    pred = rf.predict(X_val)  # Validate on internal validation set

    # Compute the custom metric
    mse = mean_squared_error(y_val, pred)
    score = max(0, 100 - np.sqrt(mse))

    return score  # Optuna will try to maximize this

In [68]:
# Carrying out the study using optimization process for best hyperparams

study = optuna.create_study(direction='maximize')  # Maximize the custom score
study.optimize(objective, n_trials=50)  # Run 50 trials for thorough tuning

# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)

[I 2025-03-24 15:22:46,779] A new study created in memory with name: no-name-7404f05f-da94-4f15-ad6a-9c66c8d3077f
[I 2025-03-24 15:23:03,544] Trial 0 finished with value: 80.78350141461284 and parameters: {'n_estimators': 850, 'max_depth': 60, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False, 'criterion': 'squared_error'}. Best is trial 0 with value: 80.78350141461284.
[I 2025-03-24 15:23:09,417] Trial 1 finished with value: 77.98420406679745 and parameters: {'n_estimators': 450, 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': True, 'criterion': 'squared_error'}. Best is trial 0 with value: 80.78350141461284.
[I 2025-03-24 15:26:58,281] Trial 2 finished with value: 64.13065855603892 and parameters: {'n_estimators': 250, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False, 'criterion': 'absolute_error'}. Best is trial 0 with value: 80.783

KeyboardInterrupt: 

In [75]:
# Testing model after hyperparam tuning from an earlier run

params = {'n_estimators': 850, 'max_depth': 60, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False, 'criterion': 'squared_error'}

model_test = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        bootstrap=params['bootstrap'],
        criterion=params['criterion'],
        random_state=42,
        n_jobs=-1
)

model_test.fit(X_train, y_train)

In [76]:
# Testing model after hyperparam tuning

predictions = model_test.predict(X_test)
mse = mean_squared_error(y_test, pred)
score = max(0, 100 - np.sqrt(mse))
print("Training Score after tuning : ", score)

Training Score after tuning :  80.64434995374616


In [79]:
# Training on full train data

X_train = train_df.drop(target, axis=1)
y_train = train_df[target]

model_test.fit(X_train, y_train)

In [80]:
# Storing the file to submit submission.csv

predicted = pd.DataFrame(model_test.predict(test_df),columns=['Water_Consumption'])
answer = pd.concat([submission_ids,predicted],axis=1)
answer.to_csv('submission.csv',index=False)