In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
print(train.shape, test.shape)

(14000, 12) (6000, 11)


In [3]:
# Convert timestamp to datetime format with dayfirst=True
train["Timestamp"] = pd.to_datetime(train["Timestamp"], dayfirst=True)
test["Timestamp"] = pd.to_datetime(test["Timestamp"], dayfirst=True)

In [4]:
# Extract time-based features
for df in [train, test]:
    df["Hour"] = df["Timestamp"].dt.hour
    df["Day"] = df["Timestamp"].dt.day
    df["Month"] = df["Timestamp"].dt.month

print(train.head())
print(test.head())


            Timestamp  Residents Apartment_Type  Temperature Humidity  \
0 2002-01-01 00:00:00          1         Studio        15.31    46.61   
1 2002-01-01 08:00:00          4            NaN        21.01    66.11   
2 2002-01-01 16:00:00          2        Cottage        12.86    60.86   
3 2002-01-02 00:00:00          2           1BHK        20.16    50.58   
4 2002-01-02 08:00:00          2        Cottage        16.23    52.25   

   Water_Price  Period_Consumption_Index  Income_Level  Guests      Amenities  \
0         1.06                      0.97           Low       0  Swimming Pool   
1         2.98                      0.91  Upper Middle       1  Swimming Pool   
2         1.44                      1.43        Middle       0            NaN   
3         1.48                      0.91        Middle      -1         Garden   
4         1.14                      1.11        Middle       0       Fountain   

   Appliance_Usage  Water_Consumption  Hour  Day  Month  
0              0

In [5]:
# Preserve Timestamp for final submission
test_timestamps = test["Timestamp"].dt.strftime("%d/%m/%Y %H")

In [6]:
# Drop the Timestamp column
train.drop(columns=["Timestamp"], inplace=True)
test.drop(columns=["Timestamp"], inplace=True)

print(train.head())
print(test.head())


   Residents Apartment_Type  Temperature Humidity  Water_Price  \
0          1         Studio        15.31    46.61         1.06   
1          4            NaN        21.01    66.11         2.98   
2          2        Cottage        12.86    60.86         1.44   
3          2           1BHK        20.16    50.58         1.48   
4          2        Cottage        16.23    52.25         1.14   

   Period_Consumption_Index  Income_Level  Guests      Amenities  \
0                      0.97           Low       0  Swimming Pool   
1                      0.91  Upper Middle       1  Swimming Pool   
2                      1.43        Middle       0            NaN   
3                      0.91        Middle      -1         Garden   
4                      1.11        Middle       0       Fountain   

   Appliance_Usage  Water_Consumption  Hour  Day  Month  
0              0.0              64.85     0    1      1  
1              1.0             192.50     8    1      1  
2              1.0  

In [7]:
# Ensure categorical columns are treated as strings
categorical_features = ["Apartment_Type", "Income_Level", "Amenities"]
for col in categorical_features:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)

In [8]:
# Define features and target
X = train.drop("Water_Consumption", axis=1)
y = train["Water_Consumption"]

In [9]:
# Identify categorical and numerical columns
numerical_features = [col for col in X.columns if col not in categorical_features]

In [10]:
# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', dtype=np.float32), categorical_features)
])

In [11]:
# Model pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [12]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Ensure numerical columns contain valid numeric data
for col in numerical_features:
	X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
	X_val[col] = pd.to_numeric(X_val[col], errors='coerce')

# Fill missing values in numerical columns with the median
for col in numerical_features:
	X_train[col].fillna(X_train[col].median(), inplace=True)
	X_val[col].fillna(X_val[col].median(), inplace=True)

# Handle missing or invalid values in categorical columns
for col in categorical_features:
	X_train[col].fillna("Unknown", inplace=True)
	X_val[col].fillna("Unknown", inplace=True)

# Train the model
model.fit(X_train, y_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(X_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(X_val[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [14]:
# Validate the model
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
score = max(0, 100 - rmse)
print(f"Validation Score: {score}")


Validation Score: 81.92460834445686


In [15]:
# Ensure numerical columns in the test set contain valid numeric data
for col in numerical_features:
	test[col] = pd.to_numeric(test[col], errors='coerce')

# Fill missing values in numerical columns with the median
for col in numerical_features:
	test[col].fillna(test[col].median(), inplace=True)

# Make predictions on the test set
test_preds = model.predict(test)
submission = pd.DataFrame({"Timestamp": test_timestamps, "Water_Consumption": test_preds})


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [16]:

# Save submission
submission.to_csv("submission.csv", index=False)