In [135]:
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [136]:
# Read the three competition CSVs, then print the first rows of each frame.
df_train = pd.read_csv('/kaggle/input/swms-data/dataset/train.csv')
df_test = pd.read_csv('/kaggle/input/swms-data/dataset/test.csv')
df_sample_sub = pd.read_csv('/kaggle/input/swms-data/dataset/sample_submission.csv')

for preview in (df_train, df_test, df_sample_sub):
    print(preview.head())

       Timestamp  Residents Apartment_Type  Temperature Humidity  Water_Price  \
0  01/01/2002 00          1         Studio        15.31    46.61         1.06   
1  01/01/2002 08          4            NaN        21.01    66.11         2.98   
2  01/01/2002 16          2        Cottage        12.86    60.86         1.44   
3  02/01/2002 00          2           1BHK        20.16    50.58         1.48   
4  02/01/2002 08          2        Cottage        16.23    52.25         1.14   

   Period_Consumption_Index  Income_Level  Guests      Amenities  \
0                      0.97           Low       0  Swimming Pool   
1                      0.91  Upper Middle       1  Swimming Pool   
2                      1.43        Middle       0            NaN   
3                      0.91        Middle      -1         Garden   
4                      1.11        Middle       0       Fountain   

   Appliance_Usage  Water_Consumption  
0              0.0              64.85  
1              1.0      

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [137]:
# Keep an independent copy of the raw test Timestamp strings; the submission
# file is written from this copy further down.
test_timestamps = df_test.loc[:, "Timestamp"].copy()

In [138]:
# Replace missing categorical values with the literal category 'missing'
# in both the training and the test frame.
categorical_cols = ['Apartment_Type', 'Income_Level', 'Amenities']
for frame in (df_train, df_test):
    frame[categorical_cols] = frame[categorical_cols].fillna("missing")

In [139]:
# One-hot encode Apartment_Type and Amenities. The encoder is fitted on the
# training frame only; with handle_unknown='ignore', categories that appear
# only in the test frame encode as all-zero rows instead of raising.
one_hot_cols = ['Apartment_Type', 'Amenities']
try:
    # scikit-learn >= 1.2 names the dense-output flag `sparse_output`;
    # the older `sparse` keyword was removed in 1.4.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept `sparse`.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_train = encoder.fit_transform(df_train[one_hot_cols])
encoded_test = encoder.transform(df_test[one_hot_cols])



In [140]:
# Wrap the encoded arrays in DataFrames labelled with the encoder's
# generated feature names (computed once and shared by both frames).
one_hot_names = encoder.get_feature_names_out(['Apartment_Type', 'Amenities'])
encoded_train = pd.DataFrame(encoded_train, columns=one_hot_names)
encoded_test = pd.DataFrame(encoded_test, columns=one_hot_names)

In [141]:
# Attach the one-hot columns (after resetting to a fresh 0..n-1 index so the
# join lines up row by row) and drop the raw categorical columns they replace.
df_train = (
    df_train
    .reset_index(drop=True)
    .join(encoded_train)
    .drop(columns=['Apartment_Type', 'Amenities'])
)
df_test = (
    df_test
    .reset_index(drop=True)
    .join(encoded_test)
    .drop(columns=['Apartment_Type', 'Amenities'])
)

In [142]:
# Label-encode Income_Level with an encoder fitted on the training frame.
# NOTE(review): LabelEncoder numbers classes by their sorted names, so the
# integer codes do not reflect any income ordering.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['Income_Level'].astype(str))
df_train['Income_Level'] = label_encoder.transform(df_train['Income_Level'].astype(str))

# Build the class -> code lookup once instead of calling transform() for
# every test row. Levels never seen during fit receive the sentinel -1.
income_codes = {level: code for code, level in enumerate(label_encoder.classes_)}
df_test['Income_Level'] = df_test['Income_Level'].map(income_codes).fillna(-1).astype(int)


In [143]:
# Impute Temperature with the mean of the non-NaN readings inside a centred
# 7-row window (up to 3 rows before and 3 rows after, in time order).
def impute_temperature(df):
    """Return a time-sorted copy of ``df`` with Temperature gaps filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Temperature`` (castable to float) and ``Timestamp``
        (parseable with the ``'%d/%m/%Y %H'`` format).

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Timestamp`` (now datetime64). The original
        index labels are kept, so rows can still be matched back to the
        input. A missing Temperature stays NaN only when every reading in
        its 7-row window is missing as well.

    The input frame is left untouched; all work happens on a copy.
    """
    df = df.copy()  # do not overwrite the caller's columns
    df['Temperature'] = df['Temperature'].astype(float)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H')
    df = df.sort_values(by=['Timestamp'])
    # min_periods=1 lets the window mean exist as soon as one neighbour is known.
    window_mean = df['Temperature'].rolling(window=7, center=True, min_periods=1).mean()
    df['Temperature'] = df['Temperature'].fillna(window_mean)
    return df

# Run the temperature imputation (which also sorts by Timestamp) on both frames.
df_train, df_test = map(impute_temperature, (df_train, df_test))

In [144]:
# Fill missing Appliance_Usage with the median of the same frame.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which
# no longer updates the parent frame from pandas 3.0 on.
df_train['Appliance_Usage'] = df_train['Appliance_Usage'].fillna(df_train['Appliance_Usage'].median())
df_test['Appliance_Usage'] = df_test['Appliance_Usage'].fillna(df_test['Appliance_Usage'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Appliance_Usage'].fillna(df_train['Appliance_Usage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Appliance_Usage'].fillna(df_test['Appliance_Usage'].median(), inplace=True)


In [145]:
# Derive calendar features from the Timestamp column
def extract_time_features(df):
    """Add calendar columns to ``df`` in place and return the same frame.

    Columns added, in order: ``Timestamp_dt`` (parsed datetime), ``Year``,
    ``Month``, ``Day``, ``Hour``, ``Weekday`` and ``Season``.
    ``Season`` maps Dec-Feb to 1 (Winter), Mar-May to 2 (Spring),
    Jun-Aug to 3 (Summer) and Sep-Nov to 4 (Fall).
    """
    stamps = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H')
    df['Timestamp_dt'] = stamps

    parts = stamps.dt
    calendar_fields = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('Hour', 'hour'),
        ('Weekday', 'weekday'),
    )
    for column, attribute in calendar_fields:
        df[column] = getattr(parts, attribute)

    # December wraps to 0 under % 12, so it lands in the same bucket as Jan/Feb.
    df['Season'] = df['Month'] % 12 // 3 + 1
    return df

# Add the calendar feature columns to both frames.
df_train, df_test = map(extract_time_features, (df_train, df_test))

In [147]:
# Force the numeric feature columns of both frames to a numeric dtype;
# any value that cannot be parsed becomes NaN (errors='coerce').
numeric_cols = ['Residents', 'Guests', 'Appliance_Usage', 'Income_Level', 'Temperature', 'Humidity']
for col in numeric_cols:
    df_train[col] = pd.to_numeric(df_train[col], errors='coerce')
    df_test[col] = pd.to_numeric(df_test[col], errors='coerce')


In [148]:
# Lagged copies of the target, taken in the frame's current row order
def create_lag_features(df, lags=[1, 2, 3]):
    """Add ``Water_Consumption_Lag<k>`` columns to ``df`` in place.

    Each new column is ``Water_Consumption`` shifted down by ``k`` rows, so
    its first ``k`` entries are NaN. Returns the same frame that was passed.
    """
    target = df['Water_Consumption']
    for step in lags:
        df[f'Water_Consumption_Lag{step}'] = target.shift(step)
    return df

# Add the lag columns to the training frame, then replace every remaining NaN
# with the mean of its column (this includes the rows left empty by shifting).
df_train = create_lag_features(df_train)
column_means = df_train.mean()
df_train.fillna(column_means, inplace=True)

In [149]:
# Remove the columns that are not used as model inputs from both frames.
unused_cols = ['Timestamp', 'Timestamp_dt', 'Humidity']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

# Target vector and feature matrix for training.
y = df_train['Water_Consumption']
X = df_train.drop(columns=['Water_Consumption'])

In [150]:
# Base learners with fixed hyperparameters; all three share the same seed.
seed = 42
cat_model = CatBoostRegressor(iterations=300, depth=6, learning_rate=0.31, random_state=seed, verbose=0)
gbr_model = GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.2, random_state=seed)
lgbm_model = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.2, random_state=seed)
estimators = [
    ('cat', cat_model),
    ('gbr', gbr_model),
    ('lgbm', lgbm_model),
]

# An ordinary LinearRegression combines the base learners' predictions;
# the stack is built with 5-fold cross-validation (cv=5).
final_estimator = LinearRegression()
model = StackingRegressor(estimators=estimators, final_estimator=final_estimator, cv=5)
model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 14000, number of used features: 31
[LightGBM] [Info] Start training from score 164.461230
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1899
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 31
[LightGBM] [Info] Start training from score 164.797520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001600 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

In [152]:
# The test frame has no Water_Consumption, so any lag column the model was
# trained on but the test frame lacks is filled with one constant: the
# Water_Consumption value in the last row of the training frame.
# NOTE(review): the model sees real lag values in training but a constant here;
# worth confirming this mismatch does not hurt the predictions.
lag_cols = [name for name in X.columns if 'Water_Consumption_Lag' in name]
absent_lags = [name for name in lag_cols if name not in df_test.columns]
for col in absent_lags:
    df_test[col] = df_train['Water_Consumption'].iloc[-1]


In [153]:
# Predict on exactly the training feature columns, in training order, so the
# call does not depend on df_test's columns happening to line up with X.
# Rows stay in df_test's current order.
preds = model.predict(df_test[X.columns])

In [154]:
# Create the submission file.
# `preds` follows df_test's current row order, which was re-sorted by Timestamp
# during imputation, while `test_timestamps` is still in original file order.
# Both keep the original index labels, so align the predictions on those labels
# instead of pairing the two positionally.
aligned_preds = pd.Series(preds, index=df_test.index).reindex(test_timestamps.index)
df_submission = pd.DataFrame({
    'Timestamp': test_timestamps,  # raw timestamp strings, original file order
    'Water_Consumption': aligned_preds
})
df_submission.to_csv("submission.csv", index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv


In [155]:
# Show the first five submission rows as a sanity check.
df_submission.head(n=5)

Unnamed: 0,Timestamp,Water_Consumption
0,11/10/2014 16,322.265981
1,12/10/2014 00,199.095846
2,12/10/2014 08,81.35933
3,12/10/2014 16,124.025667
4,13/10/2014 00,123.807088
