In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Store original Timestamp from test for submission
test_timestamps = test['Timestamp'].copy()

In [4]:
# Check Timestamp
print("Train Timestamp sample:", train['Timestamp'].head(10).tolist())
print("Missing values in train Timestamp:", train['Timestamp'].isna().sum(), "/", len(train))
print("Test Timestamp sample:", test_timestamps.head(10).tolist())
print("Missing values in test Timestamp:", test_timestamps.isna().sum(), "/", len(test))

Train Timestamp sample: ['01/01/2002 00', '01/01/2002 08', '01/01/2002 16', '02/01/2002 00', '02/01/2002 08', '02/01/2002 16', '03/01/2002 00', '03/01/2002 08', '03/01/2002 16', '04/01/2002 00']
Missing values in train Timestamp: 0 / 14000
Test Timestamp sample: ['11/10/2014 16', '12/10/2014 00', '12/10/2014 08', '12/10/2014 16', '13/10/2014 00', '13/10/2014 08', '13/10/2014 16', '14/10/2014 00', '14/10/2014 08', '14/10/2014 16']
Missing values in test Timestamp: 0 / 6000


In [5]:
# Handle anomalies in numeric columns
numeric_cols = ['Temperature', 'Humidity', 'Water_Price', 'Residents', 'Guests', 'Period_Consumption_Index']
for col in numeric_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')
    train_median = train[col].median()
    test_median = test[col].median()
    train[col] = train[col].fillna(train_median)
    test[col] = test[col].fillna(test_median)

In [6]:
# Fix negative values
train['Residents'] = train['Residents'].apply(lambda x: max(x, 1))
test['Residents'] = test['Residents'].apply(lambda x: max(x, 1))
train['Water_Price'] = train['Water_Price'].apply(lambda x: max(x, 0))
test['Water_Price'] = test['Water_Price'].apply(lambda x: max(x, 0))

In [7]:
# Encode categorical columns
categorical_cols = ['Apartment_Type', 'Income_Level', 'Appliance_Usage']
for col in categorical_cols:
    train[col] = train[col].fillna('Missing').astype('category').cat.codes
    test[col] = test[col].fillna('Missing').astype('category').cat.codes

In [8]:
# Check for any remaining string columns (excluding Timestamp)
object_cols = train.select_dtypes(include='object').columns
for col in object_cols:
    if col != 'Timestamp':
        print(f"Encoding unexpected column: {col}")
        train[col] = train[col].fillna('Missing').astype('category').cat.codes
        test[col] = test[col].fillna('Missing').astype('category').cat.codes

Encoding unexpected column: Amenities


In [9]:
# Features and target
X_train = train.drop(['Timestamp', 'Water_Consumption'], axis=1)
y_train = train['Water_Consumption']
X_test = test.drop('Timestamp', axis=1)

In [10]:
# Verify all numeric
print("X_train dtypes:\n", X_train.dtypes)
print("X_test dtypes:\n", X_test.dtypes)

X_train dtypes:
 Residents                     int64
Apartment_Type                 int8
Temperature                 float64
Humidity                    float64
Water_Price                 float64
Period_Consumption_Index    float64
Income_Level                  int16
Guests                        int64
Amenities                      int8
Appliance_Usage                int8
dtype: object
X_test dtypes:
 Residents                     int64
Apartment_Type                 int8
Temperature                 float64
Humidity                    float64
Water_Price                 float64
Period_Consumption_Index    float64
Income_Level                  int16
Guests                        int64
Amenities                      int8
Appliance_Usage                int8
dtype: object


In [11]:
# Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
test['Water_Consumption'] = rf.predict(X_test)

In [12]:
# Submission using original test Timestamps
submission = pd.DataFrame({
    'Timestamp': test_timestamps,
    'Water_Consumption': test['Water_Consumption']
})
submission.to_csv('submission.csv', index=False)

In [13]:
# Verify submission
print("Submission sample:")
print(submission.head())
print("Submission shape:", submission.shape)

Submission sample:
       Timestamp  Water_Consumption
0  11/10/2014 16           301.6945
1  12/10/2014 00           256.8390
2  12/10/2014 08            86.5053
3  12/10/2014 16           133.1509
4  13/10/2014 00           132.6131
Submission shape: (6000, 2)
