In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
file_path = r"E:\Supervised learning model\household_power_consumption.txt"  # replace with your local path

# Read the large file efficiently
df = pd.read_csv(
    file_path,
    sep=';',
    low_memory=False,
    parse_dates={'DateTime': ['Date', 'Time']},
    na_values='?'
)

In [3]:
# Drop missing values
df = df.dropna()

In [4]:
cols = ["Global_active_power", "Global_reactive_power", "Voltage", 
        "Global_intensity", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]
df[cols] = df[cols].apply(pd.to_numeric)

print("Dataset loaded:", df.shape)
print(df.head())

Dataset loaded: (2049280, 8)
             DateTime  Global_active_power  Global_reactive_power  Voltage  \
0 2006-12-16 17:24:00                4.216                  0.418   234.84   
1 2006-12-16 17:25:00                5.360                  0.436   233.63   
2 2006-12-16 17:26:00                5.374                  0.498   233.29   
3 2006-12-16 17:27:00                5.388                  0.502   233.74   
4 2006-12-16 17:28:00                3.666                  0.528   235.68   

   Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  
0              18.4             0.0             1.0            17.0  
1              23.0             0.0             1.0            16.0  
2              23.0             0.0             2.0            17.0  
3              23.0             0.0             1.0            17.0  
4              15.8             0.0             1.0            17.0  


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2049280 entries, 0 to 2075258
Data columns (total 8 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   DateTime               datetime64[ns]
 1   Global_active_power    float64       
 2   Global_reactive_power  float64       
 3   Voltage                float64       
 4   Global_intensity       float64       
 5   Sub_metering_1         float64       
 6   Sub_metering_2         float64       
 7   Sub_metering_3         float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 140.7 MB


In [6]:
daily_df = df.resample('D', on='DateTime').mean().dropna()
print("Daily aggregated data:", daily_df.shape)
print(daily_df.head())

Daily aggregated data: (1433, 7)
            Global_active_power  Global_reactive_power     Voltage  \
DateTime                                                             
2006-12-16             3.053475               0.088187  236.243763   
2006-12-17             2.354486               0.156949  240.087028   
2006-12-18             1.530435               0.112356  241.231694   
2006-12-19             1.157079               0.104821  241.999313   
2006-12-20             1.545658               0.111804  242.308062   

            Global_intensity  Sub_metering_1  Sub_metering_2  Sub_metering_3  
DateTime                                                                      
2006-12-16         13.082828        0.000000        1.378788       12.439394  
2006-12-17          9.999028        1.411806        2.907639        9.264583  
2006-12-18          6.421667        0.738194        1.820139        9.734722  
2006-12-19          4.926389        0.582639        5.279167        4.303472  
20

In [7]:
# Regression target: next day's consumption
daily_df["next_day_consumption"] = daily_df["Global_active_power"].shift(-1)

# Classification target: energy plan based on usage
daily_df["energy_plan"] = pd.cut(
    daily_df["Global_active_power"],
    bins=[0, 1, 3, 6],
    labels=["Plan A", "Plan B", "Plan C"]
)

# Drop rows with NaN targets
daily_df = daily_df.dropna()


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import numpy as np
import pickle

# Features
X = daily_df[["Global_reactive_power", "Voltage", "Global_intensity",
              "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]]

# Regression
y_reg = daily_df["next_day_consumption"]
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)
print("Regression RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred_r)))

# Classification
y_clf = daily_df["energy_plan"]
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)
print("Classification Accuracy:", accuracy_score(y_test_c, y_pred_c))
print(classification_report(y_test_c, y_pred_c))

# Example prediction
sample = np.array([[0.3, 240, 12, 1.2, 0.5, 0.3]])
future_use = regressor.predict(sample)[0]
recommended_plan = classifier.predict(sample)[0]
print(f"Predicted next-day consumption: {future_use:.2f} kWh")
print(f"Recommended Plan: {recommended_plan}")

# Save both models together as one dictionary
model = {
    "regressor": regressor,
    "classifier": classifier
}

with open("churn_model.pkl", "wb") as f:
    pickle.dump(model, f)


Regression RMSE: 0.32430039617362305
Classification Accuracy: 0.9965156794425087
              precision    recall  f1-score   support

      Plan A       1.00      0.99      1.00       116
      Plan B       0.99      1.00      1.00       171

    accuracy                           1.00       287
   macro avg       1.00      1.00      1.00       287
weighted avg       1.00      1.00      1.00       287

Predicted next-day consumption: 1.87 kWh
Recommended Plan: Plan B
