In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# --- Step 1 & 2: Load, Clean, and Prepare Data ---

# Load the dataset
try:
    df = pd.read_csv('load_data.csv')
except FileNotFoundError:
    print("Error: 'load_data.csv' not found. Please make sure the file is in the correct directory.")
    exit()

# Clean column names by replacing special characters with '_'
df.columns = df.columns.str.strip().str.replace('[^A-Za-z0-9]+', '_', regex=True).str.lower()

# Convert 'date_time' column, ensuring day comes first
df['date_time'] = pd.to_datetime(df['date_time'], dayfirst=True)

In [12]:
# Feature Engineering
df['hour'] = df['date_time'].dt.hour
df['dayofweek'] = df['date_time'].dt.dayofweek
df['month'] = df['date_time'].dt.month

# Sort dataframe by date
df = df.sort_values(by='date_time').reset_index(drop=True)

# Encode the target variable
le = LabelEncoder()
df['load_type_encoded'] = le.fit_transform(df['load_type'])
load_type_mapping = {index: label for index, label in enumerate(le.classes_)}

print("--- Data Preparation Summary ---")
print("Load Type Mapping:", load_type_mapping)
print("Cleaned Column Names:", df.columns.tolist())
print("\n")

# --- Step 3: Data Splitting (Time-Based) ---
last_month = df['date_time'].dt.month.iloc[-1]
last_year = df['date_time'].dt.year.iloc[-1]
split_date = df[(df['date_time'].dt.month == last_month) & (df['date_time'].dt.year == last_year)]['date_time'].min()

train_df = df[df['date_time'] < split_date]
test_df = df[df['date_time'] >= split_date]

print("--- Data Splitting Summary ---")
print(f"Training data runs from {train_df['date_time'].min()} to {train_df['date_time'].max()}")
print(f"Test data runs from {test_df['date_time'].min()} to {test_df['date_time'].max()}")
print("\n")


--- Data Preparation Summary ---
Load Type Mapping: {0: 'Light_Load', 1: 'Maximum_Load', 2: 'Medium_Load'}
Cleaned Column Names: ['date_time', 'usage_kwh', 'lagging_current_reactive_power_kvarh', 'leading_current_reactive_power_kvarh', 'co2_tco2_', 'lagging_current_power_factor', 'leading_current_power_factor', 'nsm', 'load_type', 'hour', 'dayofweek', 'month', 'load_type_encoded']


--- Data Splitting Summary ---
Training data runs from 2018-01-01 00:00:00 to 2018-11-30 23:45:00
Test data runs from 2018-12-01 00:00:00 to 2018-12-31 23:45:00




In [13]:
# THIS IS THE FINAL, ROBUST FIX:
# Dynamically create the list of features instead of hardcoding it.
non_feature_cols = ['date_time', 'load_type', 'load_type_encoded']
features = [col for col in df.columns if col not in non_feature_cols]
target = 'load_type_encoded'

print("--- Features Used for Training ---")
print(features)
print("\n")


X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


--- Features Used for Training ---
['usage_kwh', 'lagging_current_reactive_power_kvarh', 'leading_current_reactive_power_kvarh', 'co2_tco2_', 'lagging_current_power_factor', 'leading_current_power_factor', 'nsm', 'hour', 'dayofweek', 'month']




In [16]:
# --- FIX: Add this right before the model training loop ---
# This ensures any and all missing values are filled.
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())


# --- Step 4 & 5: Model Training and Evaluation ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100)
}

results = {}

# This loop will now work correctly
for model_name, model in models.items():
    print(f"--- Training and Evaluating: {model_name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0)

    results[model_name] = {
        "accuracy": accuracy,
        "classification_report": report
    }

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print("\n")

--- Training and Evaluating: Logistic Regression ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.6480
Classification Report:
              precision    recall  f1-score   support

  Light_Load       0.80      0.89      0.84      1745
Maximum_Load       0.31      0.23      0.26       528
 Medium_Load       0.39      0.36      0.37       704

    accuracy                           0.65      2977
   macro avg       0.50      0.49      0.49      2977
weighted avg       0.61      0.65      0.63      2977



--- Training and Evaluating: Decision Tree ---
Accuracy: 0.8905
Classification Report:
              precision    recall  f1-score   support

  Light_Load       0.98      0.86      0.91      1745
Maximum_Load       0.82      0.91      0.86       528
 Medium_Load       0.79      0.96      0.87       704

    accuracy                           0.89      2977
   macro avg       0.86      0.91      0.88      2977
weighted avg       0.90      0.89      0.89      2977



--- Training and Evaluating: Random Forest ---
Accuracy: 0.9348
Classification Report:
              precis

In [17]:
# --- Conclusion ---
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
print(f"--- Final Conclusion ---")
print(f"The best performing model is the '{best_model_name}' with an accuracy of {results[best_model_name]['accuracy']:.4f}.")

--- Final Conclusion ---
The best performing model is the 'Random Forest' with an accuracy of 0.9348.
