In [63]:
import pandas as pd
import numpy as np
import plotly.express as px
import datetime

In [64]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('val.csv')

In [65]:
X_train,y_train = train_df.drop('Load_Type',axis=1), np.array(train_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))
X_test,y_test = test_df.drop('Load_Type',axis=1), np.array(test_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))
X_val,y_val = val_df.drop('Load_Type',axis=1), np.array(val_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))

In [66]:
def preprocess_2(train):
    train['Date_Time'] = pd.to_datetime(train['Date_Time'], format="%d-%m-%Y %H:%M")
    train['NSM'] = train['Date_Time'].dt.hour * 3600 + train['Date_Time'].dt.minute * 60
    train['day'] = train['Date_Time'].dt.day
    train['month']=train['Date_Time'].dt.month
    train['weekday']=pd.to_datetime(train['Date_Time'],format="%d-%m-%Y %H:%M").dt.weekday
    # Rename column 'Lagging_Current_Reactive.Power_kVarh' to 'Lagging_Current_Reactive_Power_kVarh'
    train = train.rename(columns={'Lagging_Current_Reactive.Power_kVarh': 'Lagging_Current_Reactive_Power_kVarh'})
    # Rename column 'CO2(tCO2)' to 'CO2'
    train = train.rename(columns={'CO2(tCO2)': 'CO2'})
    # # Scale column 'NSM_calculated' between 0 and 1
    # new_min, new_max = 0, 1
    # old_min, old_max = 0,85500
    # train['NSM'] = (train['NSM'] - old_min) / (old_max - old_min) * (new_max - new_min) + new_min
    Impute_dataframe=train.groupby(['weekday']).agg(Lagging_Current_Reactive_Power_kVarh_mean=('Lagging_Current_Reactive_Power_kVarh','mean'), Leading_Current_Reactive_Power_kVarh_mean=('Leading_Current_Reactive_Power_kVarh','mean'), CO2_mean=('CO2','mean'),
                                    Usage_kWh_mean=('Usage_kWh','mean')).reset_index()
    weekday_means = Impute_dataframe.set_index('weekday').to_dict()


    columns_to_fill=['Lagging_Current_Reactive_Power_kVarh','Leading_Current_Reactive_Power_kVarh','CO2','Usage_kWh']
    for column in columns_to_fill:
        train[column].fillna(train['weekday'].map(weekday_means[f'{column}_mean']), inplace=True)
    
    train['actual_load_from_formula']=(train['Usage_kWh']**2+abs(train['Lagging_Current_Reactive_Power_kVarh']-train['Leading_Current_Reactive_Power_kVarh'])**2)**0.5
    train = train.drop(columns=['Lagging_Current_Power_Factor','Leading_Current_Power_Factor','Date_Time'])

    return train,weekday_means


In [67]:
def inference_preprocess(X_val,weekday_means):
    X_val['Date_Time'] = pd.to_datetime(X_val['Date_Time'], format='%d-%m-%Y %H:%M')
    X_val['day'] = X_val['Date_Time'].dt.day
    X_val['month']=X_val['Date_Time'].dt.month
    X_val['weekday'] = X_val['Date_Time'].dt.dayofweek
    X_val['NSM'] =  X_val['Date_Time'].dt.hour * 3600 + X_val['Date_Time'].dt.minute * 60

    X_val = X_val.rename(columns={'Lagging_Current_Reactive.Power_kVarh': 'Lagging_Current_Reactive_Power_kVarh'})
    # Rename column 'CO2(tCO2)' to 'CO2'
    X_val = X_val.rename(columns={'CO2(tCO2)': 'CO2'})
    # Scale column 'NSM_calculated' between 0 and 1


    columns_to_fill=['Lagging_Current_Reactive_Power_kVarh','Leading_Current_Reactive_Power_kVarh','CO2','Usage_kWh']
    for column in columns_to_fill:
        X_val[column].fillna(X_val['weekday'].map(weekday_means[f'{column}_mean']), inplace=True)
    
    X_val['actual_load_from_formula']=(X_val['Usage_kWh']**2+abs(X_val['Lagging_Current_Reactive_Power_kVarh']-X_val['Leading_Current_Reactive_Power_kVarh'])**2)**0.5

    return X_val[['Usage_kWh', 'Lagging_Current_Reactive_Power_kVarh',
       'Leading_Current_Reactive_Power_kVarh', 'CO2', 'NSM', 'day', 'month',
       'weekday', 'actual_load_from_formula']]
    

In [68]:
a,w=preprocess_2(X_train)
a

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive_Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2,NSM,day,month,weekday,actual_load_from_formula
0,8.753692,2.95,0.0,0.0,900,1,1,0,9.237404
1,4.000000,4.46,0.0,0.0,1800,1,1,0,5.990960
2,3.240000,3.28,0.0,0.0,2700,1,1,0,4.610423
3,3.310000,3.56,0.0,0.0,3600,1,1,0,4.861039
4,3.820000,4.50,0.0,0.0,4500,1,1,0,5.902745
...,...,...,...,...,...,...,...,...,...
25646,2.630000,5.58,0.0,0.0,13500,25,9,1,6.168736
25647,2.630000,5.65,0.0,0.0,14400,25,9,1,6.232126
25648,2.590000,5.65,0.0,0.0,15300,25,9,1,6.215352
25649,5.839866,5.76,0.0,0.0,16200,25,9,1,8.202539


In [69]:
inference_preprocess(X_val,w)
# .shape,y_val.shape

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive_Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2,NSM,day,month,weekday,actual_load_from_formula
0,2.66,5.650000,0.00,0.0,18000,25,9,1,6.244846
1,2.63,5.510000,0.00,0.0,18900,25,9,1,6.105489
2,2.88,18.956808,0.00,0.0,19800,25,9,1,19.174331
3,3.85,7.060000,0.00,0.0,20700,25,9,1,8.041523
4,2.84,15.366589,0.00,0.0,21600,25,9,1,15.626825
...,...,...,...,...,...,...,...,...,...
6408,3.92,3.060000,0.25,0.0,82800,30,11,4,4.823121
6409,3.92,2.770000,0.36,0.0,83700,30,11,4,4.601576
6410,3.89,4.319224,0.50,0.0,84600,30,11,4,5.451474
6411,3.82,2.450000,0.54,0.0,85500,30,11,4,4.270890


In [70]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier

# Define models
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': MultinomialNB(),
    'AdaBoost': AdaBoostClassifier()
    
}


# Initialize empty lists to store evaluation results
model_names = []
accuracies = []
precisions = []
recalls = []
f1_scores = []
test_accuracies=[]
test_precisions=[]
test_recalls=[]
test_f1_scores=[]

# Train and evaluate models
for name, model in models.items():
    print(f'Training and evaluating {name}...')
    a,w=preprocess_2(X_train)
    model.fit(a, y_train)
    y_pred = model.predict(inference_preprocess(X_val,w))
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')

     # Append results to lists
    model_names.append(name)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    y_pred = model.predict(inference_preprocess(X_test,w))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    test_accuracies.append(accuracy)
    test_precisions.append(precision)
    test_recalls.append(recall)
    test_f1_scores.append(f1)

# Create DataFrame
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores,
    'Test_Accuracy': test_accuracies,
    'Test_Precision': test_precisions,
    'Test_Recall': test_recalls,
    'Test_F1-Score': test_f1_scores
})


Training and evaluating SVM...


Training and evaluating Random Forest...
Training and evaluating Logistic Regression...



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Training and evaluating Decision Tree...
Training and evaluating K-Nearest Neighbors...
Training and evaluating XGBoost...
Training and evaluating CatBoost...
Learning rate set to 0.093366
0:	learn: 0.9703559	total: 34ms	remaining: 34s
1:	learn: 0.8648428	total: 82.1ms	remaining: 41s
2:	learn: 0.7776090	total: 127ms	remaining: 42.3s
3:	learn: 0.7034497	total: 168ms	remaining: 41.9s
4:	learn: 0.6471496	total: 197ms	remaining: 39.2s
5:	learn: 0.5941638	total: 220ms	remaining: 36.5s
6:	learn: 0.5506666	total: 237ms	remaining: 33.6s
7:	learn: 0.5112193	total: 256ms	remaining: 31.7s
8:	learn: 0.4762198	total: 278ms	remaining: 30.6s
9:	learn: 0.4457778	total: 300ms	remaining: 29.7s
10:	learn: 0.4198481	total: 321ms	remaining: 28.8s
11:	learn: 0.3972928	total: 342ms	remaining: 28.1s
12:	learn: 0.3741766	total: 363ms	remaining: 27.6s
13:	learn: 0.3527909	total: 384ms	remaining: 27.1s
14:	learn: 0.3365467	total: 405ms	remaining: 26.6s
15:	learn: 0.3194900	total: 429ms	remaining: 26.4s
16:	learn





In [76]:
results_df.sort_values(by=['Test_Accuracy','Accuracy'], ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Test_Accuracy,Test_Precision,Test_Recall,Test_F1-Score
9,AdaBoost,0.822704,0.822604,0.822704,0.822653,0.69298,0.718511,0.69298,0.703953
4,K-Nearest Neighbors,0.820209,0.825873,0.820209,0.822695,0.692308,0.724479,0.692308,0.705835
3,Decision Tree,0.793544,0.796317,0.793544,0.794801,0.674169,0.697436,0.674169,0.684771
7,Gradient Boosting,0.841728,0.847893,0.841728,0.844389,0.670474,0.728527,0.670474,0.693651
6,CatBoost,0.841728,0.843092,0.841728,0.842389,0.663755,0.710609,0.663755,0.683534
5,XGBoost,0.832528,0.836272,0.832528,0.834246,0.661068,0.707757,0.661068,0.680906
1,Random Forest,0.833151,0.839216,0.833151,0.835768,0.660396,0.715918,0.660396,0.682454
8,Naive Bayes,0.639638,0.652231,0.639638,0.604871,0.653342,0.637232,0.653342,0.628963
2,Logistic Regression,0.646655,0.626997,0.646655,0.623083,0.643937,0.62251,0.643937,0.631886
0,SVM,0.688601,0.726131,0.688601,0.700932,0.527377,0.650062,0.527377,0.57168


In [81]:
y_train,y_val

(array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64))

In [82]:
import pandas as pd

X_train_Full = pd.concat([X_train, X_val], axis=0)
y_train_Full = np.concatenate([y_train, y_val])

In [84]:
from sklearn.ensemble import AdaBoostClassifier
import joblib

# Instantiate the AdaBoost classifier
adaboost_model = AdaBoostClassifier()
a,w=preprocess_2(X_train_Full)

# Train the model on the training dataset (assuming X_train and y_train are already defined)
adaboost_model.fit(a, y_train_Full)

y_pred = model.predict(inference_preprocess(X_test,w))
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(accuracy,precision,recall,f1)
# Save the trained model to a file
joblib.dump(adaboost_model, 'adaboost_model.pkl')





0.692979509573396 0.7185112061182212 0.692979509573396 0.7039531181674321


['adaboost_model.pkl']

In [85]:

joblib.dump(w,'w.pkl')

['w.pkl']