In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import datetime

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
val_df = pd.read_csv('data/val.csv')

In [3]:
train_df.head()

Unnamed: 0,Date_Time,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,Load_Type
0,01-01-2018 00:15,8.753692,2.95,0.0,0.0,73.21,100.0,900.0,Light_Load
1,01-01-2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800.0,Light_Load
2,01-01-2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,8070.880991,Light_Load
3,01-01-2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600.0,Light_Load
4,01-01-2018 01:15,3.82,4.5,0.0,0.0,133.655666,,4500.0,Light_Load


In [4]:
from sklearn.model_selection import train_test_split

X_train,y_train = train_df.drop('Load_Type',axis=1), np.array(train_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))
X_test,y_test = test_df.drop('Load_Type',axis=1), np.array(test_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))
X_val,y_val = val_df.drop('Load_Type',axis=1), np.array(val_df['Load_Type'].apply(lambda x: 0 if x == 'Light_Load' else (1 if x == 'Medium_Load' else 2)))

# First Model 

Taking Only NSM, day and month [ as other characteristics isn't distinguishable in the eda process]

In [5]:
import pandas as pd

def preprocess(df):
    df['Date_Time'] = pd.to_datetime(df['Date_Time'], format="%d-%m-%Y %H:%M")
    df['NSM'] = (df['Date_Time'].dt.hour * 3600 + df['Date_Time'].dt.minute * 60)
    df['day'] = df['Date_Time'].dt.day
    df['month']=df['Date_Time'].dt.month
    return np.array(df[['NSM','day','month']])

preprocess_X_train = preprocess(X_train)

In [6]:
preprocess_X_train

array([[  900,     1,     1],
       [ 1800,     1,     1],
       [ 2700,     1,     1],
       ...,
       [15300,    25,     9],
       [16200,    25,     9],
       [17100,    25,     9]])

In [7]:
preprocess_X_train[0][0]

900

In [8]:
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier

# Define models
models = {
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Naive Bayes': MultinomialNB(),
    'AdaBoost': AdaBoostClassifier()
    
}



# Initialize empty lists to store evaluation results
model_names = []
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Train and evaluate models
for name, model in models.items():
    print(f'Training and evaluating {name}...')
    model.fit(preprocess(X_train), y_train)
    y_pred = model.predict(preprocess(X_val))
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    # Append results to lists
    model_names.append(name)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Create DataFrame
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores
})


Training and evaluating SVM...
Training and evaluating Random Forest...
Training and evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training and evaluating Decision Tree...
Training and evaluating K-Nearest Neighbors...
Training and evaluating XGBoost...
Training and evaluating CatBoost...
Learning rate set to 0.093366
0:	learn: 0.9820017	total: 160ms	remaining: 2m 39s
1:	learn: 0.8919978	total: 166ms	remaining: 1m 23s
2:	learn: 0.8212646	total: 173ms	remaining: 57.5s
3:	learn: 0.7618000	total: 179ms	remaining: 44.6s
4:	learn: 0.7038200	total: 186ms	remaining: 36.9s
5:	learn: 0.6553513	total: 192ms	remaining: 31.9s
6:	learn: 0.6201065	total: 199ms	remaining: 28.3s
7:	learn: 0.5870403	total: 207ms	remaining: 25.6s
8:	learn: 0.5605032	total: 215ms	remaining: 23.6s
9:	learn: 0.5319217	total: 221ms	remaining: 21.8s
10:	learn: 0.5111501	total: 226ms	remaining: 20.3s
11:	learn: 0.4873337	total: 234ms	remaining: 19.3s
12:	learn: 0.4696003	total: 241ms	remaining: 18.3s
13:	learn: 0.4543316	total: 247ms	remaining: 17.4s
14:	learn: 0.4406201	total: 253ms	remaining: 16.6s
15:	learn: 0.4270916	total: 259ms	remaining: 15.9s
16:



In [10]:
results_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
4,K-Nearest Neighbors,0.764541,0.770035,0.764541,0.766964
7,Gradient Boosting,0.738032,0.733437,0.738032,0.734976
5,XGBoost,0.731795,0.725302,0.731795,0.727847
0,SVM,0.724154,0.750034,0.724154,0.733062
6,CatBoost,0.720568,0.708383,0.720568,0.709079
1,Random Forest,0.686886,0.673583,0.686886,0.665194
3,Decision Tree,0.686886,0.673583,0.686886,0.665194
9,AdaBoost,0.681272,0.663955,0.681272,0.670067
8,Naive Bayes,0.580072,0.540923,0.580072,0.552248
2,Logistic Regression,0.516763,0.421893,0.516763,0.389049


In [11]:
import plotly.graph_objects as go

# Create a list of models and metrics
models = results_df.sort_values(by='Accuracy', ascending=True)['Model'].tolist() 
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

# Create a 2D list of values for the heatmap
values =results_df.sort_values(by='Accuracy', ascending=True).iloc[:,1:]


# Create the heatmap
fig = go.Figure(data=go.Heatmap(
    z=values,
    x=metrics,
    y=models,
    hoverongaps=False))

# Update layout
fig.update_layout(
    title='Model Evaluation Metrics',
    xaxis_title='Metrics',
    yaxis_title='Models')

# Show the plot
fig.show()


In [12]:
def simple_model_from_eda(df):
    if df[0] <=32400 or df[0]>=83700:
        return 0
    elif df[0]>32400 and df[0]<=36000:
        return 1
    elif df[0]>36000 and df[0]<=43200:
        return 2
    elif df[0]>43200 and df[0]<=46800:
        return 1
    else:
        if datetime.datetime(2018, df[2], df[1])<=datetime.datetime(2018, 2, 28):
            if df[0]>46800 and df[0]<=61200:
                return 1
            elif df[0]>61200 and df[0]<=72000:
                return 2
            elif df[0]>72000 and df[0]<=79200:
                return 1
            else:
                return 2
        else:
            if df[0]>46800 and df[0]<=61200:
                return 2
            else:
                return 1



In [13]:
y_pred=list(map(lambda x: simple_model_from_eda(x), preprocess(X_train)))

In [14]:
print(accuracy_score(y_train, y_pred))
print(recall_score(y_train, y_pred, average='weighted'))
print(f1_score(y_train, y_pred, average='weighted'))
print(precision_score(y_train, y_pred, average='weighted'))

0.9061245175626681
0.9061245175626681
0.905487356707332
0.9212430409514518


In [15]:
accuracy = sum(y_train == y_pred) / len(y_train)

In [16]:
1-(43*56)/len(y_train) # this 44 is unique light_load_dates and 56 is number of minutes from 9am to 11:00 pm

0.9061245175626681

In [17]:
val_pred=list(map(lambda x: simple_model_from_eda(x), preprocess(X_val)))

In [18]:
print(accuracy_score(val_pred, y_val))
print(recall_score(val_pred, y_val, average='weighted'))
print(f1_score(val_pred, y_val, average='weighted'))
print(precision_score(val_pred, y_val, average='weighted'))

0.7779510369561827
0.7779510369561827
0.7677217425448057
0.7694966662440178


# Still simple_model_from_eda outperforms the rest

It results in the same accuracy as the above model. There is 13% decrease in the accuracy with the validation model. We need to investigate about other datavalues.

In [19]:
val_df['NSM_calculated']=val_df['Date_Time'].apply(lambda x: datetime.datetime.strptime(x,"%d-%m-%Y %H:%M").hour*3600+datetime.datetime.strptime(x,"%d-%m-%Y %H:%M").minute*60)
fig = px.scatter(val_df, x="NSM_calculated",y=np.arange(len(val_df)), color="Load_Type",hover_data=["Date_Time"])
fig.show()

## there is change in the behaviour from nov 1, 2018 which is the unseen dataset