## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn import model_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, log_loss
from sklearn.preprocessing import OneHotEncoder
# Lift the display limit on columns so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

## Read/Load Data

In [None]:
# Load the training set (features plus the 'cpu_load' target used below).
data = pd.read_csv('train_project/train.csv')

In [None]:
# Load the test set. The raw frame (with 'id') is kept for writing the
# submission; `test_data` is the same frame without 'id' for use as features.
test = pd.read_csv('train_project/test.csv')
test_data = test.drop(columns='id')

In [None]:
# Preview the first rows of the training data.
data.head()

## Exploratory Data Analysis

In [None]:
# Class balance of the target: print the count and percentage of every
# 'cpu_load' value, then show the same counts as a bar chart.
counter = Counter(data['cpu_load'])
total_rows = len(data['cpu_load'])
for load, count in counter.items():
    share = count / total_rows * 100
    print(f'Load {load}, Count: {count} ({round(share,2)} %)')

plt.bar(counter.keys(), counter.values())
plt.title('Distribution of Load types')
plt.show()

In [None]:
# Categorical plot of 'syst_direct_ipo_rate' for each 'cpu_load' value.
sns.catplot(x="cpu_load", y="syst_direct_ipo_rate", data=data)

In [None]:
# Categorical plot of 'syst_process_count' for each 'cpu_load' value.
sns.catplot(x="cpu_load", y="syst_process_count", data=data)

In [None]:
# Categorical plot of 'page_global_valid_fault_rate' for each 'cpu_load' value.
sns.catplot(x="cpu_load", y="page_global_valid_fault_rate", data=data)

In [None]:
# Categorical plot of 'ewc0_pkts_recvpsec' for each 'cpu_load' value.
sns.catplot(x="cpu_load", y="ewc0_pkts_recvpsec", data=data)

In [None]:
# Categorical plot of 'lla0_pkts_recvpsec' for each 'cpu_load' value.
sns.catplot(x="cpu_load", y="lla0_pkts_recvpsec", data=data)

## Preprocessing

In [None]:
def cat_encoder(X,column_name):
    """One-hot encode ``column_name`` and return the result as a new frame.

    One 0.0/1.0 float column is appended per distinct value found in
    ``X[column_name]``, named after that value, and ``column_name`` itself
    is removed. An indicator whose name matches an existing column
    overwrites that column. Rows with a missing value in ``column_name``
    get 0.0 in every indicator.

    The input frame is left untouched: the indicators are written to the
    copy produced by ``drop``, not to ``X``.

    Args:
        X: DataFrame containing ``column_name``.
        column_name: label of the categorical column to expand.

    Returns:
        A new DataFrame with ``column_name`` replaced by its indicators.
    """
    # dtype=float keeps the indicators as 0.0/1.0 rather than booleans.
    indicators = pd.get_dummies(X[column_name], dtype=float)
    encoded = X.drop(columns=[column_name])
    for category in indicators.columns:
        # Assign the raw values so no index alignment is involved.
        encoded[category] = indicators[category].to_numpy()
    return encoded

In [None]:
# Separate the 'cpu_load' target from the remaining columns.
y = data['cpu_load']
X = data.drop(columns='cpu_load')

In [None]:
# One-hot encode 'm_id', then discard the columns named a, b, c, d, e and g
# (presumably the indicators for those m_id values -- confirm against the data).
X = cat_encoder(X, 'm_id').drop(columns=['a', 'b', 'c', 'd', 'e', 'g'])

## Model Building

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the target.
# A fixed seed keeps the split, and every metric computed on it, the same
# from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Fit the model on the training split, then score the same configuration
# with 5-fold cross-validation over all of X/y and print the mean of the
# 'neg_log_loss' scores.
clf = HistGradientBoostingClassifier(
    loss='categorical_crossentropy',
    max_iter=170,
    l2_regularization=1,
    early_stopping=False,
)
clf.fit(X_train, y_train)
cross_hgbc = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=5)
print(cross_hgbc.mean())

In [None]:
# Predict the held-out split and print the per-class report followed by the
# confusion matrix.
predictions = clf.predict(X_test)
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, predictions))

In [None]:
# Rebuild the classifier with the same settings and fit it on all labelled
# rows before predicting the test set.
clf = HistGradientBoostingClassifier(
    loss='categorical_crossentropy',
    max_iter=170,
    l2_regularization=1,
    early_stopping=False,
)
clf.fit(X, y)

In [None]:
# Encode the test features the same way as the training features.
test_data = cat_encoder(test_data,'m_id')
# The encoder only sees the values present in the frame it is given, so the
# test indicators may differ from the training ones in number or order.
# Align to the exact training columns: columns the model never saw
# (including the a/b/c/d/e/g indicators removed from X) are dropped, and a
# column absent from the test set is filled with 0.0.
# NOTE(review): a 0.0 fill is right for an absent one-hot indicator; a
# missing *raw* feature would also be zero-filled -- confirm the test file
# carries every non-categorical training column.
test_data = test_data.reindex(columns=X.columns, fill_value=0.0)
predictions = clf.predict_proba(test_data)

In [None]:
def save_results(x_test,predictions,file_name):
    """Write the predicted class probabilities to ``solution_<file_name>.csv``.

    The sample submission file is read as a template; its 'id', 'low',
    'medium' and 'high' columns are overwritten and the frame is saved
    without the index.

    Args:
        x_test: frame supplying the 'id' column.
        predictions: 2-D array indexed as ``predictions[:, k]``; column 0 is
            written as 'high', column 1 as 'low' and column 2 as 'medium'.
        file_name: suffix inserted into the output file name.
    """
    # NOTE(review): these positions presumably mirror the classifier's class
    # order (high, low, medium) -- confirm against clf.classes_.
    probability_column = {'low': 1, 'medium': 2, 'high': 0}

    submission = pd.read_csv('train_project/sample_submission.csv')
    submission['id'] = x_test['id']
    for label, position in probability_column.items():
        submission[label] = predictions[:, position]
    submission.to_csv(f'solution_{file_name}.csv', index=False)

In [None]:
# Write the submission; the raw `test` frame is passed because it still has 'id'.
save_results(test,predictions,'add_h_l_m')

## Tried Approaches

### Eliminate highly correlated features (> 0.85)
### Find important features using tree based algorithms
### Feature selection using SelectKBest()
### Tried Recursive Feature Elimination, but it takes very long to return the important features
### Built a very intuitive model (with 8 features) and achieved 90% accuracy
### Tried to derive new variables, but unable to understand some of the variables.
### Tried to remove outliers, but unable to increase accuracy
### Tried Decision Tree, ExtraTree Classifier, Random Forest, Gradient Boosting Classifier, SGDClassifier, MLP, Stacking classifier, Voting classifier
### Used confidently predicted (probability > 0.99) data points from the test data to train the model along with the train data; log loss was reduced by 0.003.