In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv("../data/ml_df.csv")

In [3]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
num_labels = label_encoder.fit_transform(df.best_calibration_method)
cat_labels = label_encoder.inverse_transform(num_labels) 

df['best_calibration_method_label'] = num_labels 

In [4]:
del_cols = ['best_mae', 'best_calibration_method', 'second_best_mae',
       'second_best_calibration_method', 'third_best_mae',
       'third_best_calibration_method', 'model_name'] 

ml_df = df.drop(del_cols, axis=1) 
ml_df 

Unnamed: 0,amount_of_data,noise,num_compartments,best_calibration_method_label
0,27,9,3,11
1,28,9,3,11
2,29,9,3,11
3,30,9,3,11
4,31,9,3,11
...,...,...,...,...
1375,68,8,4,11
1376,69,8,4,4
1377,70,8,4,11
1378,71,8,4,11


In [5]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 

In [6]:
TEST_SPLIT_SIZE = 0.15 

In [7]:
# Function to split the dataset
def splitdataset(balance_data):

    # Separating the target variable
    X = balance_data.values[:, 0:3]
    Y = balance_data.values[:, 3]

    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size = TEST_SPLIT_SIZE, random_state = 100) 

    return X, Y, X_train, X_test, y_train, y_test

In [8]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):

#     print("Confusion Matrix: ",
#         confusion_matrix(y_test, y_pred))

    print ("Accuracy : ",
    accuracy_score(y_test,y_pred)*100)

#     print("Report : ",
#     classification_report(y_test, y_pred))


In [9]:
def prediction(X_test, clf): 
    y_pred = clf.predict(X_test) 
    return y_pred 

In [10]:
""" 
Decision Trees 
""" 

# Importing the required packages
from sklearn.tree import DecisionTreeClassifier

# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):

    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with entropy.
def train_using_entropy(X_train, X_test, y_train):

    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

data = ml_df 
X, Y, X_train, X_test, y_train, y_test = splitdataset(data) 

clf_gini = train_using_gini(X_train, X_test, y_train)
clf_entropy = train_using_entropy(X_train, X_test, y_train)

# Prediction using gini
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)


# Prediction using entropy
print("Results Using Entropy:")
y_pred_entropy = prediction(X_test, clf_entropy)

cal_accuracy(y_test, y_pred_entropy) 

Results Using Gini Index:
Accuracy :  63.76811594202898
Results Using Entropy:
Accuracy :  66.66666666666666


In [11]:
""" 
Logistic Regression 
""" 

from sklearn.linear_model import LogisticRegression 

logreg = LogisticRegression()
logreg.fit(X_train, y_train) 

# Logistic Regression Results 
y_pred_logi = prediction(X_test, logreg)
cal_accuracy(y_test, y_pred_logi) 

Accuracy :  53.62318840579711


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
""" 
C-Support Vector Classification 
""" 

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) 
clf.fit(X_train, y_train) 

# SVC Results 
y_pred_svc = prediction(X_test, clf)
cal_accuracy(y_test, y_pred_svc) 

Accuracy :  70.04830917874396


In [13]:
""" 
KNN 
""" 

from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train) 

# Logistic Regression Results 
print("Results Using Gini Index:")
y_pred_knn = prediction(X_test, neigh)
cal_accuracy(y_test, y_pred_knn) 

Results Using Gini Index:
Accuracy :  63.76811594202898


In [14]:
""" 
Naive-Bayes 
"""
from sklearn.naive_bayes import GaussianNB 

gnb = GaussianNB() 
gnb.fit(X_train, y_train) 

y_pred_nb = prediction(X_test, gnb)
cal_accuracy(y_test, y_pred_nb)  

Accuracy :  17.874396135265698


In [15]:
from sklearn.neural_network import MLPClassifier 

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1) 
clf_2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32), random_state=1) 
clf_3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(64, 64), random_state=1) 
clf_4 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 128), random_state=1) 
clf_5 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32, 32), random_state=1) 

for i in [clf, clf_2, clf_3, clf_4, clf_5]: 
    i.fit(X_train, y_train) 

    y_pred_nn = prediction(X_test, i) 
    print(i)
    cal_accuracy(y_test, y_pred_nn)  

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(10, 10), random_state=1,
              solver='lbfgs')
Accuracy :  46.3768115942029


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(32, 32), random_state=1,
              solver='lbfgs')
Accuracy :  66.18357487922705


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(64, 64), random_state=1,
              solver='lbfgs')
Accuracy :  67.14975845410628


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(128, 128), random_state=1,
              solver='lbfgs')
Accuracy :  62.31884057971014
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(32, 32, 32), random_state=1,
              solver='lbfgs')
Accuracy :  68.11594202898551


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
