In [51]:
import numpy as np
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from scipy.io import arff
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [16]:
data, meta = arff.loadarff('dataSet.arff')
df = pd.DataFrame(data)
df = df.astype(int)
X = df.drop('Result', axis=1)
Y = df['Result']

label_encoder = LabelEncoder()
Y_integer = label_encoder.fit_transform(Y)

# Y_binary = (Y + 1) // 2
# print(Y_binary[0:5])
# print(Y.head())


from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_integer, test_size = 0.2, random_state = 2023)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
lr=LogisticRegression(max_iter=5000, C=0.1)
svm = SVC(kernel="rbf")
rf = RandomForestClassifier(max_depth=20, n_estimators=200)
xgb = XGBClassifier()

voting_clf = VotingClassifier(estimators=[('lr', lr), ('svm', svm), ('rf', rf), ('xgb', xgb)], voting="hard")

voting_clf.fit(X_train, Y_train)

In [53]:
print("Accuracy on test: " + str(voting_clf.score(X_test, Y_test)))
print("Accuracy on train: "+ str(voting_clf.score(X_train, Y_train)))

Accuracy on test: 0.9601990049751243
Accuracy on train: 0.978629579375848


In [56]:
param_grid = {
    'lr__C': [0.1, 1, 10],
    'svm__C': [0.1, 1, 10],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5, scoring='accuracy', verbose = 1)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [57]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print('Best parameters: ', best_params)
print('Best model', best_model)

Best parameters:  {'lr__C': 0.1, 'svm__C': 10, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 7, 'xgb__n_estimators': 200}
Best model VotingClassifier(estimators=[('lr', LogisticRegression(C=0.1, max_iter=5000)),
                             ('svm', SVC(C=10)),
                             ('rf',
                              RandomForestClassifier(max_depth=20,
                                                     n_estimators=200)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval

In [58]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9624604251469923


In [60]:
import joblib
joblib.dump(best_model, 'vclf.pkl')

['vclf.pkl']

In [64]:
import MetricFunctions
MetricFunctions.printAccuracy(best_model, X_train, X_test, Y_train, Y_test)
MetricFunctions.printLoss(best_model, X_train, X_test, Y_train, Y_test)

ModuleNotFoundError: No module named 'scikitplot'