In [1]:
#imports
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np
from math import sqrt
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

from numpy import random
from sklearn.model_selection import train_test_split
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [2]:
# Load the training and test tables from the two CSV files in the working directory.
trainset, testset = (
    pd.read_csv(csv_name) for csv_name in ("Training.csv", "Testing.csv")
)

In [3]:
# Drop exact duplicate rows from the training table.
trainset = trainset.drop_duplicates()

# Encode the prognosis labels as integers and store them in a new column.
le = LabelEncoder()
trainset["prognosis_encoded"] = le.fit_transform(trainset["prognosis"])

# Target vector: the encoded labels. Feature matrix: every remaining column.
y = trainset["prognosis_encoded"].to_numpy()
x = trainset.drop(columns=["prognosis", "prognosis_encoded"]).to_numpy()

In [4]:
# Separate the data into training and validation parts: 20% is held out,
# stratified on y, with a fixed seed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=45,
)

In [5]:
# Print the full feature matrix x (built before the train/validation split);
# NumPy abbreviates large arrays with "..." in the printed output.
print(x)

[[1 3 4 ... 0 0 0]
 [0 3 4 ... 0 0 0]
 [1 0 4 ... 0 0 0]
 ...
 [0 3 0 ... 0 2 3]
 [0 3 0 ... 4 0 3]
 [0 3 0 ... 4 2 0]]


In [6]:
# Encode the test-set prognosis labels with the encoder that was already
# fitted on the training labels. Calling fit_transform here would refit the
# encoder on the test labels and could assign different integer codes than the
# ones the models were trained on; transform() reuses the fitted mapping and
# raises ValueError if the test set contains a label the encoder has not seen.
testset['prognosis_encoded'] = le.transform(testset.prognosis)
target = testset['prognosis']  # original string labels, kept alongside the codes
y_test = testset['prognosis_encoded'].values
x_test = testset.drop(columns=['prognosis_encoded', 'prognosis']).values

In [7]:
# Multinomial logistic regression with an L2 penalty, fitted on the training split.
model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='saga',
    multi_class='multinomial',
    max_iter=10000,
    random_state=42,
)
model.fit(x_train, y_train)

# Predict the held-out validation split and report its accuracy.
y_pred = model.predict(x_val)
acc_score = accuracy_score(y_val, y_pred)
print("val score: ",acc_score)

# 5-fold cross-validation over the full training data (x, y).
scores = cross_val_score(model, x, y, cv=5, n_jobs=1)
print("cross val score",scores)

val score:  1.0
cross val score [1.         1.         1.         1.         0.98333333]


In [8]:
# DISCARDED MODEL
# Gaussian naive Bayes classifier; fit() returns the estimator itself, so the
# fitted model is bound directly to nbn.
nbn = GaussianNB().fit(x_train, y_train)

# Validation-split predictions and their accuracy.
y_pred = nbn.predict(x_val)
acc_score = accuracy_score(y_true=y_val, y_pred=y_pred)
print("val score: ",acc_score)

# 5-fold cross-validation over the full training data (x, y).
scores = cross_val_score(estimator=nbn, X=x, y=y, cv=5, n_jobs=1)
print("cross val score",scores)

val score:  0.9672131147540983
cross val score [0.98360656 0.96721311 0.93442623 0.96721311 0.91666667]


In [9]:
# Categorical naive Bayes classifier with smoothing alpha=0.3,
# fitted on the training split.
cbn = CategoricalNB(alpha=0.3)
cbn.fit(X=x_train, y=y_train)

# Score the model on the validation split.
y_pred = cbn.predict(x_val)
acc_score = accuracy_score(y_val, y_pred)
print("val score: ",acc_score)

# Score the same configuration with 5-fold cross-validation on (x, y).
scores = cross_val_score(
    cbn,
    x,
    y,
    cv=5,
    n_jobs=1,
)
print("cross val score",scores)

val score:  0.9672131147540983
cross val score [1.         0.98360656 0.98360656 0.91803279 0.93333333]


In [10]:
# DISCARDED MODEL
# Bernoulli naive Bayes classifier; binarize=0 thresholds every feature at 0
# before fitting, alpha=0.1 is the smoothing strength.
bbn = BernoulliNB(binarize=0, alpha=0.1)
bbn.fit(x_train, y_train)

# Validation-split predictions and their accuracy.
y_pred = bbn.predict(x_val)
acc_score = accuracy_score(y_true=y_val, y_pred=y_pred)
print("val score: ",acc_score)

# 5-fold cross-validation over the full training data (x, y).
scores = cross_val_score(estimator=bbn, X=x, y=y, cv=5, n_jobs=1)
print("cross val score",scores)

val score:  1.0
cross val score [1. 1. 1. 1. 1.]


In [11]:
# k-nearest-neighbours classifier: 10 neighbours, Minkowski distance with p=1,
# ball-tree neighbour search.
knn_model = KNeighborsClassifier(
    n_neighbors=10,
    p=1,
    algorithm='ball_tree',
).fit(x_train, y_train)

# Score the model on the validation split.
y_pred = knn_model.predict(x_val)
acc_score = accuracy_score(y_val, y_pred)
print("val score: ",acc_score)

# Score the same configuration with 5-fold cross-validation on (x, y).
scores = cross_val_score(knn_model, x, y, cv=5, n_jobs=1)
print("cross val score",scores)

val score:  0.9672131147540983
cross val score [0.95081967 0.86885246 0.95081967 0.91803279 0.95      ]


In [12]:
# Soft-margin SVM with a degree-2 polynomial kernel (coef0=5, C=5),
# fitted on the training split.
svm = SVC(
    C=5,
    kernel='poly',
    degree=2,
    coef0=5,
)
svm.fit(x_train, y_train)

# Validation-split predictions and their accuracy.
y_pred = svm.predict(x_val)
acc_score = accuracy_score(y_true=y_val, y_pred=y_pred)
print("val score: ",acc_score)

# 5-fold cross-validation over the full training data (x, y).
scores = cross_val_score(estimator=svm, X=x, y=y, cv=5, n_jobs=1)
print("cross val score",scores)

val score:  0.9672131147540983
cross val score [1.         0.96721311 0.95081967 0.98360656 0.91666667]


In [14]:
# Random forest classifier model.
# random_state is fixed so the bootstrap samples and per-split feature
# sub-sampling are repeatable; without it the three scores printed below
# change on every run and the notebook output cannot be reproduced.
rf = RandomForestClassifier(n_estimators=20, max_depth=9, random_state=42)
rf.fit(x_train, y_train)

# Prediction on validation data
y_pred = rf.predict(x_val)

# Accuracy score of the validation prediction
acc_score = accuracy_score(y_val, y_pred)
print("val score: ",acc_score)

# Cross-validation score on the full training dataset; cross_val_score fits
# clones of rf, so the fitted rf used for the test-set prediction below is
# left unchanged.
scores = cross_val_score(rf, x, y, cv=5, n_jobs=1)
print("cross val score: ",scores)

# Accuracy score of the prediction on the test set
y_pred_test = rf.predict(x_test)
acc_score = accuracy_score(y_test, y_pred_test)
print("test set score: ",acc_score)

val score:  0.9180327868852459
cross val score:  [0.91803279 0.93442623 0.83606557 0.93442623 0.85      ]
test set score:  0.926829268292683


In [17]:
# Voting classifier model: combines the five tuned base models. No `voting`
# argument is passed, so the scikit-learn default (hard / majority voting) applies.
#
# Changes relative to the earlier version of this cell:
#   * the logistic regression uses max_iter=10000, the same iteration budget as
#     the standalone model with these hyper-parameters (it was 1000 here);
#   * the random forest gets a fixed random_state so the ensemble's scores are
#     repeatable from run to run.
# NOTE: the estimator key 'kkn' looks like a typo for 'knn'; it is kept as-is
# because the key is a runtime identifier (named_estimators_, set_params).
vc = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42, C=0.1, solver='saga', penalty='l2',
                                  multi_class='multinomial', max_iter=10000)),
        ('svm', SVC(kernel='poly', degree=2, coef0=5, C=5)),
        ('rfc', RandomForestClassifier(n_estimators=20, max_depth=9, random_state=42)),
        ('kkn', KNeighborsClassifier(n_neighbors=10, p=1, algorithm='ball_tree')),
        ('cbn', CategoricalNB(alpha=0.3)),
    ]
)
vc.fit(x_train, y_train)

# Prediction on validation data
y_pred = vc.predict(x_val)

# Accuracy score of the validation prediction
acc_score = accuracy_score(y_val, y_pred)
print("val score: ",acc_score)

# Cross-validation score on the full training dataset; cross_val_score fits
# clones of vc, so the fitted vc used for the test-set prediction below is
# left unchanged.
scores = cross_val_score(vc, x, y, cv=5, n_jobs=1)
print("cross val score: ",scores)

# Accuracy score of the prediction on the test set
y_pred_test = vc.predict(x_test)
acc_score = accuracy_score(y_test, y_pred_test)
print("test set score: ",acc_score)

val score:  0.9836065573770492
cross val score:  [1.         0.98360656 1.         0.98360656 0.98333333]
test set score:  1.0
