In [406]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

Membaca data

In [407]:
df = pd.read_csv("train.csv")

In [408]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

Menghapus kolom yang tidak digunakan

In [409]:
df = df.drop_duplicates()
df = df.drop(columns=['id','hospital_number'])

In [410]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                1235 non-null   object 
 1   age                    1235 non-null   object 
 2   rectal_temp            1235 non-null   float64
 3   pulse                  1235 non-null   float64
 4   respiratory_rate       1235 non-null   float64
 5   temp_of_extremities    1196 non-null   object 
 6   peripheral_pulse       1175 non-null   object 
 7   mucous_membrane        1214 non-null   object 
 8   capillary_refill_time  1229 non-null   object 
 9   pain                   1191 non-null   object 
 10  peristalsis            1215 non-null   object 
 11  abdominal_distention   1212 non-null   object 
 12  nasogastric_tube       1155 non-null   object 
 13  nasogastric_reflux     1214 non-null   object 
 14  nasogastric_reflux_ph  1235 non-null   float64
 15  rect

Inputasi NaN value

In [411]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode().iloc[0])
    else:
        df[column] = df[column].fillna(df[column].mean())

In [412]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                1235 non-null   object 
 1   age                    1235 non-null   object 
 2   rectal_temp            1235 non-null   float64
 3   pulse                  1235 non-null   float64
 4   respiratory_rate       1235 non-null   float64
 5   temp_of_extremities    1235 non-null   object 
 6   peripheral_pulse       1235 non-null   object 
 7   mucous_membrane        1235 non-null   object 
 8   capillary_refill_time  1235 non-null   object 
 9   pain                   1235 non-null   object 
 10  peristalsis            1235 non-null   object 
 11  abdominal_distention   1235 non-null   object 
 12  nasogastric_tube       1235 non-null   object 
 13  nasogastric_reflux     1235 non-null   object 
 14  nasogastric_reflux_ph  1235 non-null   float64
 15  rect

In [413]:
new_df = pd.DataFrame()

Encoding data

In [414]:
for column in df.columns[:-1]:
    if df[column].dtype == 'object':
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded_df = pd.DataFrame(encoder.fit_transform(df[[column]])).copy()
        encoded_df.columns = encoder.get_feature_names_out()
        new_df = pd.concat([new_df, encoded_df], axis=1)
    else:
        temp = pd.DataFrame(df[column])
        new_df = pd.concat([new_df, temp], axis=1)
new_df = pd.concat([new_df,pd.DataFrame(df[df.columns[-1]])],axis=1)



In [415]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 71 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   surgery_no                        1235 non-null   float64
 1   surgery_yes                       1235 non-null   float64
 2   age_adult                         1235 non-null   float64
 3   age_young                         1235 non-null   float64
 4   rectal_temp                       1235 non-null   float64
 5   pulse                             1235 non-null   float64
 6   respiratory_rate                  1235 non-null   float64
 7   temp_of_extremities_cold          1235 non-null   float64
 8   temp_of_extremities_cool          1235 non-null   float64
 9   temp_of_extremities_normal        1235 non-null   float64
 10  temp_of_extremities_warm          1235 non-null   float64
 11  peripheral_pulse_absent           1235 non-null   float64
 12  periph

In [416]:
df = new_df

Membagi data menjadi X dan Y

In [417]:
x = df[df.columns[:-1]]
y = df[df.columns[-1]]

Reduksi data

In [418]:
pca = PCA()
pca.fit(x)
pca.explained_variance_ratio_

array([9.98400200e-01, 1.39201314e-03, 1.42110001e-04, 3.13153619e-05,
       2.38786022e-05, 7.23522426e-06, 2.81678157e-06, 9.65460625e-08,
       5.72040313e-08, 2.79100579e-08, 2.10790646e-08, 1.80290162e-08,
       1.62567255e-08, 1.40038312e-08, 1.36872679e-08, 1.22995615e-08,
       1.12938575e-08, 1.05511084e-08, 1.01450653e-08, 9.37056397e-09,
       9.14956461e-09, 8.77812249e-09, 7.89073438e-09, 7.51356232e-09,
       6.94962755e-09, 6.84327596e-09, 6.01811978e-09, 5.76947795e-09,
       5.31702690e-09, 4.88149530e-09, 4.43227860e-09, 4.18278576e-09,
       3.67215225e-09, 3.52943625e-09, 3.40611785e-09, 3.13023838e-09,
       3.00146540e-09, 2.74911292e-09, 2.29883826e-09, 2.14547809e-09,
       1.84026603e-09, 1.64419464e-09, 1.59366470e-09, 1.42573150e-09,
       1.33813040e-09, 1.22434026e-09, 7.61754472e-10, 6.49656412e-10,
       1.31004229e-10, 7.87758090e-11, 3.46432994e-11, 3.34408948e-11,
       3.21959248e-11, 3.19135718e-11, 9.90991656e-33, 9.90991656e-33,
      

In [419]:
x = pd.DataFrame(pca.transform(x),columns=pca.get_feature_names_out())

In [420]:
x = normalize(x)
x = pd.DataFrame(x,columns=pca.get_feature_names_out())

In [421]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [422]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dt = DecisionTreeClassifier()
svm = SVC()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

Memasukkan data train ke model

In [423]:
knn.fit(x_train,y_train)
nb.fit(x_train,y_train)
dt.fit(x_train,y_train)
svm.fit(x_train,y_train)
rf.fit(x_train,y_train)
gb.fit(x_train,y_train)

Melakukan testing

In [424]:
knn_predict = knn.predict(x_test)
nb_predict = nb.predict(x_test)
dt_predict = dt.predict(x_test)
svm_predict = svm.predict(x_test)
rf_predict = rf.predict(x_test)
gb_predict = gb.predict(x_test)

Hasil Akurasi

In [425]:
print("knn accuracy : ")
print(accuracy_score(y_test,knn_predict))

knn accuracy : 
0.6275303643724697


In [426]:
print("nb accuracy : ")
print(accuracy_score(y_test,nb_predict))

nb accuracy : 
0.5668016194331984


In [427]:
print("dt accuracy : ")
print(accuracy_score(y_test,dt_predict))

dt accuracy : 
0.5748987854251012


In [428]:
print("svm accuracy : ")
print(accuracy_score(y_test,svm_predict))

svm accuracy : 
0.46963562753036436


In [429]:
print("rf accuracy : ")
print(accuracy_score(y_test,rf_predict))

rf accuracy : 
0.6720647773279352


In [430]:
print("gb accuracy : ")
print(accuracy_score(y_test,gb_predict))

gb accuracy : 
0.6639676113360324


In [431]:
knn_predict = knn_predict.tolist()
nb_predict = nb_predict.tolist()
dt_predict = dt_predict.tolist()
svm_predict = svm_predict.tolist()
rf_predict = rf_predict.tolist()
gb_predict = gb_predict.tolist()

Voting untuk menggabungkan data-datanya


In [432]:
final_result = []
for i in range (len(knn_predict)):
    temp = [knn_predict[i],nb_predict[i],dt_predict[i],svm_predict[i],rf_predict[i],gb_predict[i]]
    lived_counter = temp.count('lived')
    euthanized_counter = temp.count('euthanized')
    died_counter = temp.count('died')
    temp = [lived_counter,euthanized_counter,died_counter]
    if (max(temp) == lived_counter):
        final_result.append('lived')
    elif (max(temp) == died_counter):
        final_result.append('died')
    else:
        final_result.append('euthanized')

Akurasi akhir

In [433]:
final_result = pd.Series(final_result)
print('final accuracy: ')
print(accuracy_score(y_test,final_result))

final accuracy: 
0.659919028340081
