In [1]:
#!pip install scikit-multilearn
import pandas as pd
import zipfile as zf
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import codecs
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import LabelPowerset
from time import time

#### Load the dataset

In [2]:
data_file = "./SIT788_3_2_Data/diagnosis.data"
udoc = codecs.open(data_file,'rU','UTF-16') #reading with "universal" type
df = pd.read_csv(udoc, sep='\t', decimal=',') #Reading tab seperated values and replacing ',' with '.'
df.columns =['temp', 'nausea', 'lumbar_pain', 'urine_pushing', 'micturian_pains', 
             'urethra_pain', 'urinary_bladder_inflammation', 'renal_pelvis_nephritis'] #Add relevant column names

#### Check for null values

In [3]:
print(df.isnull().sum())


temp                            0
nausea                          0
lumbar_pain                     0
urine_pushing                   0
micturian_pains                 0
urethra_pain                    0
urinary_bladder_inflammation    0
renal_pelvis_nephritis          0
dtype: int64


#### Print 1st 5 rows and data tyes of column

In [4]:
display(df.head(5))
display(df.dtypes)

Unnamed: 0,temp,nausea,lumbar_pain,urine_pushing,micturian_pains,urethra_pain,urinary_bladder_inflammation,renal_pelvis_nephritis
0,35.9,no,no,yes,yes,yes,yes,no
1,35.9,no,yes,no,no,no,no,no
2,36.0,no,no,yes,yes,yes,yes,no
3,36.0,no,yes,no,no,no,no,no
4,36.0,no,yes,no,no,no,no,no


temp                            float64
nausea                           object
lumbar_pain                      object
urine_pushing                    object
micturian_pains                  object
urethra_pain                     object
urinary_bladder_inflammation     object
renal_pelvis_nephritis           object
dtype: object

#### Replace categorical string values wth numerical values

In [5]:
df.replace(to_replace=['no', 'yes'], value=[0, 1], inplace=True)
display(df.head(5))
display(df.describe())
display(df.dtypes)


Unnamed: 0,temp,nausea,lumbar_pain,urine_pushing,micturian_pains,urethra_pain,urinary_bladder_inflammation,renal_pelvis_nephritis
0,35.9,0,0,1,1,1,1,0
1,35.9,0,1,0,0,0,0,0
2,36.0,0,0,1,1,1,1,0
3,36.0,0,1,0,0,0,0,0
4,36.0,0,1,0,0,0,0,0


Unnamed: 0,temp,nausea,lumbar_pain,urine_pushing,micturian_pains,urethra_pain,urinary_bladder_inflammation,renal_pelvis_nephritis
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,38.751261,0.243697,0.579832,0.672269,0.495798,0.420168,0.495798,0.420168
std,1.802346,0.431128,0.495673,0.471371,0.502096,0.495673,0.502096,0.495673
min,35.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,40.6,0.0,1.0,1.0,1.0,1.0,1.0,1.0
max,41.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


temp                            float64
nausea                            int64
lumbar_pain                       int64
urine_pushing                     int64
micturian_pains                   int64
urethra_pain                      int64
urinary_bladder_inflammation      int64
renal_pelvis_nephritis            int64
dtype: object

#### Split data into train and test

In [6]:
X, Y = df.loc[:, (df.columns != 'urinary_bladder_inflammation') & (df.columns != 'renal_pelvis_nephritis')], df[['renal_pelvis_nephritis', 'urinary_bladder_inflammation']]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.40, random_state=42)
print ("Data distribution")
print(Y[Y==0].count())
print(Y[Y==1].count())

Data distribution
renal_pelvis_nephritis          69
urinary_bladder_inflammation    60
dtype: int64
renal_pelvis_nephritis          50
urinary_bladder_inflammation    59
dtype: int64


#### Training DecisionTree

In [7]:
clf_dt = LabelPowerset(DecisionTreeClassifier())

millis_a = int(time() * 1000)
clf_dt.fit(x_train, y_train)
millis_b = int(time() * 1000)
dif = millis_b - millis_a
print ("Decsion Tree train time: ", dif)

millis_a = int(time() * 1000)
y_pred_dt = clf_dt.predict(x_test)
millis_b = int(time() * 1000)
dif = millis_b - millis_a
avg_t = dif / len(x_test)
print ("Decsion Tree time per prediction: ", avg_t)

Decsion Tree train time:  5
Decsion Tree time per prediction:  0.22916666666666666


#### Training Random Forest

In [8]:
clf_rf = LabelPowerset(RandomForestClassifier())

millis_a = int(time() * 1000)
clf_rf = clf_rf.fit(x_train, y_train)
millis_b = int(time() * 1000)
dif = millis_b - millis_a
print ("RF train time: ", dif)

millis_a = int(time() * 1000)
y_pred_rf = clf_rf.predict(x_test)
millis_b = int(time() * 1000)
dif = millis_b - millis_a
avg_t = dif / len(x_test)
print ("RF time per prediction: ", avg_t)

RF train time:  283
RF time per prediction:  0.6875


#### comparing both the models on different metrics

In [9]:
print("########################Decision tree metrics###############################")
print (classification_report(y_test, y_pred_dt, zero_division=0))
print("DT Accuracy: ", accuracy_score(y_test, y_pred_dt))
print(multilabel_confusion_matrix(y_test, y_pred_dt))

print("########################Random Forest metrics###############################")
print (classification_report(y_test, y_pred_rf, zero_division=0))
print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print(multilabel_confusion_matrix(y_test, y_pred_rf))


########################Decision tree metrics###############################
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      1.00      1.00        22

   micro avg       0.98      1.00      0.99        41
   macro avg       0.97      1.00      0.99        41
weighted avg       0.98      1.00      0.99        41
 samples avg       0.73      0.73      0.73        41

DT Accuracy:  0.9791666666666666
[[[28  1]
  [ 0 19]]

 [[26  0]
  [ 0 22]]]
########################Random Forest metrics###############################
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        22

   micro avg       1.00      1.00      1.00        41
   macro avg       1.00      1.00      1.00        41
weighted avg       1.00      1.00      1.00        41
 samples avg       0.73      0.73      0.73        41

RF Accu

#### Exporting and Saving zipped model

In [10]:
model_f = open('best_model.pkl', 'wb')
pickle.dump(clf_rf, model_f)
model_f.close()
zf.ZipFile('best_model.zip', mode = 'w').write('best_model.pkl')