In [1]:
import numpy as np
import pandas as pd
# Widen pandas' display limits so wide frames are not truncated when shown:
# up to 100 columns, and up to 100 characters per cell.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

# For imputing (filling in) missing values
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw table and turn the file's missing-value marker into NaN.
# The marker is '?', and the earlier clean-up listed the padded spellings
# '?\t' and '\t?' by hand. The anchored pattern below matches those three
# plus any other whitespace padding around a lone '?' (e.g. ' ?' or '?\t\t'),
# so no variant survives as a literal string.
# replace() returns a new frame instead of mutating in place, so re-running
# this cell always starts again from the file contents.
MISSING_MARKER = r'^\s*\?\s*$'
df = pd.read_csv('kidneyChronic.csv').replace(MISSING_MARKER, np.nan, regex=True)
# NOTE(review): tab-padded '?' markers suggest other cells may carry stray
# whitespace as well; a padded category would later be encoded as its own
# value. Worth checking with df[col].unique() -- TODO confirm against the CSV.

In [3]:
# Column groups; each group is given its own preprocessing further down.
# Columns handled as numeric (11 columns).
numerical_columns = 'age bp bgr bu sc sod pot hemo pcv wbcc rbcc'.split()

# Columns handled as nominal/categorical (14 columns).
# NOTE(review): 'class' looks like the prediction target; listing it here means
# it is processed like any other nominal column -- confirm that is intended.
nominal_columns = 'sg al su rbc pc pcc ba htn dm cad appet pe ane class'.split()

In [4]:
# One imputer per column group:
#   numeric gaps (NaN) -> mean of the column
#   nominal gaps       -> most frequent value of the column
imp_numeric = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_nominal = SimpleImputer(strategy='most_frequent')

In [5]:
# Fill the gaps one column group at a time (nominal first, then numeric) and
# write the completed values back over the same columns of df.
# NOTE(review): both imputers are fit on the whole table here, i.e. before the
# train/test split made later, and nominal_columns also contains 'class' --
# consider fitting on the training rows only.
for group_columns, group_imputer in (
    (nominal_columns, imp_nominal),
    (numerical_columns, imp_numeric),
):
    df[group_columns] = group_imputer.fit_transform(df[group_columns])

In [6]:
# Encoder used to turn nominal values into integer codes.
# NOTE(review): this one instance is refit on every nominal column in the
# following cell, so afterwards it only reflects the last column it was fit on.
labelencoder = LabelEncoder()

In [7]:
# Swap every nominal column for its integer codes, refitting the shared
# encoder on one column at a time in the order given by nominal_columns.
for column_name in nominal_columns:
    df[column_name] = labelencoder.fit_transform(df[column_name])

In [8]:
# Every column except the last is a feature; the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [9]:
# Build a decision tree classifier with a fixed seed and train it on the
# training split; fit() hands back the estimator, so both steps are chained.
clf = DecisionTreeClassifier(random_state=2).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = clf.predict(X_test)

In [10]:
# Score the held-out predictions first, then report both metrics.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion matrix: \n", test_confusion)

Accuracy: 1.0
Confusion matrix: 
 [[54  0]
 [ 0 26]]
