In [None]:
import pandas as pd                                    # for reading csv file
import matplotlib.pyplot as plt                        # for data visualization
from sklearn import tree
import numpy as np                                     # for linear algebra

In [None]:
def create_piechart(data, column):
    """
    Objective: Create a pie chart for a categorical variable present in a pandas DataFrame

    parameter:
        data: this is the pandas DataFrame
        column: this is the column name whose value counts are plotted

    return:
        None; the pie chart is rendered with plt.show()
    """
    # Count each category once and reuse the result for both labels and sizes
    counts = data[column].value_counts().to_dict()
    labels = list(counts.keys())
    sizes = list(counts.values())

    plt.pie(sizes,
            labels=labels,
            autopct='%1.2f%%',
            shadow=False,
            startangle=45)

    plt.axis('equal')
    plt.title("Piechart - {}".format(column))
    plt.show()



def check_missing_value(data):
    """
    Objective: Summarise missing values per column as a count and a percentage

    data: input data frame
    return: data frame indexed by column name with a 'Total' column (number
            of nulls) and a 'Percent' column (nulls / rows * 100); both inputs
            are sorted in descending order before being combined
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()

    total = null_counts.sort_values(ascending=False)
    # count() on the boolean mask gives the number of rows for every column
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)

    return pd.concat({'Total': total, 'Percent': percent}, axis=1)

In [None]:
# Load the raw data set from the project-relative Data directory
df = pd.read_csv("Data/health_care_data.csv")

# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Number of missing values in each column of the freshly loaded frame
df.isnull().sum()

In [None]:
# Column dtypes; 'object' columns are the ones collected for label encoding later
df.dtypes

In [None]:
# Missing-value count and percentage for every column
miss = check_missing_value(df)
miss

In [None]:
# Names of the columns stored with the 'object' dtype (treated as categorical)
colname_cat = [name for name in df.columns if df[name].dtypes == 'object']
colname_cat

In [None]:
# Columns whose missing values are replaced by the column's most frequent value
colname1 = ["bmi"]

for x in colname1:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df[x] selection: the in-place form acts on an intermediate object,
    # is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
    df[x] = df[x].fillna(df[x].mode()[0])

In [None]:
# Re-check missing values per column after the imputation step
df.isnull().sum()

In [None]:
from sklearn import preprocessing

# One encoder object is reused; fit_transform refits it for every column
le = preprocessing.LabelEncoder()

for col in colname_cat:
    encoded = le.fit_transform(df[col])
    df[col] = encoded
    # Record which integer code was given to each original category
    le_name = dict(zip(le.classes_, le.transform(le.classes_)))
    print("Feature: ", col)
    print("Mapping: ", le_name)

In [None]:
# Display the frame after imputation and label encoding
df

In [None]:
# Every column except the last is used as a feature; the last column is the target
X, y = df.values[:, :-1], df.values[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

# Confirm that features and targets line up in each split
print(X_train_scale.shape, y_train.shape)
print(X_test_scale.shape, y_test.shape)

In [None]:
from sklearn import tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree, limited in depth and leaf size, trained on scaled features
model_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=20,
    random_state=30,
)

model_dt.fit(X_train_scale, y_train)

In [None]:
# Depth actually reached by the fitted tree (the constructor caps it at max_depth=10)
model_dt.get_depth()

In [None]:
# Predict on the scaled test split
y_pred = model_dt.predict(X_test_scale)

# Print every (actual, predicted) pair side by side
print([(actual, predicted) for actual, predicted in zip(y_test, y_pred)])

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Print the confusion matrix, per-class report and overall accuracy, in that order
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# NOTE(review): this cell evaluates the same y_test / y_pred pair as the
# preceding evaluation cell and prints the same three results - confirm
# whether it is still needed.
cm_report = confusion_matrix(y_test, y_pred)
print(cm_report)

class_report = classification_report(y_test, y_pred)
print(class_report)

overall_accuracy = accuracy_score(y_test, y_pred)
print(overall_accuracy)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Second tree with the same hyper-parameters as model_dt
dtree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=20,
    random_state=30,
)
dtree

In [None]:
# Fit on the unscaled training features (model_dt above was fitted on X_train_scale)
dtree.fit(X_train, y_train)

In [None]:
# Mean accuracy of dtree on the same data it was fitted on
print("Training Accuarcy:", dtree.score(X_train,y_train))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest configured with the same split criterion and size limits as the trees above
rfc = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=20,
    random_state=30,
)

In [None]:
# Fit the forest on the unscaled training split
rfc.fit(X_train, y_train)

In [None]:
# Mean accuracy of the forest on the data it was fitted on
print("Training accuracy: ",rfc.score(X_train,y_train))

In [None]:
# Score the forest that was fitted on the training split against the held-out
# test split. The model must not be refitted on (X_test, y_test) first: that
# would report accuracy on data the model has already seen, and the figure
# would not be a test accuracy at all.
print("Test accuracy: ", rfc.score(X_test, y_test))