In [1]:
import keras
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the maternal-risk CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
csv_path = '/Users/HP/Desktop/Datasets/Maternal_Risk.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [3]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808 entries, 0 to 807
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          808 non-null    int64  
 1   SystolicBP   808 non-null    int64  
 2   DiastolicBP  808 non-null    int64  
 3   BS           808 non-null    float64
 4   BodyTemp     808 non-null    float64
 5   HeartRate    808 non-null    int64  
 6   RiskLevel    808 non-null    object 
dtypes: float64(2), int64(4), object(1)
memory usage: 44.3+ KB


In [4]:
# Count missing values per column (isnull is pandas' alias for isna).
df.isnull().sum()

Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64

In [5]:
# Does any row exactly repeat an earlier one?
repeat_mask = df.duplicated()
duplicates = repeat_mask.any()
duplicates


True

In [6]:
# Keep the first occurrence of every repeated row and discard the rest.
# The original index labels are kept (not reset), so the index has gaps afterwards.
df = df.drop_duplicates(keep='first')

In [7]:
# Re-run the duplicate check on the current frame; after de-duplication
# this is expected to be False.
remaining_repeat_mask = df.duplicated()
duplicates = remaining_repeat_mask.any()
duplicates

False

In [8]:
# List the column labels (six predictors plus the RiskLevel target).
df.columns

Index(['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate',
       'RiskLevel'],
      dtype='object')

In [9]:
# Distinct Age values, in order of first appearance.
df.Age.unique()


array([25, 35, 29, 30, 23, 42, 15, 50, 10, 40, 21, 18, 16, 19, 22, 49, 28,
       20, 12, 60, 55, 45, 32, 48, 34, 38, 39, 63, 54, 14, 37, 17, 24, 31,
       27, 13, 59, 33, 43, 41, 46, 51, 62, 65, 66, 56, 70, 36])

In [10]:
# Distinct systolic blood-pressure readings, in order of first appearance.
df.SystolicBP.unique()

array([130, 140,  90, 120,  85, 110,  70, 100,  75,  95,  76,  80, 135,
       160,  99,  78,  83, 129])

In [11]:
# Distinct diastolic blood-pressure readings, in order of first appearance.
df.DiastolicBP.unique()

array([ 80,  90,  70,  85,  60,  89, 100,  50,  65,  75,  95,  49,  76,
        68,  63,  69])

In [12]:
# Distinct BS values, in order of first appearance.
# NOTE(review): BS presumably stands for blood sugar — confirm against the dataset docs.
df.BS.unique()

array([15.  , 13.  ,  8.  ,  7.  ,  6.1 ,  7.01, 11.  , 18.  ,  6.9 ,
        7.5 ,  7.2 ,  7.1 ,  6.7 ,  6.4 ,  6.8 ,  7.9 , 17.  ,  9.  ,
       19.  , 16.  ,  6.  ,  6.5 , 12.  ,  6.6 ,  7.6 , 10.  ,  7.7 ,
        6.3 ,  7.8 ])

In [13]:
# Distinct body-temperature readings, in order of first appearance.
df.BodyTemp.unique()

array([ 98. , 100. , 102. , 101. ,  99. ,  98.4, 103. ,  98.6])

In [14]:
# Distinct heart-rate readings, in order of first appearance.
# NOTE(review): the recorded output contains a heart rate of 7, which looks like
# a data-entry error — confirm and clean it before trusting the model.
df.HeartRate.unique()

array([86, 70, 80, 76, 77, 90, 66, 82, 88, 60, 75, 78,  7, 67, 65])

In [15]:
# Distinct target labels present in the data (before integer encoding).
df.RiskLevel.unique()

array(['high risk', 'low risk'], dtype=object)

In [16]:
# Integer encoding for the target: 0 = low risk, 1 = high risk.
RiskLevel = {
    'low risk': 0,
    # 'mid risk': 2,  # label does not occur in this dataset's RiskLevel column
    'high risk': 1,
}

# Encode only while the column still holds text labels. Without this guard,
# re-running the cell would map the already-encoded 0/1 values through the
# dict, produce all-NaN, and crash in astype(int).
if not pd.api.types.is_integer_dtype(df['RiskLevel']):
    risk_encoded = df['RiskLevel'].map(RiskLevel)

    # Fail loudly, naming the offending labels, instead of letting astype(int)
    # raise an opaque "cannot convert NaN to integer" error.
    unmapped = df.loc[risk_encoded.isna(), 'RiskLevel'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped RiskLevel labels: {list(unmapped)}")

    # assign() builds a new frame, which avoids writing into a frame that was
    # derived from another one via drop_duplicates().
    df = df.assign(RiskLevel=risk_encoded.astype(int))
df

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,1
1,35,140,90,13.0,98.0,70,1
2,29,90,70,8.0,100.0,80,1
3,30,140,85,7.0,98.0,70,1
4,35,120,60,6.1,98.0,76,0
...,...,...,...,...,...,...,...
689,65,130,80,15.0,98.0,86,1
690,35,140,80,13.0,98.0,70,1
691,29,90,70,10.0,98.0,80,1
800,20,120,75,7.5,98.0,70,0


In [17]:
# Class balance of the encoded target (rows per class, most frequent first).
df.RiskLevel.value_counts()

RiskLevel
0    233
1    112
Name: count, dtype: int64

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
count,345.0,345.0,345.0,345.0,345.0,345.0,345.0
mean,29.42029,110.014493,75.655072,8.48971,98.643478,74.005797,0.324638
std,14.098291,18.651558,14.171495,2.944493,1.392116,8.467789,0.468919
min,10.0,70.0,49.0,6.0,98.0,7.0,0.0
25%,19.0,90.0,65.0,6.9,98.0,70.0,0.0
50%,25.0,120.0,80.0,7.5,98.0,76.0,0.0
75%,36.0,120.0,90.0,7.9,98.0,80.0,1.0
max,70.0,160.0,100.0,19.0,103.0,90.0,1.0


In [19]:
# Predictors are every column except the target; the target is the encoded RiskLevel.
feature_cols = [
    'Age',
    'SystolicBP',
    'DiastolicBP',
    'BS',
    'BodyTemp',
    'HeartRate',
]
X = df.loc[:, feature_cols]
y = df['RiskLevel']

In [20]:
# Preview the feature matrix.
X

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,25,130,80,15.0,98.0,86
1,35,140,90,13.0,98.0,70
2,29,90,70,8.0,100.0,80
3,30,140,85,7.0,98.0,70
4,35,120,60,6.1,98.0,76
...,...,...,...,...,...,...
689,65,130,80,15.0,98.0,86
690,35,140,80,13.0,98.0,70
691,29,90,70,10.0,98.0,80
800,20,120,75,7.5,98.0,70


In [21]:
# Preview the target vector.
y

0      1
1      1
2      1
3      1
4      0
      ..
689    1
690    1
691    1
800    0
805    0
Name: RiskLevel, Length: 345, dtype: int64

In [22]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the classes are imbalanced (see the value_counts output), so
# consider passing stratify=y to keep the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [23]:
# Sanity-check the training split: (rows, feature columns).
X_train.shape

(241, 6)

In [24]:
# model = DecisionTreeClassifier()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)


In [25]:
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


In [26]:
# Fit a shallow (depth-2) decision tree that splits on information gain
# (entropy) and predict on the held-out rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; without a seed, ties between equally good splits can resolve
# differently from run to run, so the tree and its accuracy are not reproducible.
model = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [27]:
# Fraction of held-out rows whose predicted class matches the true class.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.9615384615384616


In [28]:
# pip install pydotplus

In [29]:
# from sklearn import tree
# import graphviz
# # DOT data
# dot_data = tree.export_graphviz(model, out_file=None, 
#                                 feature_names=feature_cols,  
#                                 class_names=df.columns,
#                                 filled=True)

# # Draw graph
# graph = graphviz.Source(dot_data, format="png") 
# graph

In [30]:
import pickle

# Persist the fitted model to model.pkl in the current working directory.
# The with-block closes the file even if pickle.dump raises, so the handle is
# never leaked.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)