<a href="https://www.kaggle.com/code/udayr007/medicalreportsurvivalusingann?scriptVersionId=154795743" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from keras.optimizers import Adam
from warnings import simplefilter
simplefilter('ignore')

In [2]:
# Load the medical students CSV from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/medical-students-dataset/medical_students_dataset.csv'
dataset = pd.read_csv(DATA_PATH)
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,,95.0,109.0,203.0,No,
1,2.0,,Male,152.069157,47.630941,B,,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,,46.234173,O,,98.480008,95.0,,231.0,No,No


In [3]:
# Count the missing (NaN) entries in every column.
missing_values = dataset.isna().sum()
missing_values

Student ID        20000
Age               20000
Gender            20000
Height            20000
Weight            20000
Blood Type        20000
BMI               20000
Temperature       20000
Heart Rate        20000
Blood Pressure    20000
Cholesterol       20000
Diabetes          20000
Smoking           20000
dtype: int64

In [4]:
# (number of rows, number of columns) of the loaded frame
dataset.shape

(200000, 13)

In [5]:
# Total number of cells in the dataset (rows * columns).
# np.prod is used because the np.product alias is deprecated and was removed in NumPy 2.0.
total_cells = np.prod(dataset.shape)
total_cells

2600000

In [6]:
# Total number of missing cells: the sum of the per-column missing counts
total_missing_values = missing_values.sum()
total_missing_values

260000

In [7]:
# Share of all cells that are missing, expressed as a percentage (0-100)
percent = (total_missing_values/total_cells) * 100
percent

10.0

In [8]:
# Column groups by kind (categorical vs. numerical), listed by hand.
# Note: the imputation cells further down select their columns by dtype and do
# not read these lists; `categorical_features` is reassigned later on.
categorical_features = [
    'Gender',
    'Blood Type',
    'Diabetes',
    'Smoking',
]
numerical_features = [
    'Student ID',
    'Age',
    'Height',
    'Weight',
    'BMI',
    'Temperature',
    'Heart Rate',
    'Blood Pressure',
    'Cholesterol',
]


In [9]:
# print(categorical_features)

In [10]:
# print(numerical_features)

In [11]:
# Preview the first rows before imputation (missing cells are still present)
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,,95.0,109.0,203.0,No,
1,2.0,,Male,152.069157,47.630941,B,,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,,46.234173,O,,98.480008,95.0,,231.0,No,No


In [12]:
# Impute every text (object-dtype) column with its most frequent value.
object_columns = dataset.select_dtypes(include='object').columns
for column in object_columns:
    # mode() returns a Series (several entries on ties); [0] takes the first one.
    most_frequent = dataset[column].mode()[0]
    dataset[column] = dataset[column].fillna(most_frequent)

In [13]:
# Impute every numeric column with that column's mean.
# Note: 'Student ID' is numeric as well, so missing IDs also receive the mean ID.
for column in dataset.select_dtypes(include='number'):
    dataset[column] = dataset[column].fillna(dataset[column].mean())

In [14]:
# Preview the first rows after the mode/mean fills above
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,98.600948,95.0,109.0,203.0,No,No
1,2.0,26.021561,Male,152.069157,47.630941,B,23.338869,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,49974.042078,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,174.947103,46.234173,O,23.338869,98.480008,95.0,114.558033,231.0,No,No


In [15]:
# Collect the names of the object-dtype (categorical) columns straight from the frame.
categorical_features = dataset.select_dtypes(include=['object']).columns
categorical_features

Index(['Gender', 'Blood Type', 'Diabetes', 'Smoking'], dtype='object')

In [16]:
# Instantiate a scikit-learn LabelEncoder (maps each distinct label to an integer code)
label_encoder = LabelEncoder()

In [17]:
# Label-encode each categorical column with its own encoder.
# A single shared LabelEncoder is refit on every column, so after the loop it
# only remembers the classes of the last column. Keeping one fitted encoder per
# column lets the integer codes be mapped back with inverse_transform later.
label_encoders = {}
for col in categorical_features:
    label_encoder = LabelEncoder()
    dataset[col] = label_encoder.fit_transform(dataset[col])
    label_encoders[col] = label_encoder

In [18]:
# Preview after label encoding: the categorical columns now hold integer codes
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,0,161.777924,72.354947,3,27.645835,98.600948,95.0,109.0,203.0,0,0
1,2.0,26.021561,1,152.069157,47.630941,2,23.338869,98.714977,93.0,104.0,163.0,0,0
2,3.0,32.0,0,182.537664,55.741083,0,16.729017,98.260293,76.0,130.0,216.0,1,0
3,49974.042078,30.0,1,182.112867,63.332207,2,19.096042,98.839605,99.0,112.0,141.0,0,1
4,5.0,23.0,0,174.947103,46.234173,3,23.338869,98.480008,95.0,114.558033,231.0,0,0


In [19]:
# Build the 'Survival Percentage' column as the row-wise mean of the listed columns.
# NOTE(review): this target is a plain average of the inputs (including the
# integer-encoded categoricals), not an observed outcome - confirm this is intended.
survival_inputs = [
    'Age', 'Gender', 'Height', 'Weight', 'Blood Type', 'BMI',
    'Temperature', 'Heart Rate', 'Blood Pressure', 'Cholesterol',
    'Diabetes', 'Smoking',
]
dataset['Survival Percentage'] = dataset[survival_inputs].mean(axis=1)

In [20]:
# Preview the frame with the new 'Survival Percentage' column
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking,Survival Percentage
0,1.0,18.0,0,161.777924,72.354947,3,27.645835,98.600948,95.0,109.0,203.0,0,0,65.698305
1,2.0,26.021561,1,152.069157,47.630941,2,23.338869,98.714977,93.0,104.0,163.0,0,0,59.231292
2,3.0,32.0,0,182.537664,55.741083,0,16.729017,98.260293,76.0,130.0,216.0,1,0,67.355671
3,49974.042078,30.0,1,182.112867,63.332207,2,19.096042,98.839605,99.0,112.0,141.0,0,1,62.448393
4,5.0,23.0,0,174.947103,46.234173,3,23.338869,98.480008,95.0,114.558033,231.0,0,0,67.463182


In [21]:
# Features: every column except the target and the 'Student ID' identifier.
target_column = 'Survival Percentage'
X = dataset.drop(columns=[target_column, 'Student ID'])
# Target: the computed 'Survival Percentage' column.
y = dataset[target_column]

In [22]:
# Preview the feature matrix
X.head()

Unnamed: 0,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,18.0,0,161.777924,72.354947,3,27.645835,98.600948,95.0,109.0,203.0,0,0
1,26.021561,1,152.069157,47.630941,2,23.338869,98.714977,93.0,104.0,163.0,0,0
2,32.0,0,182.537664,55.741083,0,16.729017,98.260293,76.0,130.0,216.0,1,0
3,30.0,1,182.112867,63.332207,2,19.096042,98.839605,99.0,112.0,141.0,0,1
4,23.0,0,174.947103,46.234173,3,23.338869,98.480008,95.0,114.558033,231.0,0,0


In [23]:
# Preview the target series
y.head()

0    65.698305
1    59.231292
2    67.355671
3    62.448393
4    67.463182
Name: Survival Percentage, dtype: float64

In [24]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [25]:
# Standardize the features using statistics learned from the training split only,
# then apply that same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [26]:
# Normalize the target variable with min-max scaling.
# The scaler is fitted on the training targets only and then applied to every
# split: rescaling `y` alone, after train_test_split has already run, leaves
# y_train / y_test untouched, so the model would still train on raw values.
# Test targets may fall slightly outside [0, 1] because the range comes from the
# training split; use scaler.inverse_transform to map predictions back.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
y_train = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))
y_test = scaler.transform(np.asarray(y_test).reshape(-1, 1))
# Keep `y` in the same normalized scale as the splits.
y = scaler.transform(np.asarray(y).reshape(-1, 1))

In [27]:
# Building an ANN: two ReLU hidden layers (32 and 16 units) followed by a
# single output unit with no activation.
model = keras.Sequential()
for hidden_units in (32, 16):
    model.add(layers.Dense(units=hidden_units, activation='relu'))
model.add(layers.Dense(units=1))

In [28]:
# Compile the ANN: mean squared error as the loss, mean absolute error as a metric.
# The optimizer is taken from `tensorflow.keras` (already imported as `keras`)
# so it comes from the same Keras implementation as the model and layers;
# mixing in the standalone `keras` package can fail when the two are out of sync.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mae']
)

In [29]:
# Train on the training split; `history` keeps what model.fit returns.
fit_options = dict(batch_size=32, epochs=20, verbose=True)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
# Display the model summary (per-layer output shapes and parameter counts)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (32, 32)                  416       
                                                                 
 dense_1 (Dense)             (32, 16)                  528       
                                                                 
 dense_2 (Dense)             (32, 1)                   17        
                                                                 
Total params: 961 (3.75 KB)
Trainable params: 961 (3.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
