In [1]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


# Health Care Dataset

## Understanding the data

In [None]:
health_care = pd.read_csv('data/healthcare_dataset.csv')

In [None]:
health_care.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal


In [None]:
health_care.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,51.4522,19.588974,18.0,35.0,52.0,68.0,85.0
Billing Amount,10000.0,25516.806778,14067.292709,1000.180837,13506.523967,25258.112566,37733.913727,49995.902283
Room Number,10000.0,300.082,115.806027,101.0,199.0,299.0,400.0,500.0


In [None]:
health_care.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                10000 non-null  object 
 1   Age                 10000 non-null  int64  
 2   Gender              10000 non-null  object 
 3   Blood Type          10000 non-null  object 
 4   Medical Condition   10000 non-null  object 
 5   Date of Admission   10000 non-null  object 
 6   Doctor              10000 non-null  object 
 7   Hospital            10000 non-null  object 
 8   Insurance Provider  10000 non-null  object 
 9   Billing Amount      10000 non-null  float64
 10  Room Number         10000 non-null  int64  
 11  Admission Type      10000 non-null  object 
 12  Discharge Date      10000 non-null  object 
 13  Medication          10000 non-null  object 
 14  Test Results        10000 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 1.1+

In [None]:
#check if there are duplicate records in the data
health_care.duplicated().sum()

0

In [None]:
# Check the unique values in some columns
health_care['Blood Type'].unique()

array(['O-', 'O+', 'B-', 'AB+', 'A+', 'AB-', 'A-', 'B+'], dtype=object)

In [None]:
health_care['Blood Type'].value_counts()

AB-    1275
AB+    1258
B-     1252
O+     1248
O-     1244
B+     1244
A+     1241
A-     1238
Name: Blood Type, dtype: int64

In [None]:
health_care['Medical Condition'].unique()

array(['Diabetes', 'Asthma', 'Obesity', 'Arthritis', 'Hypertension',
       'Cancer'], dtype=object)

In [None]:
health_care['Medical Condition'].value_counts()

Asthma          1708
Cancer          1703
Hypertension    1688
Arthritis       1650
Obesity         1628
Diabetes        1623
Name: Medical Condition, dtype: int64

In [None]:
health_care.Doctor.unique(), len(health_care.Doctor.unique())

(array(['Patrick Parker', 'Diane Jackson', 'Paul Baker', ...,
        'Robert Nicholson', 'Jamie Lewis', 'Tasha Avila'], dtype=object),
 9416)

In [None]:
health_care.Hospital.unique(), len(health_care.Hospital.unique())

(array(['Wallace-Hamilton', 'Burke, Griffin and Cooper', 'Walton LLC', ...,
        'Nash-Krueger', 'Wilson-Lyons', 'Torres, Young and Stewart'],
       dtype=object),
 8639)

In [None]:
health_care['Insurance Provider'].unique(), len(health_care['Insurance Provider'].unique())

(array(['Medicare', 'UnitedHealthcare', 'Aetna', 'Cigna', 'Blue Cross'],
       dtype=object),
 5)

In [None]:
health_care['Admission Type'].unique(), len(health_care['Admission Type'].unique())

(array(['Elective', 'Emergency', 'Urgent'], dtype=object), 3)

In [None]:
health_care['Admission Type'].value_counts()

Urgent       3391
Emergency    3367
Elective     3242
Name: Admission Type, dtype: int64

In [None]:
health_care.Medication.unique(), len(health_care.Medication.unique())

(array(['Aspirin', 'Lipitor', 'Penicillin', 'Paracetamol', 'Ibuprofen'],
       dtype=object),
 5)

In [None]:
health_care.Medication.value_counts()

Penicillin     2079
Lipitor        2015
Ibuprofen      1976
Aspirin        1968
Paracetamol    1962
Name: Medication, dtype: int64

In [None]:
health_care['Room Number'].unique(), len(health_care['Room Number'].unique())

(array([146, 404, 292, 480, 477, 180, 161, 384, 215, 310, 306, 126, 444,
        492, 120, 315, 475, 125, 366, 238, 364, 130, 293, 379, 298, 392,
        162, 456, 197, 247, 228, 137, 192, 258, 219, 414, 110, 465, 469,
        182, 119, 388, 412, 359, 186, 437, 132, 271, 361, 303, 317, 439,
        153, 438, 380, 194, 199, 301, 223, 410, 205, 134, 407, 188, 213,
        405, 358, 147, 115, 436, 263, 493, 460, 356, 142, 139, 482, 141,
        397, 347, 245, 143, 108, 268, 176, 462, 484, 329, 335, 201, 309,
        389, 217, 299, 275, 181, 401, 214, 267, 211, 184, 140, 416, 179,
        289, 350, 104, 220, 464, 419, 445, 398, 336, 413, 145, 148, 432,
        430, 406, 295, 175, 409, 424, 168, 136, 459, 261, 257, 170, 451,
        372, 202, 394, 264, 279, 260, 499, 452, 365, 340, 360, 290, 103,
        187, 378, 334, 470, 252, 450, 106, 259, 344, 489, 276, 155, 455,
        425, 400, 127, 333, 443, 129, 164, 486, 440, 265, 193, 222, 488,
        472, 391, 230, 322, 272, 154, 236, 375, 221

In [None]:
health_care['Test Results'].unique(), len(health_care['Test Results'].unique())

(array(['Inconclusive', 'Normal', 'Abnormal'], dtype=object), 3)

In [None]:
health_care.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

## Feature Engineering

In [None]:
# Create new column for treatment period
health_care['Treatment Period'] = (pd.to_datetime(health_care['Discharge Date']) - pd.to_datetime(health_care['Date of Admission'])).dt.days

In [None]:
health_care.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,Treatment Period
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive,14
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal,14
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal,30
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal,1
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal,24


In [None]:
# Leave only the usefull columns
health_care = health_care[['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication', 'Treatment Period', 'Test Results']]

In [None]:
health_care.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period,Test Results
0,81,Female,O-,Diabetes,Elective,Aspirin,14,Inconclusive
1,35,Male,O+,Asthma,Emergency,Lipitor,14,Normal
2,61,Male,B-,Obesity,Emergency,Lipitor,30,Normal
3,49,Male,B-,Asthma,Urgent,Penicillin,1,Abnormal
4,51,Male,O-,Arthritis,Urgent,Paracetamol,24,Normal


## Label Encoding

In [None]:
label_encoders = {}
for col in health_care.columns:    
    if col!='Age' and col != 'Treatment Period':
        label_encoders[col] = LabelEncoder()
        health_care[col]= label_encoders[col].fit_transform(health_care[col])
health_care.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period,Test Results
0,81,0,7,3,0,0,14,1
1,35,1,6,1,1,2,14,2
2,61,1,5,5,1,2,30,2
3,49,1,5,1,2,4,1,0
4,51,1,7,0,2,3,24,2


In [None]:
health_care.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                10000 non-null  int64
 1   Gender             10000 non-null  int32
 2   Blood Type         10000 non-null  int32
 3   Medical Condition  10000 non-null  int32
 4   Admission Type     10000 non-null  int32
 5   Medication         10000 non-null  int32
 6   Treatment Period   10000 non-null  int64
 7   Test Results       10000 non-null  int32
dtypes: int32(6), int64(2)
memory usage: 390.8 KB


In [None]:
health_care.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,51.4522,19.588974,18.0,35.0,52.0,68.0,85.0
Gender,10000.0,0.4925,0.499969,0.0,0.0,0.0,1.0,1.0
Blood Type,10000.0,3.5011,2.286071,0.0,2.0,3.0,5.0,7.0
Medical Condition,10000.0,2.4875,1.701803,0.0,1.0,2.0,4.0,5.0
Admission Type,10000.0,1.0149,0.814337,0.0,0.0,1.0,2.0,2.0
Medication,10000.0,2.0208,1.41858,0.0,1.0,2.0,3.0,4.0
Treatment Period,10000.0,15.5618,8.612038,1.0,8.0,16.0,23.0,30.0
Test Results,10000.0,0.9811,0.819762,0.0,0.0,1.0,2.0,2.0


## Splitting the dataset for training and testing

In [None]:
health_care_attributes, health_care_labels = health_care.drop(columns=['Test Results']), health_care['Test Results']

In [None]:
health_care_attributes.shape, health_care_labels.shape

((10000, 7), (10000,))

In [None]:
health_care_attributes.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period
0,81,0,7,3,0,0,14
1,35,1,6,1,1,2,14
2,61,1,5,5,1,2,30
3,49,1,5,1,2,4,1
4,51,1,7,0,2,3,24


In [None]:
health_care_labels.head()

0    1
1    2
2    2
3    0
4    2
Name: Test Results, dtype: int32

In [None]:
health_care_attributes_train, health_care_attributes_test, \
health_care_labels_train, health_care_labels_test = train_test_split(health_care_attributes, health_care_labels, \
                                                                     test_size=0.3, random_state=7)

In [None]:
health_care_attributes_train.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period
2317,75,1,1,4,1,4,9
259,52,1,3,4,2,4,28
584,61,1,6,4,2,3,21
475,72,0,7,2,1,3,25
9156,45,1,2,3,0,2,21


In [None]:
health_care_labels_train.head()

2317    1
259     0
584     0
475     0
9156    1
Name: Test Results, dtype: int32

In [None]:
health_care_attributes_test.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period
1977,42,1,5,5,2,2,5
3880,73,1,3,5,2,4,4
52,70,1,0,5,2,1,18
2551,40,0,1,5,2,3,27
2246,84,1,3,4,2,3,2


In [None]:
health_care_labels_test.head()

1977    1
3880    2
52      1
2551    0
2246    0
Name: Test Results, dtype: int32

## Scaling the data

In [None]:
scaler = StandardScaler()

In [None]:
health_care_attributes_train = scaler.fit_transform(health_care_attributes_train)

In [None]:
health_care_attributes_train

array([[ 1.18979323,  1.0100505 , -1.09732137, ..., -0.02831503,
         1.4073222 , -0.75064456],
       [ 0.01477891,  1.0100505 , -0.22405516, ...,  1.19517411,
         1.4073222 ,  1.4521162 ],
       [ 0.47456712,  1.0100505 ,  1.08584416, ...,  1.19517411,
         0.70118963,  0.64057277],
       ...,
       [-1.56893606, -0.9900495 , -0.22405516, ...,  1.19517411,
         0.70118963,  0.29276844],
       [ 0.2702168 ,  1.0100505 , -1.53395448, ...,  1.19517411,
        -0.00494293, -1.56218799],
       [-0.64935963,  1.0100505 ,  1.52247727, ...,  1.19517411,
         0.70118963,  0.9883771 ]])

In [None]:
health_care_attributes_test = scaler.fit_transform(health_care_attributes_test)

In [None]:
health_care_attributes_test

array([[-0.45132247,  1.0270319 ,  0.67101568, ...,  1.24478299,
        -0.03723366, -1.25530979],
       [ 1.12975974,  1.0270319 , -0.20785227, ...,  1.24478299,
         1.36780996, -1.37192009],
       [ 0.97675179,  1.0270319 , -1.52615421, ...,  1.24478299,
        -0.73975546,  0.260624  ],
       ...,
       [ 0.21171201,  1.0270319 , -0.20785227, ..., -1.23404142,
         1.36780996,  0.49384459],
       [-1.21636225, -0.97367959, -0.20785227, ...,  1.24478299,
        -0.73975546,  1.42672692],
       [ 0.97675179, -0.97367959,  1.54988364, ..., -1.23404142,
        -0.03723366, -0.43903775]])

## Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
model.fit(health_care_attributes_train, health_care_labels_train)

LogisticRegression()

In [None]:
model.score(health_care_attributes_train, health_care_labels_train)

0.3472857142857143

In [None]:
model.score(health_care_attributes_test, health_care_labels_test)

0.33366666666666667

In [None]:
labels_predict = model.predict(health_care_attributes_test)

In [None]:
labels_predict

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# Accuracy score
accuracy_score(health_care_labels_test, labels_predict)

0.33366666666666667

In [None]:
# Confussion matix
confusion_matrix(health_care_labels_test, labels_predict)

array([[740, 160, 103],
       [756, 151,  83],
       [755, 142, 110]], dtype=int64)

In [None]:
# Classification report
print(classification_report(health_care_labels_test, labels_predict))

              precision    recall  f1-score   support

           0       0.33      0.74      0.45      1003
           1       0.33      0.15      0.21       990
           2       0.37      0.11      0.17      1007

    accuracy                           0.33      3000
   macro avg       0.34      0.33      0.28      3000
weighted avg       0.34      0.33      0.28      3000



In [None]:
#Drop Treatment Period
health_care_attributes = health_care.drop(columns=['Treatment Period', 'Test Results'])

In [None]:
health_care_attributes.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication
0,81,0,7,3,0,0
1,35,1,6,1,1,2
2,61,1,5,5,1,2
3,49,1,5,1,2,4
4,51,1,7,0,2,3


In [None]:
health_care_attributes_train, health_care_attributes_test, \
health_care_labels_train, health_care_labels_test = train_test_split(health_care_attributes, health_care_labels, \
                                                                     test_size=0.3, random_state=7)

In [None]:
health_care_attributes_train.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication
2317,75,1,1,4,1,4
259,52,1,3,4,2,4
584,61,1,6,4,2,3
475,72,0,7,2,1,3
9156,45,1,2,3,0,2


In [None]:
# Scaling
health_care_attributes_train = scaler.fit_transform(health_care_attributes_train)
health_care_attributes_test = scaler.transform(health_care_attributes_test)

In [None]:
model = LogisticRegression()

In [None]:
model.fit(health_care_attributes_train, health_care_labels_train)

LogisticRegression()

In [None]:
model.score(health_care_attributes_train, health_care_labels_train)

0.351

In [None]:
model.score(health_care_attributes_test, health_care_labels_test)

0.3373333333333333

In [None]:
labels_predict = model.predict(health_care_attributes_test)

In [None]:
labels_predict

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# Accuracy score
accuracy_score(health_care_labels_test, labels_predict)

0.3373333333333333

In [None]:
# Confussion matix
confusion_matrix(health_care_labels_test, labels_predict)

array([[755, 150,  98],
       [755, 156,  79],
       [755, 151, 101]], dtype=int64)

In [None]:
# Classification report
print(classification_report(health_care_labels_test, labels_predict))

              precision    recall  f1-score   support

           0       0.33      0.75      0.46      1003
           1       0.34      0.16      0.22       990
           2       0.36      0.10      0.16      1007

    accuracy                           0.34      3000
   macro avg       0.35      0.34      0.28      3000
weighted avg       0.35      0.34      0.28      3000



## Random Forest 

In [67]:
model = RandomForestClassifier()
model.fit(health_care_attributes_train, health_care_labels_train)

RandomForestClassifier()

In [69]:
model.score(health_care_attributes_train, health_care_labels_train)

0.9772857142857143

In [70]:
labels_predict = model.predict(health_care_attributes_test)

In [72]:
# Accuracy score
accuracy_score(health_care_labels_test, labels_predict)

0.328

In [73]:
# Confussion matix
confusion_matrix(health_care_labels_test, labels_predict)

array([[376, 321, 306],
       [392, 308, 290],
       [354, 353, 300]], dtype=int64)

In [74]:
# Classification report
print(classification_report(health_care_labels_test, labels_predict))

              precision    recall  f1-score   support

           0       0.34      0.37      0.35      1003
           1       0.31      0.31      0.31       990
           2       0.33      0.30      0.32      1007

    accuracy                           0.33      3000
   macro avg       0.33      0.33      0.33      3000
weighted avg       0.33      0.33      0.33      3000

