In [44]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Binarized Data

In [2]:
#import data
df = pd.read_csv('healthcare_dataset.csv')

df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal


In [3]:
#delete null rows
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                10000 non-null  object 
 1   Age                 10000 non-null  int64  
 2   Gender              10000 non-null  object 
 3   Blood Type          10000 non-null  object 
 4   Medical Condition   10000 non-null  object 
 5   Date of Admission   10000 non-null  object 
 6   Doctor              10000 non-null  object 
 7   Hospital            10000 non-null  object 
 8   Insurance Provider  10000 non-null  object 
 9   Billing Amount      10000 non-null  float64
 10  Room Number         10000 non-null  int64  
 11  Admission Type      10000 non-null  object 
 12  Discharge Date      10000 non-null  object 
 13  Medication          10000 non-null  object 
 14  Test Results        10000 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 1.1+

In [4]:
#count unique values
df.nunique ()

Name                   9378
Age                      68
Gender                    2
Blood Type                8
Medical Condition         6
Date of Admission      1815
Doctor                 9416
Hospital               8639
Insurance Provider        5
Billing Amount        10000
Room Number             400
Admission Type            3
Discharge Date         1834
Medication                5
Test Results              3
dtype: int64

In [5]:
#delete extra rows
df = df.drop(['Name','Doctor','Hospital','Room Number','Blood Type'], axis=1)
#calculate time period between dates
date_cols = ['Date of Admission','Discharge Date']
df[date_cols] = df[date_cols].astype("datetime64[ns]")
df['days'] = \
(df['Discharge Date'] - df['Date of Admission']).dt.days
#shuffle data
from sklearn.utils import shuffle
df = shuffle(df)
df=df.iloc[1:500]
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,Age,Gender,Medical Condition,Date of Admission,Insurance Provider,Billing Amount,Admission Type,Discharge Date,Medication,Test Results,days
0,80,Female,Cancer,2019-09-06,Medicare,24358.674062,Urgent,2019-09-17,Paracetamol,Abnormal,11
1,59,Male,Cancer,2022-03-27,Blue Cross,15450.181900,Emergency,2022-04-09,Ibuprofen,Abnormal,13
2,72,Female,Arthritis,2021-12-24,Blue Cross,42258.431826,Urgent,2022-01-18,Lipitor,Inconclusive,25
3,47,Male,Hypertension,2019-05-06,UnitedHealthcare,4397.847075,Elective,2019-05-28,Lipitor,Normal,22
4,35,Female,Diabetes,2022-10-23,Blue Cross,45960.250694,Urgent,2022-10-28,Ibuprofen,Normal,5
...,...,...,...,...,...,...,...,...,...,...,...
494,57,Female,Arthritis,2021-02-14,Medicare,21291.259735,Elective,2021-03-13,Ibuprofen,Abnormal,27
495,73,Male,Hypertension,2023-02-07,UnitedHealthcare,26156.213404,Urgent,2023-03-09,Lipitor,Inconclusive,30
496,42,Female,Cancer,2020-01-02,Medicare,26419.324813,Emergency,2020-01-30,Ibuprofen,Inconclusive,28
497,54,Female,Cancer,2021-12-17,Cigna,16479.896916,Elective,2021-12-27,Paracetamol,Normal,10


In [6]:
df = df.drop(['Date of Admission','Discharge Date'], axis=1)
df.nunique ()

Age                    68
Gender                  2
Medical Condition       6
Insurance Provider      5
Billing Amount        499
Admission Type          3
Medication              5
Test Results            3
days                   30
dtype: int64

DATA BINARIZATION

In [7]:
print (df['Age'].min())
print (df['Age'].max())

18
85


In [8]:
bin_data = dict()
bin_data['Age18_40'] = (df['Age']<=40).astype(int)
bin_data['Age40_62'] = ((df['Age']>40) & (df['Age']<=62)).astype(int)
bin_data['Age62_85'] = (df['Age']>62).astype(int)

In [9]:
print (df['Billing Amount'].min())
print (df['Billing Amount'].max())

1166.7713188498326
49909.18220645191


In [10]:
bin_data['Small_bill'] = (df['Billing Amount']<=17000).astype(int)
bin_data['Medium_bill'] = ((df['Billing Amount']>17000) & (df['Billing Amount']<=34000)).astype(int)
bin_data['Large_bill'] = (df['Billing Amount']>34000).astype(int)

In [11]:
bin_data['Small_days'] = (df['days']<=10).astype(int)
bin_data['Medium_days'] = ((df['days']>10) & (df['days']<=20)).astype(int)
bin_data['Large_days'] = (df['days']>20).astype(int)

In [12]:
df_bin = pd.DataFrame(bin_data)
df_bin = df_bin.sample(frac=1).reset_index(drop=True)
df_bin

Unnamed: 0,Age18_40,Age40_62,Age62_85,Small_bill,Medium_bill,Large_bill,Small_days,Medium_days,Large_days
0,0,1,0,1,0,0,1,0,0
1,0,1,0,0,0,1,0,1,0
2,0,1,0,1,0,0,0,1,0
3,0,1,0,1,0,0,1,0,0
4,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
494,0,1,0,0,0,1,0,1,0
495,0,0,1,0,1,0,0,1,0
496,0,1,0,0,1,0,1,0,0
497,1,0,0,1,0,0,0,1,0


In [13]:
df = df.drop(['Age','Billing Amount','days'], axis=1)

In [14]:
df_new=pd.concat([df_bin, df], sort=False, axis=1)
df_new

Unnamed: 0,Age18_40,Age40_62,Age62_85,Small_bill,Medium_bill,Large_bill,Small_days,Medium_days,Large_days,Gender,Medical Condition,Insurance Provider,Admission Type,Medication,Test Results
0,0,1,0,1,0,0,1,0,0,Female,Cancer,Medicare,Urgent,Paracetamol,Abnormal
1,0,1,0,0,0,1,0,1,0,Male,Cancer,Blue Cross,Emergency,Ibuprofen,Abnormal
2,0,1,0,1,0,0,0,1,0,Female,Arthritis,Blue Cross,Urgent,Lipitor,Inconclusive
3,0,1,0,1,0,0,1,0,0,Male,Hypertension,UnitedHealthcare,Elective,Lipitor,Normal
4,1,0,0,1,0,0,0,1,0,Female,Diabetes,Blue Cross,Urgent,Ibuprofen,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0,1,0,0,0,1,0,1,0,Female,Arthritis,Medicare,Elective,Ibuprofen,Abnormal
495,0,0,1,0,1,0,0,1,0,Male,Hypertension,UnitedHealthcare,Urgent,Lipitor,Inconclusive
496,0,1,0,0,1,0,1,0,0,Female,Cancer,Medicare,Emergency,Ibuprofen,Inconclusive
497,1,0,0,1,0,0,0,1,0,Female,Cancer,Cigna,Elective,Paracetamol,Normal


In [15]:
df_new['Test Results'] = [x == 'Normal' for x in df_new['Test Results']]
df_new.sample(10)

Unnamed: 0,Age18_40,Age40_62,Age62_85,Small_bill,Medium_bill,Large_bill,Small_days,Medium_days,Large_days,Gender,Medical Condition,Insurance Provider,Admission Type,Medication,Test Results
248,0,1,0,0,1,0,0,0,1,Female,Hypertension,Blue Cross,Elective,Paracetamol,True
498,0,0,1,0,1,0,1,0,0,Female,Cancer,UnitedHealthcare,Elective,Ibuprofen,False
356,0,1,0,1,0,0,0,0,1,Male,Asthma,Aetna,Elective,Ibuprofen,False
113,0,0,1,0,0,1,0,1,0,Male,Obesity,Cigna,Urgent,Lipitor,False
392,0,1,0,0,1,0,1,0,0,Female,Cancer,Aetna,Urgent,Ibuprofen,False
417,0,0,1,0,0,1,0,1,0,Female,Cancer,Medicare,Elective,Paracetamol,False
75,0,0,1,0,0,1,1,0,0,Female,Diabetes,Medicare,Elective,Paracetamol,True
464,1,0,0,0,0,1,1,0,0,Male,Asthma,UnitedHealthcare,Urgent,Penicillin,False
150,0,1,0,0,1,0,0,0,1,Male,Cancer,Medicare,Urgent,Ibuprofen,False
303,0,1,0,1,0,0,1,0,0,Female,Arthritis,UnitedHealthcare,Elective,Penicillin,False


In [16]:
#One-hot encoding
df_new_new=df_new.iloc[:,:-1]
X = pd.get_dummies(df_new_new).astype(bool)
y = df_new['Test Results']
X.head()

Unnamed: 0,Age18_40,Age40_62,Age62_85,Small_bill,Medium_bill,Large_bill,Small_days,Medium_days,Large_days,Gender_Female,...,Insurance Provider_Medicare,Insurance Provider_UnitedHealthcare,Admission Type_Elective,Admission Type_Emergency,Admission Type_Urgent,Medication_Aspirin,Medication_Ibuprofen,Medication_Lipitor,Medication_Paracetamol,Medication_Penicillin
0,False,True,False,True,False,False,True,False,False,True,...,True,False,False,False,True,False,False,False,True,False
1,False,True,False,False,False,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,False,True,False,True,False,False,False,True,False,True,...,False,False,False,False,True,False,False,True,False,False
3,False,True,False,True,False,False,True,False,False,False,...,False,True,True,False,False,False,False,True,False,False
4,True,False,False,True,False,False,False,True,False,True,...,False,False,False,False,True,False,True,False,False,False


In [17]:
#training model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="standard-support")

In [19]:
bin_cls.predict(X_test.values)

In [43]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, bin_cls.predictions))

Accuracy: 0.58


In [21]:
bin_cls.predictions

array([0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1.])

# Pattern structure

In [22]:
df1 = pd.read_csv('healthcare_dataset.csv')
df1 = df1.drop(['Name','Doctor','Hospital','Room Number','Blood Type'], axis=1)
df1 = df1.drop(['Discharge Date','Date of Admission'], axis=1)
from sklearn.utils import shuffle
df1 = shuffle(df1)
df1=df1.iloc[1:500]
df1 = df1.sample(frac=1).reset_index(drop=True)
df1

Unnamed: 0,Age,Gender,Medical Condition,Insurance Provider,Billing Amount,Admission Type,Medication,Test Results
0,51,Male,Obesity,Cigna,6252.471533,Emergency,Paracetamol,Normal
1,25,Male,Diabetes,Aetna,35842.414846,Emergency,Penicillin,Inconclusive
2,30,Female,Arthritis,Cigna,41430.883865,Elective,Paracetamol,Inconclusive
3,33,Female,Arthritis,UnitedHealthcare,18774.383818,Elective,Lipitor,Inconclusive
4,66,Female,Arthritis,Cigna,20465.568844,Emergency,Ibuprofen,Inconclusive
...,...,...,...,...,...,...,...,...
494,80,Female,Cancer,Medicare,24358.674062,Urgent,Paracetamol,Abnormal
495,38,Male,Arthritis,Blue Cross,25159.923895,Elective,Aspirin,Abnormal
496,40,Female,Diabetes,Cigna,42074.973034,Elective,Paracetamol,Normal
497,24,Male,Asthma,Cigna,4200.251623,Emergency,Aspirin,Abnormal


In [23]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [24]:
df1.loc[:, "Gender"] = le.fit_transform(df1["Gender"])
df1.loc[:, "Medical Condition"] = le.fit_transform(df1["Medical Condition"])
df1.loc[:, "Insurance Provider"] = le.fit_transform(df1["Insurance Provider"])
df1.loc[:, "Admission Type"] = le.fit_transform(df1["Admission Type"])
df1.loc[:, "Medication"] = le.fit_transform(df1["Medication"])

In [25]:
df1['Test Results'] = [x == 'Normal' for x in df1['Test Results']]
df1

Unnamed: 0,Age,Gender,Medical Condition,Insurance Provider,Billing Amount,Admission Type,Medication,Test Results
0,51,1,5,2,6252.471533,1,3,True
1,25,1,3,0,35842.414846,1,4,False
2,30,0,0,2,41430.883865,0,3,False
3,33,0,0,4,18774.383818,0,2,False
4,66,0,0,2,20465.568844,1,1,False
...,...,...,...,...,...,...,...,...
494,80,0,2,3,24358.674062,2,3,False
495,38,1,0,1,25159.923895,0,0,False
496,40,0,3,2,42074.973034,0,3,True
497,24,1,1,2,4200.251623,1,0,False


In [26]:
X = df1.iloc[:,:-1]
y = df1['Test Results']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), 
                                                   categorical=np.array([1,2,3,5,6]))

In [28]:
pat_cls.predict(X_test.values)

In [29]:
from sklearn.metrics import accuracy_score, f1_score
print("accuracy:",round(accuracy_score(y_test, pat_cls.predictions),4))

accuracy: 0.6533


In [30]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [35]:
def try_models(model):
    model.fit(X_train, y_train)
    
    y_preds = model.predict(X_test)
    
    acc_score = round(accuracy_score(y_test, y_preds),4)
    
    return (f'Accuracy Score of {model}: {acc_score}')

In [36]:
try_models(KNeighborsClassifier())

'Accuracy Score of KNeighborsClassifier(): 0.6267'

In [40]:
try_models(LogisticRegression())

'Accuracy Score of LogisticRegression(): 0.7'

In [37]:
try_models(DecisionTreeClassifier())

'Accuracy Score of DecisionTreeClassifier(): 0.4867'

In [38]:
try_models(RandomForestClassifier(random_state=42))

'Accuracy Score of RandomForestClassifier(random_state=42): 0.64'

In [41]:
table = pd.DataFrame({'model': ['FCA', 'KNN','LogisticRegression', 'DecisionTree', 'RandomForest'],'Accuracy': [0.6533, 0.6267, 0.7, 0.6267, 0.64]})
table

Unnamed: 0,model,Accuracy
0,FCA,0.6533
1,KNN,0.6267
2,LogisticRegression,0.7
3,DecisionTree,0.6267
4,RandomForest,0.64
