In [1]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Binarized Data

In [2]:
#import data
df = pd.read_csv('diabetes_prediction_dataset.csv')

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
#delete null rows
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [4]:
#count unique values
df.nunique ()

gender                    3
age                     102
hypertension              2
heart_disease             2
smoking_history           6
bmi                    4247
HbA1c_level              18
blood_glucose_level      18
diabetes                  2
dtype: int64

In [5]:
#shuffle data
from sklearn.utils import shuffle
df = shuffle(df)
df=df.iloc[1:500]
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Male,5.0,0,0,No Info,12.73,3.5,126,0
1,Male,8.0,0,0,No Info,15.22,6.1,85,0
2,Female,80.0,0,0,never,27.99,6.2,80,0
3,Male,67.0,0,0,not current,32.72,5.7,140,0
4,Female,15.0,0,0,No Info,27.32,5.8,126,0
...,...,...,...,...,...,...,...,...,...
494,Female,24.0,0,0,never,34.65,3.5,159,0
495,Male,6.0,0,0,No Info,16.22,6.5,100,0
496,Female,77.0,0,0,never,27.32,4.8,85,0
497,Male,49.0,0,0,never,21.84,5.0,130,0


In [6]:
df.nunique ()

gender                   2
age                     90
hypertension             2
heart_disease            2
smoking_history          6
bmi                    340
HbA1c_level             18
blood_glucose_level     18
diabetes                 2
dtype: int64

DATA BINARIZATION

In [7]:
print (df['bmi'].min())
print (df['bmi'].max())

10.64
81.73


In [10]:
bin_data = dict()
bin_data['bmi10_34'] = (df['bmi']<=34).astype(int)
bin_data['bmi34_58'] = ((df['bmi']>34) & (df['bmi']<=58)).astype(int)
bin_data['bmi58_82'] = (df['bmi']>58).astype(int)

In [11]:
print (df['age'].min())
print (df['age'].max())

0.16
80.0


In [12]:
bin_data['Age0_27'] = (df['age']<=27).astype(int)
bin_data['Age27_54'] = ((df['age']>27) & (df['age']<=54)).astype(int)
bin_data['Age54_80'] = (df['age']>54).astype(int)

In [13]:
print (df['HbA1c_level'].min())
print (df['HbA1c_level'].max())

3.5
9.0


In [14]:
bin_data['small_lvl'] = (df['HbA1c_level']<=6.25).astype(int)
bin_data['high_lvl'] = (df['HbA1c_level']>6.25).astype(int)

In [15]:
print (df['blood_glucose_level'].min())
print (df['blood_glucose_level'].max())

80
300


In [16]:
bin_data['small_gluc'] = (df['blood_glucose_level']<=150).astype(int)
bin_data['middle_gluc'] = ((df['blood_glucose_level']>150) & (df['blood_glucose_level']<=220)).astype(int)
bin_data['high_gluc'] = (df['blood_glucose_level']>220).astype(int)

In [17]:
df_bin = pd.DataFrame(bin_data)
df_bin = df_bin.sample(frac=1).reset_index(drop=True)
df_bin

Unnamed: 0,bmi10_34,bmi34_58,bmi58_82,Age0_27,Age27_54,Age54_80,small_lvl,high_lvl,small_gluc,middle_gluc,high_gluc
0,1,0,0,0,1,0,1,0,1,0,0
1,1,0,0,0,1,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,0
3,1,0,0,0,1,0,0,1,1,0,0
4,1,0,0,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
494,1,0,0,0,0,1,1,0,0,1,0
495,1,0,0,1,0,0,1,0,0,1,0
496,1,0,0,0,1,0,1,0,1,0,0
497,1,0,0,1,0,0,1,0,1,0,0


In [18]:
df_new=pd.concat([df_bin, df], sort=False, axis=1)
df_new

Unnamed: 0,bmi10_34,bmi34_58,bmi58_82,Age0_27,Age27_54,Age54_80,small_lvl,high_lvl,small_gluc,middle_gluc,high_gluc,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,0,0,0,1,0,1,0,1,0,0,Male,5.0,0,0,No Info,12.73,3.5,126,0
1,1,0,0,0,1,0,1,0,1,0,0,Male,8.0,0,0,No Info,15.22,6.1,85,0
2,0,1,0,0,1,0,1,0,1,0,0,Female,80.0,0,0,never,27.99,6.2,80,0
3,1,0,0,0,1,0,0,1,1,0,0,Male,67.0,0,0,not current,32.72,5.7,140,0
4,1,0,0,0,0,1,1,0,1,0,0,Female,15.0,0,0,No Info,27.32,5.8,126,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,1,0,0,0,0,1,1,0,0,1,0,Female,24.0,0,0,never,34.65,3.5,159,0
495,1,0,0,1,0,0,1,0,0,1,0,Male,6.0,0,0,No Info,16.22,6.5,100,0
496,1,0,0,0,1,0,1,0,1,0,0,Female,77.0,0,0,never,27.32,4.8,85,0
497,1,0,0,1,0,0,1,0,1,0,0,Male,49.0,0,0,never,21.84,5.0,130,0


In [20]:
df_new['diabetes'] = [x == 1 for x in df_new['diabetes']]
df_new.sample(10)

Unnamed: 0,bmi10_34,bmi34_58,bmi58_82,Age0_27,Age27_54,Age54_80,small_lvl,high_lvl,small_gluc,middle_gluc,high_gluc,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
230,1,0,0,1,0,0,1,0,0,1,0,Male,38.0,0,0,No Info,27.32,6.1,80,False
355,0,1,0,0,0,1,0,1,1,0,0,Male,66.0,0,0,No Info,47.46,7.0,200,True
12,0,1,0,0,0,1,1,0,1,0,0,Male,26.0,0,0,never,27.88,5.7,159,False
97,1,0,0,0,1,0,0,1,1,0,0,Male,51.0,0,0,current,27.32,4.5,155,False
89,1,0,0,1,0,0,0,1,1,0,0,Male,76.0,0,0,ever,27.32,6.5,140,True
47,1,0,0,0,1,0,1,0,0,1,0,Female,23.0,0,0,never,22.31,6.5,130,False
223,1,0,0,0,0,1,0,1,1,0,0,Female,57.0,1,0,never,27.32,4.0,159,False
368,1,0,0,0,0,1,1,0,0,1,0,Male,28.0,0,0,never,20.91,5.0,200,False
267,1,0,0,0,0,1,1,0,0,1,0,Female,37.0,0,0,No Info,28.45,5.7,200,False
248,1,0,0,1,0,0,1,0,0,1,0,Male,10.0,0,0,No Info,14.3,4.8,85,False


In [21]:
df_new_new=df_new.iloc[:,:-1]
X = pd.get_dummies(df_new_new).astype(bool)
y = df_new['diabetes']
X.head()

Unnamed: 0,bmi10_34,bmi34_58,bmi58_82,Age0_27,Age27_54,Age54_80,small_lvl,high_lvl,small_gluc,middle_gluc,...,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,True,False,False,False,True,False,True,False,True,False,...,True,True,False,True,True,False,False,False,False,False
1,True,False,False,False,True,False,True,False,True,False,...,True,True,False,True,True,False,False,False,False,False
2,False,True,False,False,True,False,True,False,True,False,...,True,True,True,False,False,False,False,False,True,False
3,True,False,False,False,True,False,False,True,True,False,...,True,True,False,True,False,False,False,False,False,True
4,True,False,False,False,False,True,True,False,True,False,...,True,True,True,False,True,False,False,False,False,False


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="standard-support")

In [24]:
bin_cls.predict(X_test.values)

In [27]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, bin_cls.predictions))

Accuracy: 0.62


# Pattern structure

In [29]:
df1 = pd.read_csv('diabetes_prediction_dataset.csv')
from sklearn.utils import shuffle
df1 = shuffle(df1)
df1=df1.iloc[1:500]
df1 = df1.sample(frac=1).reset_index(drop=True)
df1

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,1,0,never,23.04,6.0,160,0
1,Female,80.0,0,0,not current,27.32,5.7,80,0
2,Male,5.0,0,0,No Info,19.70,6.0,140,0
3,Female,46.0,0,0,current,19.81,5.7,160,0
4,Female,52.0,0,0,never,28.20,3.5,85,0
...,...,...,...,...,...,...,...,...,...
494,Female,50.0,0,0,never,24.63,4.8,140,0
495,Male,62.0,0,0,No Info,27.32,4.5,159,0
496,Female,31.0,0,0,No Info,27.32,4.8,159,0
497,Male,45.0,0,0,No Info,26.62,3.5,159,0


In [31]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [32]:
df1.loc[:, "gender"] = le.fit_transform(df1["gender"])
df1.loc[:, "hypertension"] = le.fit_transform(df1["hypertension"])
df1.loc[:, "heart_disease"] = le.fit_transform(df1["heart_disease"])
df1.loc[:, "smoking_history"] = le.fit_transform(df1["smoking_history"])

In [34]:
df1['diabetes'] = [x == 1 for x in df1['diabetes']]
df1

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,1,0,4,23.04,6.0,160,False
1,0,80.0,0,0,5,27.32,5.7,80,False
2,1,5.0,0,0,0,19.70,6.0,140,False
3,0,46.0,0,0,1,19.81,5.7,160,False
4,0,52.0,0,0,4,28.20,3.5,85,False
...,...,...,...,...,...,...,...,...,...
494,0,50.0,0,0,4,24.63,4.8,140,False
495,1,62.0,0,0,0,27.32,4.5,159,False
496,0,31.0,0,0,0,27.32,4.8,159,False
497,1,45.0,0,0,0,26.62,3.5,159,False


In [35]:
X = df1.iloc[:,:-1]
y = df1['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [48]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), 
                                                   categorical=np.array([0,2,3,4]))

In [49]:
pat_cls.predict(X_test.values)

In [50]:
from sklearn.metrics import accuracy_score, f1_score
print("accuracy:",round(accuracy_score(y_test, pat_cls.predictions),4))

accuracy: 0.9533


In [51]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [52]:
def try_models(model):
    model.fit(X_train, y_train)
    
    y_preds = model.predict(X_test)
    
    acc_score = round(accuracy_score(y_test, y_preds),4)
    
    return (f'Accuracy Score of {model}: {acc_score}')

In [53]:
try_models(KNeighborsClassifier())

'Accuracy Score of KNeighborsClassifier(): 0.9667'

In [57]:
try_models(LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'Accuracy Score of LogisticRegression(): 0.9667'

In [55]:
try_models(DecisionTreeClassifier())

'Accuracy Score of DecisionTreeClassifier(): 0.9667'

In [56]:
try_models(RandomForestClassifier(random_state=42))

'Accuracy Score of RandomForestClassifier(random_state=42): 0.9667'

In [58]:
table = pd.DataFrame({'model': ['FCA', 'KNN','LogisticRegression', 'DecisionTree', 'RandomForest'],'Accuracy': [0.9533, 0.9667, 0.9667, 0.9667, 0.9667]})
table

Unnamed: 0,model,Accuracy
0,FCA,0.9533
1,KNN,0.9667
2,LogisticRegression,0.9667
3,DecisionTree,0.9667
4,RandomForest,0.9667
