1. Load the Required Libraries

In [2]:
import pandas as pd
import numpy as np
import dtale
import os
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score,recall_score

  from pkg_resources import parse_version


2. Read the data from the dataset

In [3]:
df = pd.read_csv("C:\\Users\\Frank\\OneDrive\\Documentos\\ResearchPapers\\Datasets\\diabetes_dataset.csv", encoding='latin1')
d=dtale.show(df)
d.open_browser()

In [4]:
df.head()
df.drop(columns=['weight','payer_code','medical_specialty','encounter_id','patient_nbr','admission_type_id','discharge_disposition_id','admission_source_id'], inplace=True)

3. Split the attribites into dependent and independent attributes

In [5]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
dtale.show(Y, ignore_duplicate=True)



3. Handling missing values and replacing missing values with nan from numpy and replace with mean of all the other values

In [6]:
df.replace('?', np.nan, inplace=True)

In [7]:
dtale.show(df, ignore_duplicate=True)



In [8]:
missing_values = df.isnull().sum()
missing_percent= df.isnull().sum() / len(df) * 100
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
print("Missing values in each column (%):")
print(missing_values)

Missing values in each column (%):
max_glu_serum    96420
A1Cresult        84748
race              2273
diag_3            1423
diag_2             358
diag_1              21
dtype: int64


In [9]:
missing_data = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_percent
}).sort_values(by='Missing Values', ascending=False)
missing_data = missing_data[missing_data['Missing Values'] > 0]
print("Columns with missing values:")
print(missing_data)


Columns with missing values:
               Missing Values  Percentage (%)
max_glu_serum         96420.0       94.746772
A1Cresult             84748.0       83.277322
race                   2273.0        2.233555
diag_3                 1423.0        1.398306
diag_2                  358.0        0.351787
diag_1                   21.0        0.020636


In [10]:
cat_cols_with_missing = ['race', 'gender', 'diag_1', 'diag_2', 'diag_3']
for col in cat_cols_with_missing:
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)
    print(f"Filled {col} with mode: {mode_val}")


Filled race with mode: Caucasian
Filled gender with mode: Female
Filled diag_1 with mode: 428
Filled diag_2 with mode: 276
Filled diag_3 with mode: 250


In [11]:
# For race - impute with 'Unknown' category
df['race'].fillna('Unknown', inplace=True)
dtale.show(df, ignore_duplicate=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.







In [12]:
df = df.dropna(subset=['diag_1'])

# For secondary diagnoses, missing likely means no secondary diagnosis
df['diag_2'].fillna('None', inplace=True)
df['diag_3'].fillna('None', inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [13]:
df['max_glu_serum_missing'] = df['max_glu_serum'].isnull().astype(int)
df['A1Cresult_missing'] = df['A1Cresult'].isnull().astype(int)

# Then impute with 'None' (assuming missing means test wasn't performed)
df['max_glu_serum'].fillna('None', inplace=True)
df['A1Cresult'].fillna('None', inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [14]:
# Identify numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Use iterative imputer for numerical variables (more sophisticated than simple mean/median)
imputer = IterativeImputer(max_iter=10, random_state=42)
df[num_cols] = imputer.fit_transform(df[num_cols])

In [15]:
# Check if any missing values remain
print("\nMissing values after imputation:")
print(df.isnull().sum().sum())


Missing values after imputation:
0


In [16]:
print("\nLaunching D-Tale for final verification...")
d_final = dtale.show(df)
d_final.open_browser()


Launching D-Tale for final verification...


4. Encoding the categorical data

In [17]:
#Identify categorical columns (object type and low-cardinality numeric)
categorical_cols = [col for col in df.columns 
                   if df[col].dtype == 'object' or df[col].nunique() < 10]

# Exclude columns that shouldn't be encoded (like IDs or text fields)
# For this dataset, we might exclude diagnosis codes if we want special handling
cols_to_exclude = ['diag_1', 'diag_2', 'diag_3']  # Example - adjust as needed
categorical_cols = [col for col in categorical_cols if col not in cols_to_exclude]

print("Categorical columns to encode:")
print(categorical_cols)

Categorical columns to encode:
['race', 'gender', 'age', 'num_procedures', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'max_glu_serum_missing', 'A1Cresult_missing']


In [18]:
#One-hot encode categorical variables
nominal_cols = ['race', 'gender', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed']

# Initialize OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False)  # drop first to avoid dummy trap

# Fit and transform
ohe_array = ohe.fit_transform(df[nominal_cols])

# Create column names for the encoded features
ohe_columns = ohe.get_feature_names_out(nominal_cols)

# Create DataFrame from the encoded array
ohe_df = pd.DataFrame(ohe_array, columns=ohe_columns, index=df.index)

# Drop original columns and concatenate with encoded data
df = df.drop(nominal_cols, axis=1)
df = pd.concat([df, ohe_df], axis=1)

In [19]:
ordinal_cols = ['age']  # age groups have natural ordering

# Define custom ordering for each ordinal column
age_order = ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', 
             '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']

# Initialize OrdinalEncoder with specified categories
ordinal_encoder = OrdinalEncoder(categories=[age_order])

# Fit and transform
df[ordinal_cols] = ordinal_encoder.fit_transform(df[ordinal_cols])

In [25]:
# For diagnosis codes if we decide to encode them
high_cardinality_cols = ['diag_1', 'diag_2', 'diag_3']  # Example

# Initialize BinaryEncoder
binary_encoder = ce.BinaryEncoder(cols=high_cardinality_cols)

# Fit and transform
df = binary_encoder.fit_transform(df)

In [22]:
def group_diagnosis(code):
    if pd.isna(code) or code == 'None':
        return 'None'
    try:
        code = float(code)
        if code >= 390 and code <= 459 or code == 785:
            return 'Circulatory'
        elif code >= 460 and code <= 519 or code == 786:
            return 'Respiratory'
        elif code >= 520 and code <= 579 or code == 787:
            return 'Digestive'
        elif code >= 250 and code < 251:
            return 'Diabetes'
        else:
            return 'Other'
    except:
        return 'Other'

# Apply grouping
df['diag_1_grouped'] = df['diag_1'].apply(group_diagnosis)
df['diag_2_grouped'] = df['diag_2'].apply(group_diagnosis)
df['diag_3_grouped'] = df['diag_3'].apply(group_diagnosis)

In [23]:
# Define your target column
target_col = 'readmitted'

# Select columns for target encoding - only use columns that exist
target_encode_cols = [col for col in ['diag_1_grouped', 'diag_2_grouped', 'diag_3_grouped'] 
                    if col in df.columns]

# Only proceed if we have columns to encode
if target_encode_cols:
    # Initialize TargetEncoder
    target_encoder = TargetEncoder(cols=target_encode_cols)
    
    # Fit and transform
    df[target_encode_cols] = target_encoder.fit_transform(df[target_encode_cols], df[target_col])
else:
    print("No valid columns found for target encoding")

In [26]:
for col in high_cardinality_cols:
    freq = df[col].value_counts(normalize=True)
    df[col+'_freq'] = df[col].map(freq)
    df.drop(col, axis=1, inplace=True)

KeyError: 'diag_1'

5. Splitting the dataset intro training set and test set

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

KeyboardInterrupt: 

6. Feature Scaling

In [None]:
dtale.show(X_test)



7. Train the Random Forest Model

In [None]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rf.fit(X_train, Y_train)

Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(rf, X_train, Y_train, cv=Kfold, scoring='accuracy')

Y_pred = rf.predict(X_test)

In [None]:
rf.score(X_test, Y_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.86      1.00      0.92     43525
         1.0       0.00      0.00      0.00      7211

    accuracy                           0.86     50736
   macro avg       0.43      0.50      0.46     50736
weighted avg       0.74      0.86      0.79     50736




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
rf2 = RandomForestClassifier(
    
    n_estimators=1000,
    criterion= 'entropy',
    min_samples_split= 10,
    random_state=42)

8. Retraining the model using Resampled data

In [None]:
smote_enn = SMOTE(sampling_strategy='minority', random_state=42)
X_train_res, Y_train_res = smote_enn.fit_resample(X_train, Y_train)
print(pd.Series(Y_train_res).value_counts())

0.0    174809
1.0    174809
Name: count, dtype: int64


In [None]:
model = RandomForestClassifier( random_state= 42)
model.fit(X_train_res, Y_train_res) 
Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_res, Y_train_res, cv=Kfold, scoring='accuracy')

In [None]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
sensitivity = recall_score(Y_test, y_pred, pos_label=1)
roc_auc = roc_auc_score(Y_test, y_proba)

tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()
specificity = tn / (tn + fp)

report = classification_report(Y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

pd.set_option("display.precision", 4)
print(df_report)
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(Y_test, y_pred)}")
print(f"Mean Accuracy:{scores.mean():.4f} (+/- {scores.std():.4f})")

              precision  recall  f1-score     support
0.0              0.8843  0.9257    0.9045  43525.0000
1.0              0.3749  0.2688    0.3131   7211.0000
accuracy         0.8324  0.8324    0.8324      0.8324
macro avg        0.6296  0.5972    0.6088  50736.0000
weighted avg     0.8119  0.8324    0.8205  50736.0000
ROC-AUC: 0.7622
Sensitivity: 0.2688
Specificity: 0.9257
Confusion Matrix:
[[40293  3232]
 [ 5273  1938]]
Mean Accuracy:0.8983 (+/- 0.0015)
