In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
df=pd.read_csv('diabetic_data.csv')


In [58]:
# display basic information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [59]:
df.shape

(101766, 50)

In [60]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [61]:
df.replace('?', np.nan, inplace=True)

In [62]:
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [63]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [64]:
# Check for null values in the entire DataFrame
null_values = df.isnull()

# Count the number of null values in each column
null_count = df.isnull().sum()

# Display the null count for each column
print(null_count)

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [65]:
# Define the threshold for the percentage of missing values
threshold = 0.3  # 50%

# Calculate the maximum allowed number of missing values per column
max_missing_values = len(df) * threshold

# Filter columns with more missing values than the threshold
columns_to_drop = df.columns[df.isnull().sum() > max_missing_values]

# Drop the selected columns from the DataFrame
df = df.drop(columns=columns_to_drop)

In [66]:
# Check for null values in the entire DataFrame
null_values = df.isnull()

# Count the number of null values in each column
null_count = df.isnull().sum()

# Display the null count for each column
print(null_count)

encounter_id                   0
patient_nbr                    0
race                        2273
gender                         0
age                            0
admission_type_id              0
discharge_disposition_id       0
admission_source_id            0
time_in_hospital               0
num_lab_procedures             0
num_procedures                 0
num_medications                0
number_outpatient              0
number_emergency               0
number_inpatient               0
diag_1                        21
diag_2                       358
diag_3                      1423
number_diagnoses               0
max_glu_serum                  0
A1Cresult                      0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazo

In [67]:
df.shape

(101766, 47)

In [68]:
# Remove rows with NaN values
df_cleaned = df.dropna()

In [69]:
df_cleaned.shape

(98053, 47)

In [70]:
# Define a function to extract numeric values from the age ranges
import re


def extract_midpoint(age_range):
    match = re.search(r'(\d+)-(\d+)', age_range)
    if match:
        start = int(match.group(1))
        end = int(match.group(2))
        midpoint = (start + end) / 2
        return midpoint
    return None

# Apply the function to extract and calculate the midpoint
df_cleaned['age'] = df_cleaned['age'].apply(extract_midpoint)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['age'] = df_cleaned['age'].apply(extract_midpoint)


In [71]:
# Seperating the numeric and categorical dataset
numeric_columns = df_cleaned.select_dtypes(include=['int64']).columns.tolist()
categorical_columns = df_cleaned.select_dtypes(include=['object']).columns.tolist()

In [72]:
categorical_columns

['race',
 'gender',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

In [74]:
# # Create transformers for numeric and categorical columns
# numeric_transformer = StandardScaler()
# categorical_transformer = OneHotEncoder(drop='first')

# # Apply transformers to columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_columns),
#         ('cat', categorical_transformer, categorical_columns)
#     ])

# # Create a pipeline for preprocessing
# pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# # Apply preprocessing to your data
# X_preprocessed = pipeline.fit_transform(X)


In [75]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [76]:
target_variable='readmitted'
features=['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']

X = df_cleaned[features]
y = df_cleaned[target_variable]

# Encode categorical variables (one-hot encoding)
X_encoded = pd.get_dummies(X, columns=['race',
 'gender',
 'age',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed'])

# Exclude the original categorical columns from X_encoded
# X_encoded = X_encoded.drop(['A1Cresult', 'max_glu_serum', 'insulin', 'change', 'diabetesMed', 'race', 'gender'], axis=1)

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

# Standardize numerical features (optional but can improve model performance)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train a Random Forest classifier (you can try different models)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Convert feature importances and feature names to a DataFrame
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
sorted_feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
# plt.figure(figsize=(10, 6))
# plt.bar(range(len(sorted_feature_importance_df)), sorted_feature_importance_df['Importance'])
# plt.xticks(range(len(sorted_feature_importance_df)), sorted_feature_importance_df['Feature'], rotation=90)
# plt.xlabel('Feature')
# plt.ylabel('Feature Importance')
# plt.title('Feature Importance Plot')
# plt.tight_layout()
# plt.show()

In [78]:
# Select the top 20 features with the highest importance
top_features = sorted_feature_importance_df['Feature'][:20]

# Create a new DataFrame containing only the top 20 features
X_top_features = X_encoded[top_features]

# Split the data into training and testing sets using X_top_features
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.3, random_state=42)

# Standardize numerical features (optional but can improve model performance)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train a Random Forest classifier (you can try different models)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [79]:
# Make predictions on the filtered test set
y_pred_filtered = model.predict(X_test)

# Evaluate the model with filtered features
accuracy_filtered = accuracy_score(y_test, y_pred_filtered)
conf_matrix_filtered = confusion_matrix(y_test, y_pred_filtered)
class_report_filtered = classification_report(y_test, y_pred_filtered)

print(f"Accuracy with filtered features: {accuracy_filtered:.2f}")
print("Confusion Matrix with filtered features:")
print(conf_matrix_filtered)
print("Classification Report with filtered features:")
print(class_report_filtered)

Accuracy with filtered features: 0.59
Confusion Matrix with filtered features:
[[  116  1386  1751]
 [   98  4842  5601]
 [   46  3173 12403]]
Classification Report with filtered features:
              precision    recall  f1-score   support

         <30       0.45      0.04      0.07      3253
         >30       0.52      0.46      0.49     10541
          NO       0.63      0.79      0.70     15622

    accuracy                           0.59     29416
   macro avg       0.53      0.43      0.42     29416
weighted avg       0.57      0.59      0.55     29416



In [80]:
top_features

1                  patient_nbr
0                 encounter_id
6           num_lab_procedures
8              num_medications
5             time_in_hospital
11            number_inpatient
3     discharge_disposition_id
12            number_diagnoses
7               num_procedures
2            admission_type_id
4          admission_source_id
9            number_outpatient
10            number_emergency
28                    age_75.0
19                 gender_Male
27                    age_65.0
18               gender_Female
15              race_Caucasian
29                    age_85.0
26                    age_55.0
Name: Feature, dtype: object

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring='accuracy', cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy with Best Model: {accuracy_best:.2f}")


In [None]:
# Define the list of classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Standardize numerical features (optional but can improve model performance)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Iterate through classifiers and evaluate each one
for clf_name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_scaled, y_train)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test_scaled)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"Classifier: {clf_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("\n")