In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [None]:
# Load the uploaded CSV file to inspect its contents
file_path = 'diabetic_data.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   admission_type_id         101766 non-null  int64 
 6   discharge_disposition_id  101766 non-null  int64 
 7   admission_source_id       101766 non-null  int64 
 8   time_in_hospital          101766 non-null  int64 
 9   medical_specialty         101766 non-null  object
 10  num_lab_procedures        101766 non-null  int64 
 11  num_procedures            101766 non-null  int64 
 12  num_medications           101766 non-null  int64 
 13  number_outpatient         101766 non-null  int64 
 14  numb

(None,
    encounter_id  patient_nbr             race  gender      age  \
 0       2278392      8222157        Caucasian  Female   [0-10)   
 1        149190     55629189        Caucasian  Female  [10-20)   
 2         64410     86047875  AfricanAmerican  Female  [20-30)   
 3        500364     82442376        Caucasian    Male  [30-40)   
 4         16680     42519267        Caucasian    Male  [40-50)   
 
    admission_type_id  discharge_disposition_id  admission_source_id  \
 0                  6                        25                    1   
 1                  1                         1                    7   
 2                  1                         1                    7   
 3                  1                         1                    7   
 4                  1                         1                    7   
 
    time_in_hospital         medical_specialty  ...  citoglipton  insulin  \
 0                 1  Pediatrics-Endocrinology  ...           No       No   
 

In [2]:
# Analyze numeric columns for summary statistics
numeric_summary = data.describe()

# Analyze categorical columns for unique values and their counts
categorical_columns = data.select_dtypes(include=['object'])
categorical_summary = {col: data[col].value_counts().to_dict() for col in categorical_columns}

# Check for missing values or anomalies ('?')
missing_values = data.isin(['?']).sum()

# Summarize results
numeric_summary, missing_values, {col: len(categorical_summary[col]) for col in categorical_summary}


(       encounter_id   patient_nbr  admission_type_id  \
 count  1.017660e+05  1.017660e+05      101766.000000   
 mean   1.652016e+08  5.433040e+07           2.024006   
 std    1.026403e+08  3.869636e+07           1.445403   
 min    1.252200e+04  1.350000e+02           1.000000   
 25%    8.496119e+07  2.341322e+07           1.000000   
 50%    1.523890e+08  4.550514e+07           1.000000   
 75%    2.302709e+08  8.754595e+07           3.000000   
 max    4.438672e+08  1.895026e+08           8.000000   
 
        discharge_disposition_id  admission_source_id  time_in_hospital  \
 count             101766.000000        101766.000000     101766.000000   
 mean                   3.715642             5.754437          4.395987   
 std                    5.280166             4.064081          2.985108   
 min                    1.000000             1.000000          1.000000   
 25%                    1.000000             1.000000          2.000000   
 50%                    1.000000   

In [None]:
# Copy data for preprocessing
df = data.copy()

# Binary classification: map 'readmitted' to 0 (NO) and 1 (other values)
df['readmitted'] = df['readmitted'].map(lambda x: 0 if x == 'NO' else 1)

# Handle missing values ('?') - Replace with 'Unknown' or drop if necessary
df.replace('?', 'Unknown', inplace=True)

# Encode categorical variables
categorical_features = df.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_features:
    df[col] = encoder.fit_transform(df[col])

# Split data into features and target
X = df.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])  # Drop ID-like columns
y = df['readmitted']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [4]:
print(accuracy)

0.6376633585536012


In [5]:
print(report)

              precision    recall  f1-score   support

           0       0.64      0.74      0.69     10952
           1       0.63      0.52      0.57      9402

    accuracy                           0.64     20354
   macro avg       0.64      0.63      0.63     20354
weighted avg       0.64      0.64      0.63     20354



In [None]:
# Extract feature importances
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for feature importances
important_features_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Extract the top N features (e.g., 15)
top_n = 15
top_features = important_features_df.head(top_n)['Feature'].tolist()

# Output the top features
print("Top Features:")
for i in top_features:
    print(i)


Top Features:
num_lab_procedures
diag_2
diag_1
diag_3
num_medications
time_in_hospital
number_inpatient
age
discharge_disposition_id
number_diagnoses
medical_specialty
num_procedures
insulin
admission_type_id
race


In [None]:
from sklearn.impute import SimpleImputer

# Select relevant columns and define the target variable
X = data[top_features]
y = data['readmitted']

# Handle missing values with a simple imputer
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# Scale numerical features for Logistic Regression
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [None]:
# Inspect unique values in the 'readmitted' column
data['readmitted'].unique()

array([2, 1, 0])

In [32]:
# Correctly map 'readmitted' to binary classes
y_binary = data['readmitted'].map(lambda x: 0 if x == 2 else 1)  # 0: Not Readmitted, 1: Readmitted

# Check the distribution of the binary target variable
y_binary.value_counts()

readmitted
0    54864
1    46902
Name: count, dtype: int64

In [None]:
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y_binary)

# Verify the new distribution of the balanced target variable
balanced_distribution = pd.Series(y_balanced).value_counts()

# Split the balanced data into training and test sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42
)

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_bal, y_train_bal)

# Predictions and evaluation
y_pred_bal = rf_model.predict(X_test_bal)
rf_accuracy = accuracy_score(y_test_bal, y_pred_bal)
rf_report = classification_report(y_test_bal, y_pred_bal)


In [34]:
print(balanced_distribution)

readmitted
0    54864
1    54864
Name: count, dtype: int64


In [35]:
print(rf_accuracy)

0.6716941583887724


In [36]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.66      0.71      0.69     10997
           1       0.69      0.63      0.66     10949

    accuracy                           0.67     21946
   macro avg       0.67      0.67      0.67     21946
weighted avg       0.67      0.67      0.67     21946

