## 1. Importing necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings as wrn
from sklearn.preprocessing import StandardScaler

## 2. Reading the dataset

In [None]:
# Absolute location of the raw CSV on the author's machine; adjust when running elsewhere
raw_csv_path = r'C:\Users\Frank\OneDrive\Documentos\ITRI 616 Semester Project\diabetic_data.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first rows of the dataframe to sanity-check the load
df.head()

In [None]:
# Preview the last rows of the dataframe to confirm the file was read to the end
df.tail()

## 3. Checking the structure and quality of the dataset

In [None]:
# Shape of the data as a (rows, columns) tuple
df.shape

In [None]:
# Print each column's dtype and non-null count to spot columns with missing values
df.info()

In [None]:
# Count the null values in each column of the dataset
df.isnull().sum()

In [None]:
# Percentage of null values in each column.
# isnull() yields booleans, so their mean is the null fraction. True division
# is required here: floor division (//) rounds every partially-null column
# down to 0 before the multiplication by 100.
df.isnull().mean() * 100

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Print the frequency of every distinct value in each object-typed column so
# that placeholder or garbage entries stand out
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    value_frequencies = df[column_name].value_counts()
    print(value_frequencies)
    print("**********")

## 4. Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Summary of the object-typed columns (count, unique values, most frequent value and its frequency)
df.describe(include ="object")

In [None]:
# Draw one histogram per numeric column to inspect each feature's distribution
numeric_columns = df.select_dtypes(include="number").columns
for column_name in numeric_columns:
    sns.histplot(data=df, x=column_name)
    plt.show()

In [None]:
# Draw one boxplot per numeric column to identify outliers
numeric_columns = df.select_dtypes(include="number").columns
for column_name in numeric_columns:
    sns.boxplot(data=df, x=column_name)
    plt.show()

In [None]:
# Draw a scatterplot for every ordered pair of distinct numeric columns to
# inspect the relationship between features
numeric_columns = df.select_dtypes(include="number").columns
for x_column in numeric_columns:
    for y_column in numeric_columns:
        if x_column == y_column:
            # A column plotted against itself carries no information
            continue
        sns.scatterplot(data=df, x=x_column, y=y_column)
        plt.show()

In [None]:
# Compute the pairwise correlation matrix of the numeric columns; it is drawn
# as a heatmap to interpret feature relationships and multicollinearity
numeric_frame = df.select_dtypes(include="number")
s = numeric_frame.corr()

In [None]:
# Use a large square canvas so the annotated cells stay readable
plt.figure(figsize =(15,15))
# Draw the correlation matrix `s` with the coefficient written in every cell
sns.heatmap(s,annot = True)

## 5. Handling Missing Values

In [None]:

# Silence library warnings for the rest of the session
wrn.filterwarnings("ignore")
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on
# a column selection acts on an intermediate object and does not reliably
# update df (it is a no-op under pandas Copy-on-Write).
df['race'] = df['race'].fillna('Unknown')
df['payer_code'] = df['payer_code'].fillna('Self-Pay')

In [None]:
# Recode the '?' placeholder in the race column as 'Unknown'
race_values = df['race']
df['race'] = race_values.replace('?', 'Unknown')

In [None]:
# Label missing A1C results explicitly as "NotTested"
a1c_result = df["A1Cresult"]
df["A1Cresult"] = a1c_result.fillna("NotTested")

In [None]:
# Label missing glucose serum results explicitly as "NotTested"
glucose_serum = df["max_glu_serum"]
df["max_glu_serum"] = glucose_serum.fillna("NotTested")

In [None]:
# Flag rows that carry an actual weight value (1) versus none (0).
# This notebook treats '?' as a missing-value placeholder in other columns,
# so a '?' weight must count as "not recorded" alongside real NaN values;
# notna() alone would flag it as recorded.
weight_is_present = df['weight'].notna() & (df['weight'] != '?')
df['weight_recorded'] = weight_is_present.astype(int)

In [None]:
# Label missing values in the three diagnosis code columns as 'Unknown'.
# The result is assigned back because fillna(inplace=True) on a column
# selection does not reliably update df (it is a no-op under pandas
# Copy-on-Write).
for diag_column in ('diag_1', 'diag_2', 'diag_3'):
    df[diag_column] = df[diag_column].fillna('Unknown')

In [None]:
# Re-print the value frequencies of every object-typed column to verify the
# replacements made above
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    value_frequencies = df[column_name].value_counts()
    print(value_frequencies)
    print("**********")

In [None]:
# Re-count the null values per column to confirm what is still missing
df.isnull().sum()

## 6. Feature Engineering

In [None]:
# Replace each ten-year age bracket with its midpoint so age becomes numeric;
# labels that are not in the mapping become NaN
age_mapping = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95,
}
df['age'] = df['age'].map(age_mapping)

# Binary target: 1 when the patient was readmitted within 30 days
df['readmitted_30'] = (df['readmitted'] == '<30').astype(int)

# Binary target: 1 for any readmission (every value other than 'NO')
df['readmitted_any'] = (df['readmitted'] != 'NO').astype(int)

# Number of diagnosis slots (diag_1..diag_3) that hold a real code.
# The 'Unknown' fill value and the '?' placeholder are excluded as well as
# NaN; counting only notna() would return 3 for every row once the columns
# have been filled.
diagnosis_codes = df[['diag_1', 'diag_2', 'diag_3']]
has_real_code = diagnosis_codes.notna() & ~diagnosis_codes.isin(['Unknown', '?'])
df['num_diagnoses'] = has_real_code.sum(axis=1)

# Binary flag: 1 when the 'change' column reports a medication change ('Ch')
df['med_change'] = (df['change'] == 'Ch').astype(int)

## 7. Outliers Treatment

In [None]:
#decide whether to do outliers treatment or not
def wisker(col):
    """Return the ``(lower_bound, upper_bound)`` whisker limits of *col*.

    The limits lie 1.5 interquartile ranges below the 25th percentile and
    above the 75th percentile, as computed by ``np.percentile``.
    """
    first_quartile = np.percentile(col, 25)
    third_quartile = np.percentile(col, 75)
    whisker_length = 1.5 * (third_quartile - first_quartile)
    return first_quartile - whisker_length, third_quartile + whisker_length


In [None]:
# Cap the count-like numeric features at their whisker limits.
# 'patient_nbr' (an identifier) and 'discharge_disposition_id' (a category
# code that is label-encoded later) are deliberately left out: clipping
# them would replace valid ids/codes with meaningless boundary values.
# NOTE(review): if more than 75% of a column's values are identical its IQR
# is 0 and the whole column collapses to one value -- confirm this is
# acceptable for the number_* features.
outlier_columns = ['num_lab_procedures', 'num_medications', 'number_outpatient',
                   'number_emergency', 'number_inpatient']
for i in outlier_columns:
    lower_bound, upper_bound = wisker(df[i])
    df[i] = df[i].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# List the current column names, including the features engineered above
df.columns

## 8. Encoding categorical values 

In [None]:
# One-hot encode gender and race; the first level of each is dropped to
# avoid a redundant indicator column
df = pd.get_dummies(df, columns=['gender', 'race'], drop_first=True)

# Label encode the remaining categorical variables
from sklearn.preprocessing import LabelEncoder

categorical_cols = [
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'A1Cresult',
    'max_glu_serum',
]

for col in categorical_cols:
    # Cast to str first so numeric ids and text labels are encoded uniformly
    as_text = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(as_text)

# Convert medication columns to binary: 'No' -> 0, any prescribed state
# ('Steady', 'Up', 'Down') -> 1; values outside the mapping become NaN
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
]
medication_flags = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}

for col in medication_cols:
    df[col] = df[col].map(medication_flags)

## 9. Handling Diagnosis Codes

In [None]:
# Group diagnosis codes into categories (ICD-9 codes)
def group_diagnosis(code):
    """Map a diagnosis code to a coarse integer category.

    Args:
        code: A diagnosis code as a string or number. Missing values, the
            '?' placeholder and anything that cannot be parsed as a number
            (for example codes starting with a letter) fall into category 0.

    Returns:
        int: 1 circulatory, 2 respiratory, 3 digestive, 4 diabetes,
        5 injury, 6 musculoskeletal, 7 genitourinary, 8 neoplasms,
        0 other/unknown.
    """
    if pd.isna(code) or code == '?':
        return 0
    try:
        code = float(code)
    except (TypeError, ValueError):
        # Non-numeric codes are grouped as "other". Only conversion errors
        # are caught so interrupts and unrelated failures still propagate.
        return 0

    if 390 <= code <= 459 or code == 785:
        return 1  # Circulatory
    if 460 <= code <= 519 or code == 786:
        return 2  # Respiratory
    if 520 <= code <= 579 or code == 787:
        return 3  # Digestive
    if 250 <= code < 251:
        return 4  # Diabetes (250.xx)
    if 800 <= code <= 999:
        return 5  # Injury
    if 710 <= code <= 739:
        return 6  # Musculoskeletal
    if 580 <= code <= 629 or code == 788:
        return 7  # Genitourinary
    if 140 <= code <= 239:
        return 8  # Neoplasms
    return 0  # Other

# Add a '<column>_group' feature holding the diagnosis category of each
# diagnosis code column
for source_column in ('diag_1', 'diag_2', 'diag_3'):
    group_column = source_column + '_group'
    df[group_column] = df[source_column].apply(group_diagnosis)

## 10. Feature Scaling

In [None]:

# Standardize the numerical features with StandardScaler
numerical_features = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_diagnoses',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'age',
]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_values

## 11. Final Data Preparation

In [None]:
# Remove the identifier columns and the raw columns that have been replaced
# by engineered features
columns_to_drop = [
    'encounter_id',
    'patient_nbr',
    'readmitted',
    'diag_1',
    'diag_2',
    'diag_3',
    'change',
]
df = df.drop(columns=columns_to_drop)

# Drop diabetesMed since it's highly correlated with our target (most patients have diabetes)
df = df.drop(columns=['diabetesMed'])

# Print the class counts of both target variables to check for imbalance
for target_name in ('readmitted_30', 'readmitted_any'):
    print(df[target_name].value_counts())

# If severe imbalance exists, we might apply SMOTE later during modeling

## 12. Save the preprocessed dataset for training the model

In [None]:
# Write the preprocessed frame next to the notebook, without the index column
cleaned_csv_path = 'diabetic_data_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)