# PHASE 01
### Data Ingestion & Clinical Sanitation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# "na_values='?'" tells Pandas that '?' means missing data (like null/undefined)
# [cite: 133, 134]
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The stray output captured under this cell looks
# like pandas' mixed-type DtypeWarning (TODO confirm); reading in one pass
# avoids it and keeps every column's values consistently typed.
df = pd.read_csv(
    './data_files/data_files/diabetic_data.csv',
    na_values="?",
    low_memory=False,
)

print("Data Loaded Succesfully!!")
print(f"Total Rows: {df.shape[0]}")
print(f"Total Columns: {df.shape[1]}")

Data Loaded Succesfully!!
Total Rows: 101766
Total Columns: 50


  df = pd.read_csv('./data_files/data_files/diabetic_data.csv', na_values="?")


In [3]:
# Cleaning check: measure how much of the 'weight' column is missing.
# The PDF says if >90% is missing, we must drop it. [cite: 135]
total_rows = df.shape[0]
missing_weight = df['weight'].isna().sum()
percentage = missing_weight / total_rows * 100

print(f"Percentage of missing weight values: {percentage:.2f}%")

# Preview the first 5 rows (last expression, so the notebook renders it)
df.head(5)

Percentage of missing weight values: 96.86%


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# 1. Drop the 'weight' column if it's mostly empty [cite: 135]
# `percentage` comes from the previous cell. The extra column-presence check
# keeps this cell re-runnable: once 'weight' is gone, a second run would
# otherwise raise KeyError because `percentage` is still above the threshold.
if percentage > 90 and 'weight' in df.columns:
    df = df.drop(columns=['weight'])
    print("Dropped 'weight' column.")

# 2. Remove Deceased Patients [cite: 139, 140]
# IDs 11, 19, 20 mean the patient died. We can't predict readmission for them.
deceased_ids = [11, 19, 20]
# Keep only rows whose discharge disposition is NOT one of the deceased IDs.
df = df[~df['discharge_disposition_id'].isin(deceased_ids)]
print("Removed deceased patients.")

# 3. Remove Exact Duplicates [cite: 141]
# drop_duplicates() with no arguments compares every column of the row.
df = df.drop_duplicates()
print("Removed duplicate rows.")

# Check the new size of your data
print(f"New Data Shape: {df.shape}")

Dropped 'weight' column.
Removed deceased patients.
Removed duplicate rows.
New Data Shape: (100114, 49)


In [5]:
# --- 1. THE AUDIT (Required by PDF Section 4.1) ---
# The PDF asks for .info(), .describe(), and .columns to see the "messy" data types.
print("--- COLUMN NAMES ---")
print(df.columns)

print("\n--- DATA INFO ---")
# This shows us that ID columns are currently integers (int64), which is wrong.
df.info()

print("\n--- STATISTICAL SUMMARY ---")
# describe() only *returns* the summary; a notebook renders just the last
# expression of a cell, so a bare call in the middle of the cell is silently
# discarded. Print it explicitly so the summary actually appears.
print(df.describe())

# --- 2. DATA TYPE CONVERSION (Required by PDF) ---
# The PDF says: "Identify columns that have incorrect data types (e.g., IDs reading as integers instead of categories)."
# IDs like 'admission_type_id' are labels, not math numbers. We must convert them.

# List of columns that should be Categories (Strings), not Numbers
id_columns = [
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id'
]

# Convert them to strings so they are no longer treated as numeric columns
df[id_columns] = df[id_columns].astype(str)

print("\n--- VERIFICATION ---")
# Check if they are now 'object' (string) instead of 'int64'
print(df[id_columns].dtypes)

--- COLUMN NAMES ---
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

--- DATA INFO ---
<class 'pandas.core.frame.DataFrame'>
I

# PHASE 02
## Data Enrichment via Web Scraping

In [6]:
# Inspect the 'diag_1' column (presumably the codes to be enriched in this phase; confirm)
df.loc[:, "diag_1"]

0         250.83
1            276
2            648
3              8
4            197
           ...  
101761    250.13
101762       560
101763        38
101764       996
101765       530
Name: diag_1, Length: 100114, dtype: object