# Step:1 **Diabetes Data Cleaning and Preparation**
## **Notebook steps Overview:**
1. Load the Raw Dataset  
2. Initial Data Exploration 
3. Data Summary Statistics 
4. Data Cleaning  
5. Handle Missing Values  
6. Create Data Quality Report 
7. Save Cleaned Dataset 
8. Data Distribution Visualization 
9. Correlation Analysis 
10. Data Cleaning Summary


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set styling for plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Create paths if they don't exist
os.makedirs('../../outputs', exist_ok=True)


In [6]:
#1. Load the Raw Dataset
print("Loading raw diabetes dataset...")
df = pd.read_csv('C:/Users/hp\Desktop/diabetes-analysis-project/data/diabetes_raw.csv')

Loading raw diabetes dataset...


In [7]:
# 2. Initial Data Exploration
print("\nInitial Data Overview:")
print(f"Dataset Shape: {df.shape}")
df.head()


📊 Initial Data Overview:
Dataset Shape: (5050, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,125,96,0,0,22.5,0.262,21,0
1,0,141,0,0,0,42.4,0.205,29,1
2,1,115,70,29,96,34.503341,0.520052,32,1
3,10,101,86,37,0,45.6,1.136,38,1
4,1,96,122,0,0,22.4,0.207,27,0


In [8]:
# 3. Data Summary Statistics
print("\n Data Summary Statistics:")
df.describe().T

# Check data types
print("\nData Types:")
df.dtypes

# Check for missing values
print("\nMissing Values:")
df.isnull().sum()


 Data Summary Statistics:

Data Types:

Missing Values:


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# 4. Data Cleaning

print("\n Data Cleaning Process:")

# Check for zeros in columns where zeros don't make physiological sense
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in zero_columns:
    print(f"Zeros in {column}: {(df[column] == 0).sum()}")

# Replace zeros with NaN for columns where zero is physiologically impossible
for column in zero_columns:
    df[column] = df[column].replace(0, np.nan)

print("\nMissing values after replacing zeros:")
df.isnull().sum()


 Data Cleaning Process:
Zeros in Glucose: 30
Zeros in BloodPressure: 258
Zeros in SkinThickness: 1502
Zeros in Insulin: 2475
Zeros in BMI: 49

Missing values after replacing zeros:


Pregnancies                    0
Glucose                       30
BloodPressure                258
SkinThickness               1502
Insulin                     2475
BMI                           49
DiabetesPedigreeFunction       0
Age                            0
Outcome                        0
dtype: int64

In [10]:
# 5. Handle Missing Values

print("\nHandling Missing Values:")

# Strategy: Replace missing values with median based on Outcome groups
for column in df.columns[df.isnull().any()]:
    # Get median values for each Outcome group
    median_0 = df[df['Outcome'] == 0][column].median()
    median_1 = df[df['Outcome'] == 1][column].median()
    
    # Replace missing values based on Outcome group
    df.loc[(df['Outcome'] == 0) & (df[column].isnull()), column] = median_0
    df.loc[(df['Outcome'] == 1) & (df[column].isnull()), column] = median_1

print("\nMissing values after imputation:")
df.isnull().sum()

# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows found: {duplicate_count}")

if duplicate_count > 0:
    df = df.drop_duplicates()
    print(f"Duplicates removed. New dataset shape: {df.shape}")


Handling Missing Values:

Missing values after imputation:

Duplicate rows found: 4061
Duplicates removed. New dataset shape: (989, 9)


In [11]:
# 6. Create Data Quality Report

print("\nData Quality Report:")

# Check for outliers using IQR method
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
    return outliers.count()

outlier_counts = {column: detect_outliers(df, column) for column in df.select_dtypes(include=[np.number]).columns}
print("Outlier counts by column:")
for col, count in outlier_counts.items():
    print(f"- {col}: {count} outliers")


Data Quality Report:
Outlier counts by column:
- Pregnancies: 3 outliers
- Glucose: 0 outliers
- BloodPressure: 28 outliers
- SkinThickness: 110 outliers
- Insulin: 55 outliers
- BMI: 12 outliers
- DiabetesPedigreeFunction: 31 outliers
- Age: 15 outliers
- Outcome: 0 outliers


In [19]:
# 7. Save Cleaned Dataset 

# print("\nSaving cleaned dataset...")
# df.to_csv('../../outputs/cleaned_data.csv', index=False)

# print("\nData cleaning complete! Cleaned data saved to '../../outputs/cleaned_data.csv'")

In [None]:
# 8. Data Distribution Visualization

# Create a function to plot histograms for numerical columns
def plot_distributions(df):
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    plt.figure(figsize=(15, 12))
    
    for i, column in enumerate(num_cols):
        plt.subplot(3, 3, i+1)
        sns.histplot(data=df, x=column, hue='Outcome', kde=True, bins=30, alpha=0.6)
        plt.title(f'Distribution of {column}')
        plt.tight_layout()
    
    plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/data_distributions.png')
    plt.close()

plot_distributions(df)
# Distribution plots saved to visuals/static/data_distributions.png 

In [16]:
# 9. Correlation Analysis

plt.figure(figsize=(10, 8))
correlation_matrix = df.corr().round(2)
mask = np.triu(correlation_matrix)
sns.heatmap(correlation_matrix, annot=True, mask=mask, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/correlation_matrix.png')
plt.close()

# Correlation matrix saved to '../../visuals/static/correlation_matrix.png

In [18]:
# 10. Data Cleaning Summary
# -----------------------------------------------------

print("\n Data Cleaning Summary:")
print(f"1. Initial dataset shape: {5050} rows, {df.shape[1]} columns")
print(f"2. Missing values handled: {', '.join(zero_columns)}")
print(f"3. Outliers identified in multiple columns")
print(f"4. Final dataset shape: {df.shape}")
print("\nCleaned data is ready for exploratory data analysis and feature engineering!")


 Data Cleaning Summary:
1. Initial dataset shape: 5050 rows, 9 columns
2. Missing values handled: Glucose, BloodPressure, SkinThickness, Insulin, BMI
3. Outliers identified in multiple columns
4. Final dataset shape: (989, 9)

Cleaned data is ready for exploratory data analysis and feature engineering!
