<a href="https://colab.research.google.com/github/UdayPuligilla/AI-Driven-Insights-Machine-Learning-Models-for-Chronic-Kidney-Disease-Prediction/blob/main/Data_Loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from google.colab import drive

# Mount Google Drive at /content/drive. Every path defined below lives under
# this mount point, so the mount has to happen before any file is read or
# written.
drive.mount('/content/drive')

# File locations inside the mounted Drive folder.
# DATA_FILE_PATH: the input CSV read by load_data() in main().
# COLUMN_MAPPING_FILE_PATH: the JSON file that main() writes COLUMN_MAPPING to
#   and then reads it back from.
# CLEANED_DATA_FILE_PATH: the output CSV written by save_cleaned_data() and
#   re-read by verify_cleaned_data() in main().
DATA_FILE_PATH = '/content/drive/MyDrive/Chronic kidney disease/chronic_kidney_disease (1).csv'
COLUMN_MAPPING_FILE_PATH = '/content/drive/MyDrive/Chronic kidney disease/column_mapping.json'
CLEANED_DATA_FILE_PATH = '/content/drive/MyDrive/Chronic kidney disease/ckd_data_cleaned.csv'

# Abbreviated column name in the raw CSV -> descriptive column name used by
# the rest of the pipeline. Entry order is preserved when the mapping is
# written to JSON and printed.
COLUMN_MAPPING = {
    "age": "Age",
    "bp": "Blood Pressure",
    "sg": "Specific Gravity",
    "al": "Albumin",
    "su": "Sugar",
    "rbc": "Red Blood Cells",
    "pc": "Pus Cell",
    "pcc": "Pus Cell Clumps",
    "ba": "Bacteria",
    "bgr": "Blood Glucose Random",
    "bu": "Blood Urea",
    "sc": "Serum Creatinine",
    "sod": "Sodium",
    "pot": "Potassium",
    "hemo": "Hemoglobin",
    "pcv": "Packed Cell Volume",
    "wbcc": "White Blood Cell Count",
    "rbcc": "Red Blood Cell Count",
    "htn": "Hypertension",
    "dm": "Diabetes Mellitus",
    "cad": "Coronary Artery Disease",
    "appet": "Appetite",
    "pe": "Pedal Edema",
    "ane": "Anemia",
    "class": "CKD Stage",
}


def save_column_mapping(column_mapping, file_path):
    """Write ``column_mapping`` to ``file_path`` as JSON indented by 4 spaces.

    The file is opened in write mode, so existing content is replaced. A
    confirmation line naming the file is printed after the write.
    """
    with open(file_path, 'w') as out_file:
        out_file.write(json.dumps(column_mapping, indent=4))
    print(f"Column mapping saved to {file_path}")


def load_column_mapping(file_path):
    """Load a column mapping from a JSON file and print it.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so a mapping containing non-ASCII labels loads the same
    way on every machine. Files written by ``save_column_mapping`` are pure
    ASCII (``json.dump`` escapes non-ASCII characters by default) and are
    therefore read unchanged.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object decoded from the JSON document; a dict for files produced
        by ``save_column_mapping``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        loaded_column_mapping = json.load(f)
    print("Loaded column mapping:")
    print(loaded_column_mapping)
    return loaded_column_mapping


def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and print a preview.

    The preview shows the column index followed by the first five rows.
    Returns the DataFrame exactly as ``pd.read_csv`` produced it.
    """
    raw_frame = pd.read_csv(file_path)
    print("Original column names:", raw_frame.columns, sep="\n")
    print("\nFirst 5 rows of original data:", raw_frame.head(), sep="\n")
    return raw_frame


def rename_columns(df, column_mapping):
    """Return a copy of ``df`` with columns renamed according to ``column_mapping``.

    ``df`` itself is left untouched. The new column index and the first five
    rows of the renamed frame are printed before it is returned.
    """
    renamed = df.rename(columns=column_mapping)
    print("\nColumn names after mapping:", renamed.columns, sep="\n")
    print("\nFirst 5 rows after mapping:", renamed.head(), sep="\n")
    return renamed


def clean_categorical_data(df):
    """Strip stray surrounding whitespace from every text value in ``df``.

    Earlier versions only repaired two specific values (``'\\tno'`` in
    'Diabetes Mellitus' and ``'ckd\\t'`` in 'CKD Stage'). Any other padded
    label, such as ``' yes'`` or ``'\\tyes'``, or padding in a different
    column, was left behind and showed up as a separate category. This
    version applies the same repair to every object/string column, which
    covers the two original cases.

    Non-string entries (numbers, NaN/None) are left exactly as they are, and
    columns of other dtypes are not touched.

    Args:
        df: DataFrame to clean. It is modified in place, as before.

    Returns:
        The same ``df`` object, for call chaining.
    """
    def _strip_if_text(value):
        # Only real strings are stripped; NaN and other objects pass through.
        return value.strip() if isinstance(value, str) else value

    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    # Snapshot the column labels first because columns are reassigned in the loop.
    for column in list(df.columns):
        series = df[column]
        if is_object(series.dtype) or is_string(series.dtype):
            df[column] = series.map(_strip_if_text)
    return df


def analyze_features(df):
    """Print a per-column summary of ``df``.

    Object-dtype columns are listed with their distinct values. Columns
    selected by the 'float' and 'int' dtypes are listed with their minimum,
    maximum, and number of distinct values. Nothing is returned.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns

    print("\nUnique values in categorical features:")
    for name in text_columns:
        distinct = df[name].unique()
        print(f"{name}: {distinct}")

    print("\nNumerical features and their value ranges:")
    for name in numeric_columns:
        lowest = df[name].min()
        highest = df[name].max()
        distinct_count = df[name].nunique()
        print(
            f"{name}: Range = ({lowest}, {highest}), "
            f"Unique values count = {distinct_count}"
        )


def handle_missing_values(df):
    """Return a new DataFrame keeping only the rows of ``df`` with no missing values."""
    return df.dropna(axis=0, how='any')


def save_cleaned_data(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, leaving out the index column.

    A confirmation line naming the destination is printed afterwards.
    """
    df.to_csv(path_or_buf=file_path, index=False)
    confirmation = f"Cleaned data saved to {file_path}"
    print(confirmation)


def verify_cleaned_data(file_path):
    """Re-read the CSV at ``file_path`` and print a sanity report.

    The report shows the first five rows followed by the number of missing
    values in each column. Nothing is returned.
    """
    reloaded = pd.read_csv(file_path)
    print("\nHead of the cleaned dataset:", reloaded.head(), sep="\n")
    missing_per_column = reloaded.isna().sum()
    print("\nMissing values in the cleaned dataset:", missing_per_column, sep="\n")


def main():
    """Run the full pipeline: persist the mapping, load, rename, clean, save, verify.

    The 'CKD Stage' value counts are printed twice: once after the categorical
    clean-up but before rows with missing values are dropped, and once after
    the reduced dataset has been saved and verified.
    """
    # Round-trip the mapping through its JSON file so the renaming step uses
    # the persisted copy.
    save_column_mapping(COLUMN_MAPPING, COLUMN_MAPPING_FILE_PATH)
    mapping = load_column_mapping(COLUMN_MAPPING_FILE_PATH)

    # Load -> rename -> tidy categorical labels.
    frame = load_data(DATA_FILE_PATH)
    frame = rename_columns(frame, mapping)
    frame = clean_categorical_data(frame)

    analyze_features(frame)

    # Class balance before rows with missing values are removed.
    print(
        "\nCounts of CKD and notckd in the dataset before cleaning:",
        frame['CKD Stage'].value_counts(),
        sep="\n",
    )

    # Drop incomplete rows, write the result, then read it back as a check.
    complete_rows = handle_missing_values(frame)
    save_cleaned_data(complete_rows, CLEANED_DATA_FILE_PATH)
    verify_cleaned_data(CLEANED_DATA_FILE_PATH)

    # Class balance among the rows that survived.
    print(
        "\nCounts of CKD and notckd in the dataset after cleaning:",
        complete_rows['CKD Stage'].value_counts(),
        sep="\n",
    )


# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Mounted at /content/drive
Column mapping saved to /content/drive/MyDrive/Chronic kidney disease/column_mapping.json
Loaded column mapping:
{'age': 'Age', 'bp': 'Blood Pressure', 'sg': 'Specific Gravity', 'al': 'Albumin', 'su': 'Sugar', 'rbc': 'Red Blood Cells', 'pc': 'Pus Cell', 'pcc': 'Pus Cell Clumps', 'ba': 'Bacteria', 'bgr': 'Blood Glucose Random', 'bu': 'Blood Urea', 'sc': 'Serum Creatinine', 'sod': 'Sodium', 'pot': 'Potassium', 'hemo': 'Hemoglobin', 'pcv': 'Packed Cell Volume', 'wbcc': 'White Blood Cell Count', 'rbcc': 'Red Blood Cell Count', 'htn': 'Hypertension', 'dm': 'Diabetes Mellitus', 'cad': 'Coronary Artery Disease', 'appet': 'Appetite', 'pe': 'Pedal Edema', 'ane': 'Anemia', 'class': 'CKD Stage'}
Original column names:
Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

First 5 rows of original data:
    age  