In [21]:
#libraries Initialization
import numpy as np
import pandas as pd
import math
import scipy as sp 



In [24]:
# Load the patient records, build a per-attribute data quality report, and
# expose the raw values as a NumPy array.
#
# pandas is used for loading and for the missing-value count because the file
# mixes numeric and string columns, and numpy.isnan only accepts numeric input.

# Keep the DataFrame under its own name; `dataMatrix` is only ever the NumPy array.
patientFrame = pd.read_csv("patient_records.csv")
Datadimension = patientFrame.shape
records, attributes = Datadimension

# Count missing cells once per column; reused for the overall total and the report.
missingPerColumn = patientFrame.isna().sum()
missingvalues = missingPerColumn.sum()

# Sparsity is the share of missing cells, density its complement.
# A header-only file has zero cells, so report NaN instead of dividing by zero.
totalCells = records * attributes
dataSparsity = missingvalues / totalCells if totalCells else float("nan")
dataDensity = 1 - dataSparsity

# Hand-assigned measurement scale for each expected column of the file.
attributesTypes = {
    "patient_id": "Nominal (Identifier)",
    "age": "Numeric - Ratio (Continuous)",
    "gender": "Nominal (Categorical)",
    "bmi": "Numeric - Ratio (Continuous)",
    "blood_pressure_systolic": "Numeric - Interval (Continuous)",
    "blood_pressure_diastolic": "Numeric - Interval (Continuous)",
    "cholesterol_level": "Numeric - Ratio (Continuous)",
    "diabetes_type": "Ordinal (Discrete)",
    "heart_disease": "Binary",
    "annual_income": "Numeric - Ratio (Continuous, skewed)",
    "exercise_hours_weekly": "Numeric - Ratio (Continuous)",
    "smoking_status": "Nominal (Categorical)",
    "family_history": "Binary",
    "medication_count": "Numeric - Ratio (Discrete)"
}

# One report row per column. The numeric statistics are computed on numeric
# columns only and then reindexed to the full column list, so non-numeric
# columns show NaN for Min/Max/Mean/StdDev.
columnNames = patientFrame.columns
report = pd.DataFrame({
    "Attribute": columnNames,
    # .get() keeps the report usable when the file has a column that is not
    # listed in attributesTypes, instead of failing with KeyError.
    "Type": [attributesTypes.get(col, "Unknown") for col in columnNames],
    "Missing_Values": missingPerColumn.values,
    "Unique_Values": patientFrame.nunique().values,
    "Min": patientFrame.min(numeric_only=True).reindex(columnNames).values,
    "Max": patientFrame.max(numeric_only=True).reindex(columnNames).values,
    "Mean": patientFrame.mean(numeric_only=True).reindex(columnNames).values,
    "StdDev": patientFrame.std(numeric_only=True).reindex(columnNames).values
})

dataMatrix = patientFrame.values                    # Data stored as numpy data Matrix

print("----------------- <ATTRIBUTES> -------------------------------\n")

for i, typ in attributesTypes.items():
    print(f"- {i}: {typ}")

print("\n\nDimensions: ",Datadimension,"\nSparsity: ",dataSparsity, "\nDensity: ",dataDensity)
# Show every report column instead of letting pandas elide the middle ones.
pd.set_option("display.max_columns", None)
print("\n--- Data Quality Report ---\n")
print(report)

# Persist the generated report next to the notebook as a CSV file.
report.to_csv("datareport.csv", index=False)
print("\nData Quality Report saved as 'datareport.csv'")





----------------- <ATTRIBUTES> -------------------------------

- patient_id: Nominal (Identifier)
- age: Numeric - Ratio (Continuous)
- gender: Nominal (Categorical)
- bmi: Numeric - Ratio (Continuous)
- blood_pressure_systolic: Numeric - Interval (Continuous)
- blood_pressure_diastolic: Numeric - Interval (Continuous)
- cholesterol_level: Numeric - Ratio (Continuous)
- diabetes_type: Ordinal (Discrete)
- heart_disease: Binary
- annual_income: Numeric - Ratio (Continuous, skewed)
- exercise_hours_weekly: Numeric - Ratio (Continuous)
- smoking_status: Nominal (Categorical)
- family_history: Binary
- medication_count: Numeric - Ratio (Discrete)


Dimensions:  (30000, 14) 
Sparsity:  0.023702380952380954 
Density:  0.976297619047619

--- Data Quality Report ---

                   Attribute                                  Type  \
0                 patient_id                  Nominal (Identifier)   
1                        age          Numeric - Ratio (Continuous)   
2                  