In [1]:
#libraries Initialization
import numpy as np
import pandas as pd
import math
import scipy as sp 



PART 1: DATA ARCHITECTURE ANALYSIS

In [2]:
#data Loading and converting data values into numpy array

# I initially used Pandas here because there are both numeric and string values but numpy.isnan is valid for numeric only
# therefore, I calculated the missing values based on pandas data frame 

dataMatrix = pd.read_csv("patient_records.csv")     #Data is stored in pandas DataFrame
Datadimension = dataMatrix.shape
records, attributes = Datadimension
missingvalues = dataMatrix.isna().sum().sum()   #Used Pandas dataFrame to calculate all the missing values {NAN}
dataSparsity = missingvalues/(records*attributes)    #Sparsity calculation 
dataDensity = 1-dataSparsity                         #Density Calculation

attributesTypes = {
    "patient_id": "Nominal (Identifier)",
    "age": "Numeric - Ratio (Continuous)",
    "gender": "Nominal (Categorical)",
    "bmi": "Numeric - Ratio (Continuous)",
    "blood_pressure_systolic": "Numeric - Interval (Continuous)",
    "blood_pressure_diastolic": "Numeric - Interval (Continuous)",
    "cholesterol_level": "Numeric - Ratio (Continuous)",
    "diabetes_type": "Ordinal (Discrete)",
    "heart_disease": "Binary",
    "annual_income": "Numeric - Ratio (Continuous, skewed)",
    "exercise_hours_weekly": "Numeric - Ratio (Continuous)",
    "smoking_status": "Nominal (Categorical)",
    "family_history": "Binary",
    "medication_count": "Numeric - Ratio (Discrete)"

}

report = pd.DataFrame({
    "Attribute": dataMatrix.columns,
    "Type": [attributesTypes[col] for col in dataMatrix.columns],
    "Missing_Values": dataMatrix.isna().sum().values,
    "Unique_Values": dataMatrix.nunique().values,
    "Min": dataMatrix.min(numeric_only=True).reindex(dataMatrix.columns).values,
    "Max": dataMatrix.max(numeric_only=True).reindex(dataMatrix.columns).values,
    "Mean": dataMatrix.mean(numeric_only=True).reindex(dataMatrix.columns).values,
    "StdDev": dataMatrix.std(numeric_only=True).reindex(dataMatrix.columns).values
})

dataMatrix = dataMatrix.values                      # Data stored as numpy data Matrix

print("----------------- <ATTRIBUTES> -------------------------------\n");



for i, typ in attributesTypes.items():
    print(f"- {i}: {typ}")

print("\n\nDimensions: ",Datadimension,"\nSparsity: ",dataSparsity, "\nDensity: ",dataDensity)
pd.set_option("display.max_columns", None)
print("\n--- < Data Quality Report > ---\n")
print(report)


# I saved the generated report as a csv file 
report.to_csv("datareport.csv", index=False)
print("\nData Quality Report saved as 'datareport.csv'")





----------------- <ATTRIBUTES> -------------------------------

- patient_id: Nominal (Identifier)
- age: Numeric - Ratio (Continuous)
- gender: Nominal (Categorical)
- bmi: Numeric - Ratio (Continuous)
- blood_pressure_systolic: Numeric - Interval (Continuous)
- blood_pressure_diastolic: Numeric - Interval (Continuous)
- cholesterol_level: Numeric - Ratio (Continuous)
- diabetes_type: Ordinal (Discrete)
- heart_disease: Binary
- annual_income: Numeric - Ratio (Continuous, skewed)
- exercise_hours_weekly: Numeric - Ratio (Continuous)
- smoking_status: Nominal (Categorical)
- family_history: Binary
- medication_count: Numeric - Ratio (Discrete)


Dimensions:  (30000, 14) 
Sparsity:  0.023702380952380954 
Density:  0.976297619047619

--- < Data Quality Report > ---

                   Attribute                                  Type  \
0                 patient_id                  Nominal (Identifier)   
1                        age          Numeric - Ratio (Continuous)   
2              

In [27]:
# I have calculated Mean, Median and Mode here

#Mean Calculation
meansum = np.zeros(attributes)
mediansum = np.zeros(attributes)
modesum = [None] * attributes
varianceAttr = np.zeros(attributes)
sdAttr  = np.zeros(attributes)
count = 0

print(type(dataMatrix[0][0]))
for j in range(attributes):
    count=0
    numvals=[]
    for i in range(records):       
        if(type(dataMatrix[i][j])==type(5) or type(dataMatrix[i][j])==type(5.5)):  # Calculating Mean of each Attribute
            meansum[j]+=dataMatrix[i][j]
            count+=1
            numvals.append(dataMatrix[i][j])
        else:
            break
    if(meansum[j]>0):
        meansum[j]/=count
    else:
        meansum[j]=0
    if len(numvals)>0:                   #Calcuating Median of each attribute     
        sortedvals = sorted(numvals)
        sizearr = len(sortedvals)
        mid = sizearr//2
        if sizearr%2==0:
            mediansum[j] = (sortedvals[mid - 1] + sortedvals[mid]) / 2
        else:
            mediansum[j] = sortedvals[mid]

        freq = {}
        for v in numvals:
            freq[v] = freq.get(v, 0) + 1
        max_count = max(freq.values())
        if max_count == 1:   # all unique
            modesum[j] = "No mode"
        else:
            modes = [k for k, v in freq.items() if v == max_count]
            modesum[j] = modes

        varsum = 0                                         #Calculating Variance of each attribute
        for x in numvals:
            varsum = varsum + (x-meansum[j])**2
        varianceAttr[j] = varsum/len(numvals) 
        sdAttr[j] = np.sqrt(varianceAttr[j])               #Calculating Standard Deviation of each attribute
    else:
        mediansum[j] = 0
        modesum[j] = "non-Numeric"
        varianceAttr[j] = np.nan
        sdAttr[j] = np.nan

    
#Printing Mean, Median , Mode, Variance and Standard Deviation

print("------------------------- Mean -------------------------------")
for i in range(attributes):
    print(meansum[i])

print("------------------------- Median -------------------------------")
for i in range(attributes):
    print(mediansum[i])

print("------------------------- Mode -------------------------------")
for i in range(attributes):
    print(modesum[i])

print("------------------------- Variances --------------------------")
for i in range(attributes):
    print(varianceAttr[i])   

print("------------------------- Standard Deviations --------------------------")
for i in range(attributes):
    print(sdAttr[i])
  








<class 'int'>
------------------------- Mean -------------------------------
15000.5
0.0
0.0
0.0
119.6029
79.4218
199.4116
0.0
0.2988
27.716349333333497
3.053893333333323
0.0
0.39586666666666664
1.9944333333333333
------------------------- Median -------------------------------
15000.5
32.0
0.0
27.89998564866462
119.0
79.0
199.0
1.5
0.0
20.22
3.0
0.0
0.0
2.0
------------------------- Mode -------------------------------
No mode
[89.0]
non-Numeric
No mode
[118]
[80]
[198]
[0.0]
[0]
[12.24, 17.35]
[0.0]
non-Numeric
[0]
[2]
------------------------- Variances --------------------------
74999999.91666667
nan
nan
nan
323.59621158999477
101.16355142666787
1617.3948521066766
nan
0.20951856000008776
685.1722180392957
3.5561555086224685
nan
0.2391562488887832
1.9880690122229576
------------------------- Standard Deviations --------------------------
8660.254033033134
nan
nan
nan
17.988780158476416
10.058009317288779
40.21684786388258
nan
0.4577319739761335
26.175794506362088
1.8857771630345057
