In [84]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [98]:
df = pd.read_csv("final_renamed_merged_data.csv")
df.astype(np.float32)
df = df.loc[:, 
            ["FastingGlucose_mmol_L", "FastingInsulin_uU_mL", "Gender", "AgeYears", "Race_Ethnicity", "CitizenshipStatus", 
             "YearsInUS", "EducationLevel_Youth6_19", "EducationLevel_Adults20+", "MaritalStatus", "HouseholdSize", "FamilySize",
             "IncomeToPovertyRatio", "PregnancyStatus", "InterviewLang_SP", "ProxyUsed_SP_Interview", "InterpreterUsed_SP_Interview",
             "InterviewSampleWeight", "ExamSampleWeight", "AnnualHouseholdIncome", "AnnualFamilyIncome", "CountryOfBirth"]]
df["HOMA-IR"] = (df["FastingGlucose_mmol_L"] * df["FastingInsulin_uU_mL"]) / (22.5 * 4.0)
df = df.drop(columns=["FastingGlucose_mmol_L", "FastingInsulin_uU_mL"])
df.columns

Index(['Gender', 'AgeYears', 'Race_Ethnicity', 'CitizenshipStatus',
       'YearsInUS', 'EducationLevel_Youth6_19', 'EducationLevel_Adults20+',
       'MaritalStatus', 'HouseholdSize', 'FamilySize', 'IncomeToPovertyRatio',
       'PregnancyStatus', 'InterviewLang_SP', 'ProxyUsed_SP_Interview',
       'InterpreterUsed_SP_Interview', 'InterviewSampleWeight',
       'ExamSampleWeight', 'AnnualHouseholdIncome', 'AnnualFamilyIncome',
       'CountryOfBirth', 'HOMA-IR'],
      dtype='object')

In [99]:
df["HOMA-IR"].fillna(df["HOMA-IR"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HOMA-IR"].fillna(df["HOMA-IR"].mean(), inplace=True)


In [100]:
len(df)

13487

In [101]:
df.dropna(inplace=True, axis=1)
non_missing_count = df.count()
non_missing_count

Gender                   13487
AgeYears                 13487
Race_Ethnicity           13487
HouseholdSize            13487
FamilySize               13487
InterviewSampleWeight    13487
ExamSampleWeight         13487
HOMA-IR                  13487
dtype: int64

In [102]:
print(len(df))

13487


In [103]:
df.describe()

Unnamed: 0,Gender,AgeYears,Race_Ethnicity,HouseholdSize,FamilySize,InterviewSampleWeight,ExamSampleWeight,HOMA-IR
count,13487.0,13487.0,13487.0,13487.0,13487.0,13487.0,13487.0,13487.0
mean,1.505301,42.303329,2.922296,3.444206,3.253429,34754.213229,35918.898517,0.954011
std,0.49999,21.277953,1.16802,1.725526,1.773505,30327.453337,31193.588376,1.260516
min,1.0,12.0,1.0,1.0,1.0,1339.046608,1363.174167,0.011964
25%,1.0,22.0,2.0,2.0,2.0,14000.793521,14505.006982,0.410952
50%,2.0,41.0,3.0,3.0,3.0,23783.600538,24504.01967,0.702717
75%,2.0,60.0,4.0,5.0,4.0,43836.600929,46179.032532,1.039153
max,2.0,85.0,5.0,7.0,7.0,220233.315202,222579.783434,51.116528


In [104]:
df.head()

Unnamed: 0,Gender,AgeYears,Race_Ethnicity,HouseholdSize,FamilySize,InterviewSampleWeight,ExamSampleWeight,HOMA-IR
0,2.0,85.0,3.0,1.0,1.0,29960.839509,34030.994786,0.954011
1,2.0,44.0,4.0,4.0,4.0,26457.70818,26770.584605,0.556776
2,1.0,70.0,3.0,2.0,2.0,32961.50992,35315.5389,0.870532
3,2.0,16.0,4.0,3.0,3.0,5635.221296,5920.617679,0.583911
4,1.0,73.0,3.0,2.0,2.0,43718.506372,44231.167252,0.894945


In [105]:
for column in df.columns:
    plt.figure()
    df[column].plot(kind='hist', title=f"Distribution {column}", edgecolor='black')
    plt.savefig(f"data_analysis/{column}_histogram.png")
    plt.close()

In [96]:
df.to_csv('data.csv', index=False)

In [97]:
print(list(df.columns))

['Gender', 'AgeYears', 'Race_Ethnicity', 'HouseholdSize', 'FamilySize', 'InterviewSampleWeight', 'ExamSampleWeight', 'HOMA-IR']
