In [1]:
# This cell loads the NHIS 2022 extract (nhis_2022.csv), keeps adult records (ASTATFLG 1 or 6),
# drops rows whose predictor columns or DIABETICEV hold special (non-measurement) codes,
# isolates individuals aged 50 and older, prints their diabetes status distribution,
# and saves the cleaned subset to cleaned_data_set_for_svm.csv for further analysis.

import pandas as pd

# Load the NHIS 2022 extract from the working directory.
df = pd.read_csv("nhis_2022.csv")

# Keep only rows whose ASTATFLG is 1 or 6 (the adult records this cell works with).
adult_dataset = df[df['ASTATFLG'].isin([1, 6])].copy()

predictors = ['BMICALC', 'FRUTNO', 'SALADSNO', 'MOD10DMIN', 'HRSLEEP', 'PIZZANO', 'FRIESPNO', 'AGE']

# Per-predictor codes that are treated as "not a usable measurement"; any row
# carrying one of them is dropped.
# NOTE(review): code meanings are assumed from the IPUMS NHIS codebook
# (not-in-universe / refused / not ascertained / don't know) — confirm against
# the codebook shipped with this extract.
INVALID_CODES = {
    'BMICALC': [0.0, 996.0],
    'FRUTNO': [996, 997, 998, 999],
    'SALADSNO': [996, 997, 998, 999],
    'MOD10DMIN': [0, 996, 997, 998, 999],
    # 98 and 99 were previously left in and would have been read as hours of
    # sleep. NOTE(review): presumably "unknown" codes like 97 — confirm.
    'HRSLEEP': [0, 97, 98, 99],
    'PIZZANO': [996, 997, 998, 999],
    'FRIESPNO': [996, 997, 998, 999],
    'AGE': [997, 998, 999],
}

# Upper bound applied to the fruit and salad frequency counts. SALADSNO used to
# be capped at 95 while FRUTNO was capped at 995; the lower cap rewrote every
# value from 96 to 995, so both now share the FRUTNO bound.
# NOTE(review): presumably the 95 was a typo — confirm no top-coding was intended.
UNIT_COUNT_CAP = 995

# DIABETICEV codes kept for analysis, with the labels used in the report below.
DIABETES_STATUS = {1: "No Diabetes", 2: "Yes Diabetes", 3: "Borderline"}

# Build one boolean mask instead of re-slicing the frame once per column.
keep_row = pd.Series(True, index=adult_dataset.index)
for column in predictors:
    keep_row &= ~adult_dataset[column].isin(INVALID_CODES[column])

# isin() never matches NaN, so this also removes rows with a missing
# DIABETICEV; no separate dropna() is needed.
keep_row &= adult_dataset['DIABETICEV'].isin(list(DIABETES_STATUS))

# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the previous one.
adult_dataset = adult_dataset.loc[keep_row].copy()

for column in ('FRUTNO', 'SALADSNO'):
    adult_dataset[column] = adult_dataset[column].clip(upper=UNIT_COUNT_CAP)

# The cohort is inclusive of age 50, so the report labels say ">= 50".
over_50_dataset = adult_dataset[adult_dataset['AGE'] >= 50].copy()

diabetic_counts = over_50_dataset['DIABETICEV'].value_counts().sort_index()
print("\nDIABETICEV Distribution for Age >= 50:")
for code, label in DIABETES_STATUS.items():
    print(f"{label} ({code}): {diabetic_counts.get(code, 0)}")

total = diabetic_counts.sum()
print("\nPercentage Distribution:")
for code, label in DIABETES_STATUS.items():
    # Guard the empty-cohort case so the report shows 0.00% instead of nan.
    share = diabetic_counts.get(code, 0) / total * 100 if total else 0.0
    print(f"{label}: {share:.2f}%")

print(f"\nTotal number of people aged 50 and over: {len(over_50_dataset)}")

# Persist the cleaned 50-and-over subset for the downstream modelling step.
over_50_dataset.to_csv('cleaned_data_set_for_svm.csv', index=False)



DIABETICEV Distribution for Age > 50:
No Diabetes (1): 8066
Yes Diabetes (2): 1176
Borderline (3): 0

Percentage Distribution:
No Diabetes: 87.28%
Yes Diabetes: 12.72%
Borderline: 0.00%

Total number of people over 50: 9242
