In [31]:
#Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn.model_selection as skm 
from sklearn.model_selection import KFold,  GridSearchCV

import matplotlib.pyplot as plt



# Read the NHIS 2022 extract into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists at exactly this location.
csv_path = 'C:/Users/Sowmy/OneDrive/Desktop/hw2/nhis_2022.csv'
data = pd.read_csv(csv_path)

# Preview the first few rows, then report the (rows, columns) shape.
for preview in (data.head(), data.shape):
    print(preview)



   YEAR  SERIAL  STRATA  PSU         NHISHID  REGION  PERNUM  \
0  2022       1     143   16  0002022H000001       4       1   
1  2022       2     106   53  0002022H000003       3       1   
2  2022       2     106   53  0002022H000003       3       2   
3  2022       3     134   13  0002022H000006       2       1   
4  2022       4     106   53  0002022H000007       3       1   

            NHISPID      HHX  SAMPWEIGHT  ...  TOMSAUCEMNO  SODAPNO  FRIESPNO  \
0  0002022H00000110  H000001      8018.0  ...            2        0       110   
1  0002022H00000310  H000003     10117.0  ...            1        0         1   
2  0002022H00000320  H000003      7933.0  ...          996      996       996   
3  0002022H00000610  H000006      2681.0  ...            1        1         1   
4  0002022H00000710  H000007     10233.0  ...            3       30         5   

   SPORDRMNO  FRTDRINKMNO  COFETEAMNO  POTATONO  PIZZANO  HRSLEEP  CVDSHT  
0          3            0           0         3     

The dataset has been loaded; it has 35,115 rows and 48 columns.

In [32]:
# Drop the survey bookkeeping columns listed below; they are not used in the
# analysis that follows.
survey_cols = [
    'YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'REGION',
    'PERNUM', 'NHISPID', 'HHX', 'SAMPWEIGHT', 'ASTATFLG', 'CSTATFLG',
]
# Rebind instead of mutating in place; the resulting frame is the same.
data = data.drop(columns=survey_cols)


Since the survey ID variables are not being used, they were dropped from the dataset.

In [33]:
# Recode "unknown / not in universe" sentinel values as NaN so that they are
# treated as missing rather than as real measurements.

# Three-digit sentinels are recoded in every column of the frame.
data = data.replace([996, 997, 998, 999], np.nan)

# Shorter sentinels collide with legitimate values in other columns (e.g. a
# count of 7), so they are only recoded in the columns named here.
replace_7_9 = ['SEX', 'HINOTCOVE', 'CANCEREV', 'CHEARTDIEV', 'DIABETICEV', 'HEARTATTEV', 'STROKEV']
replace_97_98_99 = ['HOURSWRK', 'HRSLEEP']

# One-digit unknown codes. 8 is recoded together with 7 and 9 so this family is
# handled the same way as 97/98/99 and 997/998/999 below/above; leaving 8 in
# place would let it pass through as a real category (and a DIABETICEV of 8
# would later turn into NaN when the target is mapped to 0/1).
# NOTE(review): assumes 7/8/9 are all "unknown" codes for every column listed
# in replace_7_9 - confirm against the codebook.
data[replace_7_9] = data[replace_7_9].replace([7, 8, 9], np.nan)

# Two-digit unknown codes for the hour-valued columns.
data[replace_97_98_99] = data[replace_97_98_99].replace([97, 98, 99], np.nan)


Replaced the special codes 996, 997, 998, and 999 with NaN across the whole dataset. Replacing single-digit codes such as 7 and 9 with NaN across the entire dataset, however, could lose real data in columns where those values are valid responses, so those special codes were replaced only in specific columns. Similarly, the special codes 97, 98, and 99 were replaced only in the specific columns listed above.

In [34]:
# Keep only the six predictors plus the target (DIABETICEV, listed last).
variables = [
    'AGE', 'SEX', 'BMICALC', 'HRSLEEP',
    'SODAPNO', 'FRIESPNO',
    'DIABETICEV',
]
data = data.loc[:, variables]
data

Unnamed: 0,AGE,SEX,BMICALC,HRSLEEP,SODAPNO,FRIESPNO,DIABETICEV
0,61.0,1.0,38.4,8.0,0.0,110.0,1.0
1,43.0,1.0,27.3,6.0,0.0,1.0,1.0
2,12.0,2.0,18.7,0.0,,,1.0
3,68.0,1.0,25.0,6.0,1.0,1.0,1.0
4,73.0,1.0,24.0,8.0,30.0,5.0,1.0
...,...,...,...,...,...,...,...
35110,11.0,1.0,24.6,0.0,,,1.0
35111,18.0,1.0,,7.0,2.0,1.0,1.0
35112,12.0,2.0,18.6,0.0,,,1.0
35113,61.0,2.0,29.8,5.0,2.0,2.0,1.0


Defined the variables for analysis. The dataset now has 7 columns: AGE, SEX, BMICALC, HRSLEEP, SODAPNO, FRIESPNO, DIABETICEV. The question that I am working on is: "Can demographic and behavioral variables such as age, sex, BMI, sleep hours, soda consumption, and fries consumption be used to predict whether an individual has ever been diagnosed with diabetes?"



In [35]:
# Rows without a target value cannot be used, so remove them first.
data = data.dropna(subset=['DIABETICEV'])

# Per-column fill values: median for the numeric AGE and HRSLEEP columns,
# most frequent value for the categorical SEX column.
# NOTE(review): these statistics are computed on the full data set, i.e. before
# the train/test split performed later in the notebook.
# NOTE(review): HRSLEEP contains 0.0 values in the preview above - confirm
# whether 0 is a real answer or a "not in universe" code.
fill_values = {
    'AGE': data['AGE'].median(),
    'SEX': data['SEX'].mode()[0],
    'HRSLEEP': data['HRSLEEP'].median(),
}

# Impute the three columns above, then discard any row that still has a
# missing value in one of the remaining columns.
data = data.fillna(value=fill_values).dropna()

data

Unnamed: 0,AGE,SEX,BMICALC,HRSLEEP,SODAPNO,FRIESPNO,DIABETICEV
0,61.0,1.0,38.4,8.0,0.0,110.0,1.0
1,43.0,1.0,27.3,6.0,0.0,1.0,1.0
3,68.0,1.0,25.0,6.0,1.0,1.0,1.0
4,73.0,1.0,24.0,8.0,30.0,5.0,1.0
6,73.0,1.0,26.5,6.0,5.0,3.0,1.0
...,...,...,...,...,...,...,...
35106,84.0,2.0,20.7,9.0,0.0,1.0,1.0
35107,45.0,1.0,31.4,8.0,0.0,4.0,1.0
35108,47.0,2.0,36.3,8.0,0.0,1.0,1.0
35109,37.0,2.0,29.0,7.0,4.0,1.0,1.0


Cleaned the dataset by imputing missing values of AGE with the median, because it is numeric and may contain outliers, and missing values of HRSLEEP with the median, because it is numeric and the median is more robust. SEX was imputed with the mode because it is categorical. Rows with NaN in the other predictors were dropped to maintain data integrity.

In [36]:
# Recode the target to binary: 1 (no diabetes) -> 0, 2 (has diabetes) -> 1.
# Any other code is turned into NaN by .map().
diabetes_codes = {1.0: 0, 2.0: 1}
data['DIABETICEV'] = data['DIABETICEV'].map(diabetes_codes)

# Target vector and feature matrix (every column except the target).
y = data['DIABETICEV']
X = data.drop(columns='DIABETICEV')

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [37]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
# Linear-kernel SVM; class_weight='balanced' is passed so class weights are
# adjusted for the imbalanced target.
svm_linear = SVC(
    kernel='linear',
    class_weight='balanced',
    random_state=1,
)
svm_linear.fit(X_train, y_train)

In [39]:
# Predict on the held-out split with the linear-kernel model.
y_pred_linear = svm_linear.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics, in that order.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_linear))

Accuracy: 0.6378066378066378
Confusion Matrix:
 [[2714 1635]
 [ 122  380]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.62      0.76      4349
           1       0.19      0.76      0.30       502

    accuracy                           0.64      4851
   macro avg       0.57      0.69      0.53      4851
weighted avg       0.88      0.64      0.71      4851



- The model achieved 64% accuracy.
- Precision for predicting diabetes is 19%.
- Precision for non-diabetic individuals is 96%.

In [40]:
# RBF-kernel SVM with the same seed and class weighting as the linear model.
svm_rbf = SVC(
    kernel='rbf',
    class_weight='balanced',
    random_state=1,
)
svm_rbf.fit(X_train, y_train)

In [41]:
# Predict on the held-out split with the RBF-kernel model.
y_pred_rbf = svm_rbf.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics, in that order.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_rbf))

Accuracy: 0.6390434961863534
Confusion Matrix:
 [[2724 1625]
 [ 126  376]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.63      0.76      4349
           1       0.19      0.75      0.30       502

    accuracy                           0.64      4851
   macro avg       0.57      0.69      0.53      4851
weighted avg       0.88      0.64      0.71      4851



- Test accuracy of 63.9%, similar to the linear model.
- Precision is the same as for the linear model.
- Recall for the diabetic class is essentially unchanged (0.75 vs. 0.76 for the linear model).

In [43]:
# Polynomial-kernel SVM with the same seed and class weighting as the other
# two models.
svm_model_poly = SVC(
    kernel='poly',
    class_weight='balanced',
    random_state=1,
)

# Fit on the training split and predict on the held-out split in one step
# (fit() returns the fitted estimator).
y_pred_poly = svm_model_poly.fit(X_train, y_train).predict(X_test)

# Report accuracy, confusion matrix and per-class metrics, in that order.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_poly))


Accuracy: 0.6287363430220573
Confusion Matrix:
 [[2664 1685]
 [ 116  386]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.61      0.75      4349
           1       0.19      0.77      0.30       502

    accuracy                           0.63      4851
   macro avg       0.57      0.69      0.52      4851
weighted avg       0.88      0.63      0.70      4851



- The test accuracy is 62.9%.
- Higher recall of 0.77 for the diabetic class.

From the above we can say that:
- Accuracy is almost the same for all three models, at roughly 63–64%.
- Precision for the diabetic class is poor for all models, at 0.19.
- The polynomial SVM gives the highest recall for diabetes (77%), slightly better than the linear model.

Now let's tune each kernel to improve performance.