In [53]:
#Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn.model_selection as skm 
from sklearn.model_selection import KFold,  GridSearchCV

import matplotlib.pyplot as plt



#load the dataset
data = pd.read_csv('C:/Users/Sowmy/OneDrive/Desktop/hw2/nhis_2022.csv')
#view first few roes of the data
print(data.head())
print(data.shape)



   YEAR  SERIAL  STRATA  PSU         NHISHID  REGION  PERNUM  \
0  2022       1     143   16  0002022H000001       4       1   
1  2022       2     106   53  0002022H000003       3       1   
2  2022       2     106   53  0002022H000003       3       2   
3  2022       3     134   13  0002022H000006       2       1   
4  2022       4     106   53  0002022H000007       3       1   

            NHISPID      HHX  SAMPWEIGHT  ...  TOMSAUCEMNO  SODAPNO  FRIESPNO  \
0  0002022H00000110  H000001      8018.0  ...            2        0       110   
1  0002022H00000310  H000003     10117.0  ...            1        0         1   
2  0002022H00000320  H000003      7933.0  ...          996      996       996   
3  0002022H00000610  H000006      2681.0  ...            1        1         1   
4  0002022H00000710  H000007     10233.0  ...            3       30         5   

   SPORDRMNO  FRTDRINKMNO  COFETEAMNO  POTATONO  PIZZANO  HRSLEEP  CVDSHT  
0          3            0           0         3     

In [54]:
#drop survey columns
survey_cols = ['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'REGION',
               'PERNUM', 'NHISPID', 'HHX', 'SAMPWEIGHT', 'ASTATFLG', 'CSTATFLG']
data.drop(columns=survey_cols, inplace=True)
#Replace special missing codes
data.replace([996,997,998,999],np.nan,inplace=True)

# Define columns and their replacements
replace_7_9 = ['SEX', 'HINOTCOVE', 'CANCEREV', 'CHEARTDIEV', 'DIABETICEV', 'HEARTATTEV', 'STROKEV']
replace_97_98_99 = ['HOURSWRK', 'HRSLEEP']

# Replace 7 and 9 with NaN
data[replace_7_9] = data[replace_7_9].replace({7: np.nan, 9: np.nan})

# Replace 97, 98, 99 with NaN
data[replace_97_98_99] = data[replace_97_98_99].replace({97: np.nan, 98: np.nan, 99: np.nan})


In [55]:
#define the variables. 
variables = ['AGE', 'SEX', 'BMICALC', 'HRSLEEP', 'SODAPNO','FRIESPNO', 'DIABETICEV']
data  = data[variables]
data

Unnamed: 0,AGE,SEX,BMICALC,HRSLEEP,SODAPNO,FRIESPNO,DIABETICEV
0,61.0,1.0,38.4,8.0,0.0,110.0,1.0
1,43.0,1.0,27.3,6.0,0.0,1.0,1.0
2,12.0,2.0,18.7,0.0,,,1.0
3,68.0,1.0,25.0,6.0,1.0,1.0,1.0
4,73.0,1.0,24.0,8.0,30.0,5.0,1.0
...,...,...,...,...,...,...,...
35110,11.0,1.0,24.6,0.0,,,1.0
35111,18.0,1.0,,7.0,2.0,1.0,1.0
35112,12.0,2.0,18.6,0.0,,,1.0
35113,61.0,2.0,29.8,5.0,2.0,2.0,1.0


In [56]:
#drop rows with nan value
data = data.dropna(subset=['DIABETICEV']).copy()

#Impute missing values with median age.
data['AGE'] = data['AGE'].fillna(data['AGE'].median())

#Impute missing values with mode.
data['SEX'] = data['SEX'].fillna(data['SEX'].mode()[0])

#Impute missing values with median
data['HRSLEEP'] = data['HRSLEEP'].fillna(data['HRSLEEP'].median())

#Drop rows with Nan values.
data = data.dropna()

data

Unnamed: 0,AGE,SEX,BMICALC,HRSLEEP,SODAPNO,FRIESPNO,DIABETICEV
0,61.0,1.0,38.4,8.0,0.0,110.0,1.0
1,43.0,1.0,27.3,6.0,0.0,1.0,1.0
3,68.0,1.0,25.0,6.0,1.0,1.0,1.0
4,73.0,1.0,24.0,8.0,30.0,5.0,1.0
6,73.0,1.0,26.5,6.0,5.0,3.0,1.0
...,...,...,...,...,...,...,...
35106,84.0,2.0,20.7,9.0,0.0,1.0,1.0
35107,45.0,1.0,31.4,8.0,0.0,4.0,1.0
35108,47.0,2.0,36.3,8.0,0.0,1.0,1.0
35109,37.0,2.0,29.0,7.0,4.0,1.0,1.0


In [57]:
data['DIABETICEV'] = data['DIABETICEV'].map({1.0: 0, 2.0: 1}) # 1 = No diabetes, 2 = Has diabetes

# Define feature and target
X = data.drop('DIABETICEV', axis=1)
y = data['DIABETICEV']

# Sample only 5000 rows for tuning
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=5000, random_state=42, stratify=y)

# Split into train and test 
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Scale features
scaler = StandardScaler()
X_train_sample = scaler.fit_transform(X_train_sample)
X_test_sample = scaler.transform(X_test_sample)



Mapped target variables to 0 and 1. Defined the feature and target variables. Tuning for whole dataset is taking lots of time . so considered subset of the sample took sample of 5000 rows. Scaled the sampled data since it is sensitive to svm. After tuning retrainned rhe final models on the full dataset.

In [58]:

#  Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10,100], 
}

svm = SVC(kernel='linear', class_weight='balanced', random_state=1)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

grid_search.fit(X_train_sample, y_train_sample)

print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'C': 1}


In [59]:

# Split full data
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)
# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the best model
final_svm = SVC(kernel='linear', C=grid_search.best_params_['C'], class_weight='balanced', random_state=1)
final_svm.fit(X_train, y_train)

# Predict and evaluate
y_pred = final_svm.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.6272933415790559
Confusion Matrix:
 [[2627 1722]
 [  86  416]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.60      0.74      4349
           1       0.19      0.83      0.32       502

    accuracy                           0.63      4851
   macro avg       0.58      0.72      0.53      4851
weighted avg       0.89      0.63      0.70      4851



In [33]:
param_grid_rbf = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}

svm_rbf = SVC(kernel='rbf', class_weight='balanced', random_state=1)
grid_rbf = GridSearchCV(svm_rbf, param_grid_rbf, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_rbf.fit(X_train_sample, y_train_sample)

print("Best RBF Parameters:", grid_rbf.best_params_)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best RBF Parameters: {'C': 100, 'gamma': 1}


In [34]:
# 8. Now train on full dataset for RBF kernel

# Split full data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the best RBF model
final_svm_rbf = SVC(kernel='rbf', 
                    C=grid_rbf.best_params_['C'], 
                    gamma=grid_rbf.best_params_['gamma'], 
                    class_weight='balanced', 
                    random_state=1)
final_svm_rbf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rbf = final_svm_rbf.predict(X_test)

print("Test Accuracy (RBF Kernel):", accuracy_score(y_test, y_pred_rbf))
print("Confusion Matrix (RBF Kernel):\n", confusion_matrix(y_test, y_pred_rbf))
print("Classification Report (RBF Kernel):\n", classification_report(y_test, y_pred_rbf))


Test Accuracy (RBF Kernel): 0.7136672850958565
Confusion Matrix (RBF Kernel):
 [[3216 1133]
 [ 256  246]]
Classification Report (RBF Kernel):
               precision    recall  f1-score   support

           0       0.93      0.74      0.82      4349
           1       0.18      0.49      0.26       502

    accuracy                           0.71      4851
   macro avg       0.55      0.61      0.54      4851
weighted avg       0.85      0.71      0.76      4851



In [35]:
param_grid_poly = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],
    'gamma': [0.001, 0.01, 0.1]
}

svm_poly = SVC(kernel='poly', class_weight='balanced', random_state=1)
grid_poly = GridSearchCV(svm_poly, param_grid_poly, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_poly.fit(X_train_sample, y_train_sample)

print("Best Polynomial Parameters:", grid_poly.best_params_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Polynomial Parameters: {'C': 0.1, 'degree': 2, 'gamma': 0.001}


In [36]:
# 8. Now train on full dataset for Polynomial kernel

# Split full data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the best Polynomial model
final_svm_poly = SVC(kernel='poly', 
                     C=grid_poly.best_params_['C'], 
                     degree=grid_poly.best_params_['degree'], 
                     gamma=grid_poly.best_params_['gamma'], 
                     class_weight='balanced', 
                     random_state=1)
final_svm_poly.fit(X_train, y_train)

# Predict and evaluate
y_pred_poly = final_svm_poly.predict(X_test)

print("Test Accuracy (Polynomial Kernel):", accuracy_score(y_test, y_pred_poly))
print("Confusion Matrix (Polynomial Kernel):\n", confusion_matrix(y_test, y_pred_poly))
print("Classification Report (Polynomial Kernel):\n", classification_report(y_test, y_pred_poly))


Test Accuracy (Polynomial Kernel): 0.10348381776953205
Confusion Matrix (Polynomial Kernel):
 [[   0 4349]
 [   0  502]]
Classification Report (Polynomial Kernel):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      4349
           1       0.10      1.00      0.19       502

    accuracy                           0.10      4851
   macro avg       0.05      0.50      0.09      4851
weighted avg       0.01      0.10      0.02      4851



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Among the three kernels:

Linear kernel : It achieved the highesr recall 83% but has low precision. Identified many non diabetic as diabetic.

RBF kernel : Achieved good test accuracy 71.4% but less than linear accuracy. Offered a better balance between identifying diabetic individuals and minimizing false positives.

Polynomial: Performed poorly, has low accuracy 10%. Predicting all as diabetic.