In [None]:
[
 HighBP, HighChol, HeartDiseaseorAttack, Stroke,
 Smoker, PhysActivity, DiffWalk,
 BMI, MentHlth, PhysHlth, GenHlth,
 Sex, Age, HvyAlcoholConsump,
]

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [27]:
data=pd.read_csv("cdc_diabetes_health_indicators.csv")

In [28]:
data.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


In [29]:
features = [
    'Diabetes_binary', 
    'Age',
    'Sex',
    'BMI',
    'HighBP',
    'PhysActivity',
    'Smoker',
    'HighChol',
    'HeartDiseaseorAttack',
    'Stroke',
    'DiffWalk',
    'MentHlth',
    'PhysHlth',
    'GenHlth',
    'HvyAlcoholConsump',
]

In [30]:
df = data[features]

df = df.dropna()

x = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

In [31]:
x.head()

Unnamed: 0,Age,Sex,BMI,HighBP,PhysActivity,Smoker,HighChol,HeartDiseaseorAttack,Stroke,DiffWalk,MentHlth,PhysHlth,GenHlth,HvyAlcoholConsump
0,9,0,40,1,0,1,1,0,0,1,18,15,5,0
1,7,0,25,0,1,1,0,0,0,0,0,0,3,0
2,9,0,28,1,0,0,1,0,0,1,30,30,5,0
3,11,0,27,1,1,0,0,0,0,0,0,0,2,0
4,11,0,24,1,1,0,1,0,0,0,3,0,2,0


In [32]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

In [34]:
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [35]:
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())

Before SMOTE: Diabetes_binary
0    174595
1     28349
Name: count, dtype: int64
After SMOTE: Diabetes_binary
0    174595
1    174595
Name: count, dtype: int64


In [36]:
x_train

array([[ 1.29914939,  1.12736918, -1.26838675, ..., -0.48659241,
        -0.47861876, -0.24401415],
       [-0.01051636,  1.12736918,  0.85003895, ..., -0.48659241,
         0.45729435, -0.24401415],
       [ 1.29914939,  1.12736918, -0.66312226, ...,  0.08693796,
        -0.47861876, -0.24401415],
       ...,
       [ 0.31690008, -0.88702088, -0.51180614, ..., -0.14247419,
        -0.47861876, -0.24401415],
       [-0.99276567, -0.88702088, -0.81443838, ..., -0.48659241,
        -0.47861876, -0.24401415],
       [ 0.97173295, -0.88702088,  1.00135507, ..., -0.48659241,
         0.45729435, -0.24401415]], shape=(202944, 14))

In [37]:
linear_svm_model = LinearSVC(random_state=42)
linear_svm_model.fit(x_train_resampled, y_train_resampled)
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

y_pred = linear_svm_model.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.7246

Confusion Matrix:
[[31290 12449]
 [ 1526  5471]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.72      0.82     43739
           1       0.31      0.78      0.44      6997

    accuracy                           0.72     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.86      0.72      0.77     50736



In [38]:
def predict_diabetes(model, scaler, HighBP, HighChol, HeartDiseaseorAttack, Stroke, Smoker, PhysActivity, DiffWalk, BMI, MentHlth, PhysHlth, GenHlth, Sex, Age, HvyAlcoholConsump):
    new_patient = [[HighBP, HighChol, HeartDiseaseorAttack, Stroke, Smoker, PhysActivity, DiffWalk, BMI, MentHlth, PhysHlth, GenHlth, Sex, Age, HvyAlcoholConsump]]
    new_patient_scaled = scaler.transform(new_patient)
    prediction = model.predict(new_patient_scaled)
    
    if prediction[0] == 1:
        print("⚠️  The patient is likely diabetic.")
    else:
        print("✅  The patient is likely not diabetic.")

In [39]:
predict_diabetes(
    model=linear_svm_model,
    scaler=scaler,
    HighBP=1,
    HighChol=1,
    HeartDiseaseorAttack=1,
    Stroke=1,
    Smoker=1,
    PhysActivity=1,
    DiffWalk=1,
    BMI=34, 
    MentHlth=0,
    PhysHlth=7,
    GenHlth=4,
    Sex=0,
    Age=9,
    HvyAlcoholConsump=0,
)

⚠️  The patient is likely diabetic.


In [42]:
import pandas as pd
from sklearn.utils import resample

# If y_train is a numpy array, convert it to a Series
y_train_series = pd.Series(y_train, name="Diabetes_binary")

# Combine X and y
train_data = pd.concat([x_train, y_train_series], axis=1)

# Separate majority and minority classes
df_majority = train_data[train_data['Diabetes_binary'] == 0]
df_minority = train_data[train_data['Diabetes_binary'] == 1]



TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid