In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import ttest_ind
# Read the smoking/drinking CSV from the mounted Drive folder into a DataFrame.
DATASET_PATH = "/content/drive/MyDrive/UCR/FALL 2023/DMT /smoking_driking_dataset_Ver01.csv"
df = pd.read_csv(DATASET_PATH)

**LOADING THE DATASET**

In [None]:
# Preview the first rows to sanity-check the loaded columns and value formats.
df.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [None]:
# Keep a handle on the column index; the scaling cell later iterates over it.
columns = df.columns

**DATA PREPROCESSING**

In [None]:
# Encode sex as a float indicator: "Male" -> 1.0, every other value -> 0.0.
is_male = df["sex"] == "Male"
df["sex"] = is_male.astype("float64")

In [None]:
# Encode the drinker flag as a float indicator: "Y" -> 1.0, every other value -> 0.0.
is_drinker = df['DRK_YN'] == "Y"
df['DRK_YN'] = is_drinker.astype("float64")

In [None]:
# Cast every column to float64 so the whole frame has one numeric dtype.
# NOTE: the loop variable `i` stays bound at notebook scope after this cell finishes.
for i in df.columns:
  df[i] = df[i].astype('float64') # convert this column's dtype to float64

In [None]:
# Class distribution of the smoking-status target before any relabelling.
df["SMK_stat_type_cd"].value_counts()

1.0    602441
3.0    213954
2.0    174951
Name: SMK_stat_type_cd, dtype: int64

1 - non-smokers
2 - former smokers (quit smoking)
3 - current smokers

The counts above show that classes 2 and 3 are each much smaller than class 1. To reduce this class imbalance, we merge classes 2 and 3 into a single "smoker" class (relabelled 0.0 in the next cell), while non-smokers keep label 1.0.


In [None]:
# Collapse former smokers (2.0) and current smokers (3.0) into one class labelled 0.0.
# Non-smokers keep label 1.0, which leaves a binary target.
smoker_mask = df["SMK_stat_type_cd"].isin([2.0, 3.0])
df.loc[smoker_mask, "SMK_stat_type_cd"] = 0.0

In [None]:
# Re-check the class distribution of the target after the relabelling above.
df["SMK_stat_type_cd"].value_counts()

1.0    602441
0.0    388905
Name: SMK_stat_type_cd, dtype: int64

In [None]:
# Pairwise correlation matrix; print every column's correlation with the target.
corr = df.corr()
target_corr = corr["SMK_stat_type_cd"]
print(target_corr)

sex                -0.642598
age                 0.073802
height             -0.491795
weight             -0.395614
waistline          -0.234447
sight_left         -0.063685
sight_right        -0.065000
hear_left           0.010414
hear_right          0.012682
SBP                -0.107369
DBP                -0.142998
BLDS               -0.100516
tot_chole          -0.007086
HDL_chole           0.167597
LDL_chole           0.011335
triglyceride       -0.208084
hemoglobin         -0.464186
urine_protein      -0.018797
serum_creatinine   -0.148009
SGOT_AST           -0.069315
SGOT_ALT           -0.139159
gamma_GTP          -0.239668
SMK_stat_type_cd    1.000000
DRK_YN             -0.362274
Name: SMK_stat_type_cd, dtype: float64


In [None]:
from sklearn.preprocessing import StandardScaler
# Min-max scaling: map every non-binary column onto [0, 1].
# NOTE(review): min/max are computed on the full dataset before the train/test split,
# so test-set statistics influence the scaling; consider fitting on the training
# split only.
unscaled_cols = {"sex", "DRK_YN", "SMK_stat_type_cd"}  # already 0/1 encoded above
scale_cols = [col for col in columns if col not in unscaled_cols]

col_min = df[scale_cols].min()
col_range = df[scale_cols].max() - col_min
# A constant column has zero range; dividing by it would turn the whole column into
# NaN. Divide by 1 instead so such a column scales to 0.0.
col_range = col_range.replace(0, 1)

df[scale_cols] = (df[scale_cols] - col_min) / col_range

**SPLITTING THE TRAINING AND TESTING DATA**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# "Unnamed: 0" appears to be the row index that was written into the CSV (it counts
# 0, 1, 2, ... in the preview); it is not a measurement, so it is dropped from the
# features when present. errors="ignore" keeps this cell working if it is absent.
TARGET_COL = "SMK_stat_type_cd"
X = df.drop(columns=[TARGET_COL]).drop(columns=["Unnamed: 0"], errors="ignore")
y = df[TARGET_COL]

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Randomly undersample the majority class until the minority/majority ratio is 0.95.
# random_state pins which rows are kept so the notebook gives the same data on every run.
# NOTE(review): resampling happens before the train/test split, so the held-out set is
# drawn from the resampled distribution as well.
undersampler = RandomUnderSampler(sampling_strategy=0.95, random_state=42)
X, y = undersampler.fit_resample(X, y)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Convert the feature frames to plain NumPy arrays for the SVM code below.
# np.asarray accepts both a DataFrame and an ndarray, so (unlike `.values`, which
# raises AttributeError on an ndarray) this cell can safely be run more than once.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

**SUPPORT VECTOR MACHINE**

In [None]:
import numpy as np

def _fit_linear_svm_sgd(X, y_signed, lr, lambda_parameter, iterations):
    """Fit a linear SVM by per-sample sub-gradient descent on the regularised hinge loss.

    The decision function is ``np.dot(x, w) - b``; ``y_signed`` holds labels in {-1, 1}.
    Returns the learned ``(w, b)``.
    """
    _, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    for _ in range(iterations):
        for idx, x_i in enumerate(X):
            margin_satisfied = y_signed[idx] * (np.dot(x_i, w) - b) >= 1
            if margin_satisfied:
                # Correct side of the margin: only the L2 penalty contributes.
                w -= lr * (2 * lambda_parameter * w)
            else:
                # Margin violated: the hinge term is added to the update of w and b.
                w -= lr * (2 * lambda_parameter * w - np.dot(x_i, y_signed[idx]))
                b -= lr * y_signed[idx]
    return w, b


def _predict_linear_svm(X, w, b):
    """Return the sign of ``np.dot(X, w) - b`` with -1 mapped to 0.0 (0/1 class labels)."""
    signs = np.sign(np.dot(X, w) - b)
    return np.where(signs == -1.0, 0.0, signs)


def LinearSVM_Scratch(X_train, y_train, X_test, lr=0.001, lambda_parameter=0.01, iterations=1000, y_test=None):
    """Train and evaluate a from-scratch linear SVM using 5-fold cross-validation.

    For each fold a fresh model is fitted on the other four folds, then accuracy and a
    classification report are printed for the fold's training part, its validation
    part and the test set, followed by a t-test of test predictions vs. test labels.

    Parameters
    ----------
    X_train : 2-D array-like of features; split into 5 contiguous folds by position.
    y_train : 1-D array-like of labels; values <= 0 form the negative class.
    X_test : 2-D array-like of held-out features, scored with every fold's model.
    lr : learning rate of the gradient updates.
    lambda_parameter : L2 regularisation strength.
    iterations : number of full passes over the fold's training data.
    y_test : labels for ``X_test``. When omitted, the notebook-level ``y_test`` is used
        (this is how the function originally obtained the test labels).

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    ValueError
        If no test labels are available, or there are fewer training rows than folds.
    """
    if y_test is None:
        # Backward compatibility: fall back to the global the original code relied on.
        y_test = globals().get("y_test")
    if y_test is None:
        raise ValueError("y_test is required to score the test-set predictions")

    num_folds = 5
    X_train_cv = np.asarray(X_train)
    y_train_cv = np.asarray(y_train)
    fold_size = len(X_train_cv) // num_folds
    if fold_size == 0:
        raise ValueError(f"need at least {num_folds} training samples for {num_folds}-fold CV")

    for fold in range(num_folds):
        # Rows [start, end) are this fold's validation set; everything else is training.
        # Rows past num_folds * fold_size (the remainder) always stay in training.
        start_val_index = fold * fold_size
        end_val_index = (fold + 1) * fold_size

        X_fold_train = np.append(X_train_cv[:start_val_index], X_train_cv[end_val_index:], axis=0)
        y_fold_train = np.append(y_train_cv[:start_val_index], y_train_cv[end_val_index:])
        X_val = X_train_cv[start_val_index:end_val_index]
        y_val = y_train_cv[start_val_index:end_val_index]

        # The hinge loss expects labels in {-1, 1}.
        y_signed = np.where(y_fold_train <= 0, -1, 1)
        w, b = _fit_linear_svm_sgd(X_fold_train, y_signed, lr, lambda_parameter, iterations)

        y_pred_train = _predict_linear_svm(X_fold_train, w, b)
        y_pred_val = _predict_linear_svm(X_val, w, b)
        y_pred = _predict_linear_svm(X_test, w, b)

        training_accuracy = accuracy_score(y_fold_train, y_pred_train)
        validation_accuracy = accuracy_score(y_val, y_pred_val)
        testing_accuracy = accuracy_score(y_test, y_pred)

        training_report = classification_report(y_fold_train, y_pred_train)
        validation_report = classification_report(y_val, y_pred_val)
        testing_report = classification_report(y_test, y_pred)

        print(f"*************************************************************************************From Scratch Model Fold {fold}*********************************************************************************************")
        # Label each line with the fold number (previously an unrelated global `i`).
        print(f"Training Accuracy fold{fold} scratch model: {training_accuracy:.3f}")
        print(f"Validation Accuracy fold{fold} scratch model: {validation_accuracy:.3f}")
        print(f"Testing Accuracy fold{fold} scratch model: {testing_accuracy:.3f}")
        print("Classification Report training scratch model:\n", training_report)
        print("Classification Report validation scratch model:\n", validation_report)
        print("Classification Report testing scratch model:\n", testing_report)
        t_statistic, p_value = ttest_ind(y_pred, y_test)
        print("Scratch T-statistic with ground truth:", t_statistic)
        print("Scratch P-value with ground truth:", p_value)




In [None]:
# Run the from-scratch SVM with its default hyperparameters; per-fold metrics are printed.
LinearSVM_Scratch(X_train, y_train, X_test)

*************************************************************************************From Scratch Model Fold 0*********************************************************************************************
Training Accuracy foldDRK_YN scratch model: 0.822
Validation Accuracy foldDRK_YN scratch model: 0.823
Testing Accuracy foldDRK_YN scratch model: 0.822
Classification Report training scratch model:
               precision    recall  f1-score   support

         0.0       0.76      0.93      0.84      4340
         1.0       0.91      0.72      0.80      4524

    accuracy                           0.82      8864
   macro avg       0.84      0.82      0.82      8864
weighted avg       0.84      0.82      0.82      8864

Classification Report validation scratch model:
               precision    recall  f1-score   support

         0.0       0.75      0.94      0.83      1039
         1.0       0.93      0.72      0.81      1177

    accuracy                           0.82      2216
   m

## Sklearn Implementation of SVM

In [None]:
from sklearn.svm import LinearSVC

# 5-fold cross-validation of scikit-learn's LinearSVC, mirroring the from-scratch run:
# each fold fits on four fifths of the training split and reports accuracy and a
# classification report on the fold's training part, its validation part and the
# test set.
num_folds = 5
# Work on copies of the full training split. Fold subsets get their own names so the
# notebook-level X_train / y_train are never overwritten and this cell can be re-run.
X_train_cv = np.asarray(X_train)
y_train_cv = np.asarray(y_train)
fold_size = len(X_train_cv) // num_folds

for fold in range(num_folds):
    # Rows [start, end) are this fold's validation set; everything else is training.
    start_val_index = fold * fold_size
    end_val_index = (fold + 1) * fold_size

    X_fold_train = np.append(X_train_cv[:start_val_index], X_train_cv[end_val_index:], axis=0)
    y_fold_train = np.append(y_train_cv[:start_val_index], y_train_cv[end_val_index:])

    X_val = X_train_cv[start_val_index:end_val_index]
    y_val = y_train_cv[start_val_index:end_val_index]

    # LinearSVC fits a linear decision boundary only (use sklearn.svm.SVC for kernels).
    # random_state keeps the solver's results repeatable across runs.
    svm = LinearSVC(random_state=42)

    # Train the model on this fold's training data
    svm.fit(X_fold_train, y_fold_train)

    y_pred_train = svm.predict(X_fold_train)
    y_pred_val = svm.predict(X_val)
    y_pred = svm.predict(X_test)

    training_accuracy = accuracy_score(y_fold_train, y_pred_train)
    validation_accuracy = accuracy_score(y_val, y_pred_val)
    testing_accuracy = accuracy_score(y_test, y_pred)

    training_report = classification_report(y_fold_train, y_pred_train)
    validation_report = classification_report(y_val, y_pred_val)
    testing_report = classification_report(y_test, y_pred)
    print(f"*************************************************************************************Sklearn Model Fold {fold}*********************************************************************************************")
    # Label lines with the fold number and the correct model name (previously these
    # printed an unrelated global `i` and said "scratch model").
    print(f"Training Accuracy fold{fold} sklearn model: {training_accuracy:.3f}")
    print(f"Validation Accuracy fold{fold} sklearn model: {validation_accuracy:.3f}")
    print(f"Testing Accuracy fold{fold} sklearn model: {testing_accuracy:.3f}")
    print("Classification Report training sklearn model:\n", training_report)
    print("Classification Report validation sklearn model:\n", validation_report)
    print("Classification Report testing sklearn model:\n", testing_report)
    t_statistic, p_value = ttest_ind(y_pred, y_test)
    print("Sklearn T-statistic with ground truth:", t_statistic)
    print("Sklearn P-value with ground truth:", p_value)



*************************************************************************************Sklearn Model Fold 0*********************************************************************************************
Training Accuracy foldDRK_YN scratch model: 0.822
Validation Accuracy foldDRK_YN scratch model: 0.822
Testing Accuracy foldDRK_YN scratch model: 0.822
Classification Report training scratch model:
               precision    recall  f1-score   support

         0.0       0.76      0.93      0.84      4340
         1.0       0.91      0.72      0.81      4524

    accuracy                           0.82      8864
   macro avg       0.84      0.82      0.82      8864
weighted avg       0.84      0.82      0.82      8864

Classification Report validation scratch model:
               precision    recall  f1-score   support

         0.0       0.75      0.93      0.83      1039
         1.0       0.93      0.72      0.81      1177

    accuracy                           0.82      2216
   macro 