In [1]:
import pandas as pd
from pandas import Series, DataFrame
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
file_path = r'C:\Users\ssbid\Desktop\Summer 2024\Data Science Programming (MIS S381N)\Notebooks used in class\smoking_driking_dataset_Ver01.csv'
column_names = [ 'sex', 'age', 'height', 'weight', 'waistline',
    'sight_left', 'sight_right',
    'hear_left', 'hear_right',
    'SBP', 'DBP', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole',
    'triglyceride', 'hemoglobin', 'urine_protein', 'serum_creatinine',
    'SGOT_AST', 'SGOT_ALT', 'gamma_GTP', 'SMK_stat_type_cd', 'DRK_YN'
]

# Load the dataset with specified column names
df = pd.read_csv('smoking_driking_dataset_Ver01.csv', header=0, names=column_names)

# Display the first few rows of the dataframe
#print(df.head())

# Drop rows with any missing values
df = df.dropna()
#len(df)



In [5]:
# Select random 100,000 records from the dataframe
sample_df = df.sample(n=100000, random_state=1)

# Save the sampled dataframe to a new CSV file
sample_file_path = 'smoking_driking_dataset_sampled.csv'
sample_df.to_csv(sample_file_path, index=False)

sample_df.head()
sample_df_no_feature_engg = sample_df.copy()

In [7]:
# FEATURE ENGINEERING

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['SBP', 'DBP', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole', 'hemoglobin',
                      'SGOT_AST', 'SGOT_ALT', 'gamma_GTP']
sample_df[numerical_features] = scaler.fit_transform(sample_df[numerical_features])
#print(sample_df.head())

# new column 'Hearing_State'
sample_df['Hearing_State'] = df.apply(lambda row: 2 if row['hear_left'] == 2 or row['hear_right'] == 2 else 1, axis=1)

# new column 'Sight_State'
#sample_df['Sight_State'] = df.apply(lambda row: 2 if row['sight_left'] == 2 or row['sight_right'] == 2 else 1, axis=1)

# new column 'HDL_LDL_ratio'
sample_df['HDL_LDL_Ratio'] = sample_df['HDL_chole'] / sample_df['LDL_chole']

# new column 'BMI'
sample_df['BMI'] = sample_df['weight'] / ((sample_df['height'] / 100) ** 2)

# new column 'Total_HDL_ratio'
sample_df['Total_HDL_Ratio'] = sample_df['tot_chole'] / sample_df['HDL_chole']

# new column 'Liver_Enzyme_Ratio' (SGOT:SGPT)
sample_df['Liver_Enzyme_Ratio'] = sample_df['SGOT_AST'] / sample_df['SGOT_ALT']

# new column 'Liver_Damage_Score' (SGOT_ALT + gamma_GTP)
sample_df['Liver_Damage_Score'] = df['gamma_GTP'] + df['SGOT_ALT']

#Smoked [0], Never Smoked [1]
# Categorize smoking status
sample_df['Smoking_Status'] = sample_df['SMK_stat_type_cd'].apply(lambda x: '0' if x == 1 else '1')

# Categorize drinking status
sample_df['Drinking_Status'] = sample_df['DRK_YN'].apply(lambda x: '0' if x == 'N' else '1')

#Has ever been into drinking or smoking
sample_df['DorS'] = sample_df.apply(lambda row: 1 if row['Smoking_Status'] == '1' or row['Drinking_Status'] == '1' else 0, axis=1)

sample_df[:5]
#len(sample_df)


Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,DRK_YN,Hearing_State,HDL_LDL_Ratio,BMI,Total_HDL_Ratio,Liver_Enzyme_Ratio,Liver_Damage_Score,Smoking_Status,Drinking_Status,DorS
921688,Female,50,155,60,82.0,0.7,0.5,1.0,1.0,0.521217,...,Y,1,-3.045653,24.973985,0.315012,6.448453,61.0,0,1,1
56603,Female,55,155,50,73.2,1.0,0.8,1.0,1.0,-0.508473,...,N,1,1.127806,20.811655,0.454888,0.080905,32.0,0,0,0
434962,Female,60,160,60,78.0,0.6,0.8,1.0,1.0,-1.744101,...,N,1,-1.037596,23.4375,-1.081401,0.329893,27.0,0,0,0
742617,Male,35,175,80,94.0,1.2,1.2,1.0,1.0,-0.096597,...,Y,1,-1.26111,26.122449,-1.650104,-3.175189,58.0,1,1,1
604547,Male,45,160,60,82.1,0.9,0.8,1.0,1.0,0.864448,...,Y,1,2.887429,23.4375,0.777988,-2.383707,76.0,0,1,1


In [281]:
sample_df_no_feature_engg.columns

Index(['sex', 'age', 'height', 'weight', 'waistline', 'sight_left',
       'sight_right', 'hear_left', 'hear_right', 'SBP', 'DBP', 'BLDS',
       'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin',
       'urine_protein', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT',
       'gamma_GTP', 'SMK_stat_type_cd', 'DRK_YN'],
      dtype='object')

In [285]:
# Drop rows with any missing values
sample_df = sample_df.dropna()
len(sample_df)

100000

In [315]:
### Logistic regression on non-feature engineered dataset

from patsy import dmatrices

# Creating 2 dataframes - one without the DRK_YN column and one without the SMK_stat_type_cd column

sample_df_drinker = sample_df_no_feature_engg.drop(columns=['SMK_stat_type_cd'])
sample_df_smoker = sample_df_no_feature_engg.drop(columns=['DRK_YN'])

# One-hot encoding the drinking status column

sample_df_drinker['DRK_YN'] = sample_df_drinker['DRK_YN'].map({'Y': 1, 'N': 0})

# Converting the smoking status column to binary values - 0 for never smoked, 1 otherwise

sample_df_smoker['SMK_stat_type_cd'] = sample_df_smoker['SMK_stat_type_cd'].map({2: 1, 3:1, 1: 0})

In [319]:
## Logistic regression to determine drinking status, with smoking status removed from the dataset

# Creating design matrices

formula_drinker = 'DRK_YN ~ 0 + C(sex) + age + height + weight + waistline + sight_left + sight_right + hear_left + hear_right + SBP + DBP ' + \
                  ' + BLDS + tot_chole + HDL_chole + LDL_chole + triglyceride + hemoglobin + urine_protein + serum_creatinine ' + \
                  ' + SGOT_AST + SGOT_ALT + gamma_GTP'
Y_drinker, X_drinker = dmatrices(formula_drinker, sample_df_drinker, return_type='dataframe')
y_drinker = Y_drinker['DRK_YN'].values

# Splitting the data into training and testing sets with a 80-20 split ratio

from sklearn.model_selection import train_test_split
X_drinker_train, X_drinker_test, y_drinker_train, y_drinker_test = train_test_split(X_drinker, y_drinker, test_size=0.2, random_state=1)

# Fit the classifier to the training data

from sklearn.linear_model import LogisticRegression
model_drinker = LogisticRegression()
result_drinker = model_drinker.fit(X_drinker_train, y_drinker_train)

# Checking prediction accuracy on training data

from sklearn import metrics

prediction_drinker_train = model_drinker.predict(X_drinker_train)
print(f"Accuracy on training data is {metrics.accuracy_score(y_drinker_train, prediction_drinker_train)*100:.2f}%")

# Checking prediction accuracy on test data

prediction_drinker_test = model_drinker.predict(X_drinker_test)
print(f"Accuracy on test data is {metrics.accuracy_score(y_drinker_test, prediction_drinker_test)*100:.2f}%")

# Feature weightage

weights_drinker = Series(model_drinker.coef_[0],
                 index=X_drinker_test.columns.values)
weights_drinker.sort_values()

Accuracy on training data is 71.14%
Accuracy on test data is 71.13%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C(sex)[Female]     -0.243171
age                -0.037454
waistline          -0.024065
SGOT_ALT           -0.021364
urine_protein      -0.019149
hear_left          -0.014700
height             -0.014518
hear_right         -0.012942
sight_right        -0.007704
tot_chole          -0.003525
BLDS               -0.001814
DBP                -0.001580
SBP                -0.001475
LDL_chole          -0.001365
triglyceride        0.001195
sight_left          0.003336
SGOT_AST            0.010388
HDL_chole           0.017299
gamma_GTP           0.017662
serum_creatinine    0.018657
weight              0.036496
C(sex)[Male]        0.227553
hemoglobin          0.259882
dtype: float64

In [321]:
## Logistic regression to determine smoking status, with drinking status removed from the dataset

# Creating design matrices

formula_smoker = 'SMK_stat_type_cd ~ 0 + C(sex) + age + height + weight + waistline + sight_left + sight_right + hear_left + hear_right + SBP + DBP ' + \
                  ' + BLDS + tot_chole + HDL_chole + LDL_chole + triglyceride + hemoglobin + urine_protein + serum_creatinine ' + \
                  ' + SGOT_AST + SGOT_ALT + gamma_GTP'
Y_smoker, X_smoker = dmatrices(formula_smoker, sample_df_smoker, return_type='dataframe')
y_smoker = Y_smoker['SMK_stat_type_cd'].values

# Splitting the data into training and testing sets with a 80-20 split ratio

from sklearn.model_selection import train_test_split
X_smoker_train, X_smoker_test, y_smoker_train, y_smoker_test = train_test_split(X_smoker, y_smoker, test_size=0.2, random_state=1)

# Fit the classifier to the training data

from sklearn.linear_model import LogisticRegression
model_smoker = LogisticRegression()
result_smoker = model_smoker.fit(X_smoker_train, y_smoker_train)

# Checking prediction accuracy on training data

from sklearn import metrics

prediction_smoker_train = model_smoker.predict(X_smoker_train)
print(f"Accuracy on training data is {metrics.accuracy_score(y_smoker_train, prediction_smoker_train)*100:.2f}%")

# Checking prediction accuracy on test data

prediction_smoker_test = model_smoker.predict(X_smoker_test)
print(f"Accuracy on test data is {metrics.accuracy_score(y_smoker_test, prediction_smoker_test)*100:.2f}%")

# Feature weightage

weights_smoker = Series(model_smoker.coef_[0],
                 index=X_smoker_test.columns.values)
weights_smoker.sort_values()

Accuracy on training data is 77.91%
Accuracy on test data is 78.26%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C(sex)[Female]     -0.774688
waistline          -0.075902
hear_right         -0.041590
hear_left          -0.038174
urine_protein      -0.029692
SBP                -0.027464
tot_chole          -0.016965
sight_left         -0.015895
height             -0.010027
sight_right        -0.005091
SGOT_ALT           -0.004863
SGOT_AST           -0.002042
BLDS               -0.001229
triglyceride        0.003570
gamma_GTP           0.007484
HDL_chole           0.008318
LDL_chole           0.010342
age                 0.016289
DBP                 0.024894
weight              0.070014
serum_creatinine    0.091660
hemoglobin          0.332897
C(sex)[Male]        0.724507
dtype: float64

In [325]:
### Logistic regression on feature engineered dataset

# Creating 2 dataframes - one without the drinking status columns and one without the smoking status columns

sample_df_fe_drinker = sample_df.drop(columns=['SMK_stat_type_cd','Smoking_Status','DorS','DRK_YN'])
sample_df_fe_smoker = sample_df.drop(columns=['DRK_YN','Drinking_Status','DorS','SMK_stat_type_cd'])

# One-hot encoding the drinking status column

sample_df_fe_drinker['Drinking_Status'] = sample_df_fe_drinker['Drinking_Status'].map({'1': 1, '0': 0})

# One-hot encoding the smoking status column

sample_df_fe_smoker['Smoking_Status'] = sample_df_fe_smoker['Smoking_Status'].map({'1': 1, '0': 0})

In [327]:
## Logistic regression to determine drinking status, with smoking status columns removed from the dataset

# Creating design matrices

formula_fe_drinker = 'Drinking_Status ~ 0 + C(sex) + age + height + weight + waistline + sight_left + sight_right + hear_left + hear_right + SBP + DBP ' + \
                  ' + BLDS + tot_chole + HDL_chole + LDL_chole + triglyceride + hemoglobin + urine_protein + serum_creatinine ' + \
                  ' + SGOT_AST + SGOT_ALT + gamma_GTP + Hearing_State + HDL_LDL_Ratio + BMI + Total_HDL_Ratio + Liver_Enzyme_Ratio + Liver_Damage_Score'
Y_fe_drinker, X_fe_drinker = dmatrices(formula_fe_drinker, sample_df_fe_drinker, return_type='dataframe')
y_fe_drinker = Y_fe_drinker['Drinking_Status'].values

# Splitting the data into training and testing sets with a 80-20 split ratio

from sklearn.model_selection import train_test_split
X_fe_drinker_train, X_fe_drinker_test, y_fe_drinker_train, y_fe_drinker_test = train_test_split(X_fe_drinker, y_fe_drinker, test_size=0.2, random_state=1)

# Fit the classifier to the training data

from sklearn.linear_model import LogisticRegression
model_fe_drinker = LogisticRegression()
result_fe_drinker = model_fe_drinker.fit(X_fe_drinker_train, y_fe_drinker_train)

# Checking prediction accuracy on training data

from sklearn import metrics

prediction_fe_drinker_train = model_fe_drinker.predict(X_fe_drinker_train)
print(f"Accuracy on training data is {metrics.accuracy_score(y_fe_drinker_train, prediction_fe_drinker_train)*100:.2f}%")

# Checking prediction accuracy on test data

prediction_fe_drinker_test = model_fe_drinker.predict(X_fe_drinker_test)
print(f"Accuracy on test data is {metrics.accuracy_score(y_fe_drinker_test, prediction_fe_drinker_test)*100:.2f}%")

# Feature weightage

weights_fe_drinker = Series(model_fe_drinker.coef_[0],
                 index=X_fe_drinker_test.columns.values)
weights_fe_drinker.sort_values()

Accuracy on training data is 71.61%
Accuracy on test data is 71.47%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SGOT_ALT             -0.336782
C(sex)[Female]       -0.243606
LDL_chole            -0.126947
BMI                  -0.092042
SGOT_AST             -0.044302
age                  -0.037948
urine_protein        -0.011203
sight_right          -0.006312
hear_left            -0.004899
hear_right           -0.000733
waistline            -0.000113
HDL_LDL_Ratio        -0.000008
Total_HDL_Ratio       0.000013
Hearing_State         0.000147
triglyceride          0.001285
serum_creatinine      0.005729
height                0.006821
Liver_Damage_Score    0.008362
BLDS                  0.009977
sight_left            0.013250
Liver_Enzyme_Ratio    0.015484
tot_chole             0.034343
weight                0.035500
SBP                   0.081778
gamma_GTP             0.157944
DBP                   0.160561
C(sex)[Male]          0.240006
hemoglobin            0.255403
HDL_chole             0.403736
dtype: float64

In [329]:
## Logistic regression to determine smoking status, with drinking status columns removed from the dataset

# Creating design matrices

formula_fe_smoker = 'Smoking_Status ~ 0 + C(sex) + age + height + weight + waistline + sight_left + sight_right + hear_left + hear_right + SBP + DBP ' + \
                  ' + BLDS + tot_chole + HDL_chole + LDL_chole + triglyceride + hemoglobin + urine_protein + serum_creatinine ' + \
                  ' + SGOT_AST + SGOT_ALT + gamma_GTP + Hearing_State + HDL_LDL_Ratio + BMI + Total_HDL_Ratio + Liver_Enzyme_Ratio + Liver_Damage_Score'
Y_fe_smoker, X_fe_smoker = dmatrices(formula_fe_smoker, sample_df_fe_smoker, return_type='dataframe')
y_fe_smoker = Y_fe_smoker['Smoking_Status'].values
#Y_fe_smoker

# Splitting the data into training and testing sets with a 80-20 split ratio

from sklearn.model_selection import train_test_split
X_fe_smoker_train, X_fe_smoker_test, y_fe_smoker_train, y_fe_smoker_test = train_test_split(X_fe_smoker, y_fe_smoker, test_size=0.2, random_state=1)

# Fit the classifier to the training data

from sklearn.linear_model import LogisticRegression
model_fe_smoker = LogisticRegression()
result_fe_smoker = model_fe_smoker.fit(X_fe_smoker_train, y_fe_smoker_train)

# Checking prediction accuracy on training data

from sklearn import metrics

prediction_fe_smoker_train = model_fe_smoker.predict(X_fe_smoker_train)
print(f"Accuracy on training data is {metrics.accuracy_score(y_fe_smoker_train, prediction_fe_smoker_train)*100:.2f}%")

# Checking prediction accuracy on test data

prediction_fe_smoker_test = model_fe_smoker.predict(X_fe_smoker_test)
print(f"Accuracy on test data is {metrics.accuracy_score(y_fe_smoker_test, prediction_fe_smoker_test)*100:.2f}%")

# Feature weightage

weights_fe_smoker = Series(model_fe_smoker.coef_[0],
                 index=X_fe_smoker_test.columns.values)
weights_fe_smoker.sort_values()

Accuracy on training data is 78.93%
Accuracy on test data is 79.04%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C(sex)[Female]       -0.658743
BMI                  -0.271289
SGOT_ALT             -0.188731
HDL_chole            -0.098581
tot_chole            -0.056197
SGOT_AST             -0.048522
SBP                  -0.040734
LDL_chole            -0.037996
height               -0.016323
urine_protein        -0.006151
hear_right           -0.003311
hear_left            -0.001503
HDL_LDL_Ratio         0.000674
Total_HDL_Ratio       0.000830
triglyceride          0.001791
Hearing_State         0.002556
Liver_Damage_Score    0.004023
waistline             0.013981
sight_left            0.014675
DBP                   0.014687
age                   0.015182
sight_right           0.028255
BLDS                  0.040588
Liver_Enzyme_Ratio    0.042560
serum_creatinine      0.091954
weight                0.092834
gamma_GTP             0.098920
C(sex)[Male]          0.648428
hemoglobin            0.667734
dtype: float64

In [331]:
## Logistic regression to determine smoking OR drinking status, with constituent columns removed from the dataset

# Creating a dataframe without the constituent columns

sample_df_drinker_or_smoker = sample_df.drop(columns=['SMK_stat_type_cd','Smoking_Status','DRK_YN','Drinking_Status'])

# Creating design matrices

formula_drinker_or_smoker = 'DorS ~ 0 + C(sex) + age + height + weight + waistline + sight_left + sight_right + hear_left + hear_right + SBP + DBP ' + \
                  ' + BLDS + tot_chole + HDL_chole + LDL_chole + triglyceride + hemoglobin + urine_protein + serum_creatinine ' + \
                  ' + SGOT_AST + SGOT_ALT + gamma_GTP + Hearing_State + HDL_LDL_Ratio + BMI + Total_HDL_Ratio + Liver_Enzyme_Ratio + Liver_Damage_Score'
Y_drinker_or_smoker, X_drinker_or_smoker = dmatrices(formula_drinker_or_smoker, sample_df_drinker_or_smoker, return_type='dataframe')
y_drinker_or_smoker = Y_drinker_or_smoker['DorS'].values

# Splitting the data into training and testing sets with a 80-20 split ratio

from sklearn.model_selection import train_test_split
X_drinker_or_smoker_train, X_drinker_or_smoker_test, y_drinker_or_smoker_train, y_drinker_or_smoker_test = train_test_split(X_drinker_or_smoker, y_drinker_or_smoker, test_size=0.2, random_state=1)

# Fit the classifier to the training data

from sklearn.linear_model import LogisticRegression
model_drinker_or_smoker = LogisticRegression()
result_drinker_or_smoker = model_drinker_or_smoker.fit(X_drinker_or_smoker_train, y_drinker_or_smoker_train)

# Checking prediction accuracy on training data

from sklearn import metrics

prediction_drinker_or_smoker_train = model_drinker_or_smoker.predict(X_drinker_or_smoker_train)
print(f"Accuracy on training data is {metrics.accuracy_score(y_drinker_or_smoker_train, prediction_drinker_or_smoker_train)*100:.2f}%")

# Checking prediction accuracy on test data

prediction_drinker_or_smoker_test = model_drinker_or_smoker.predict(X_drinker_or_smoker_test)
print(f"Accuracy on test data is {metrics.accuracy_score(y_drinker_or_smoker_test, prediction_drinker_or_smoker_test)*100:.2f}%")

# Feature weightage

weights_drinker_or_smoker = Series(model_drinker_or_smoker.coef_[0],
                 index=X_drinker_or_smoker_test.columns.values)
weights_drinker_or_smoker.sort_values()


Accuracy on training data is 78.22%
Accuracy on test data is 78.33%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C(sex)[Female]       -0.527705
SGOT_ALT             -0.316667
BMI                  -0.144749
LDL_chole            -0.109499
SGOT_AST             -0.072609
age                  -0.027574
HDL_LDL_Ratio        -0.000030
Total_HDL_Ratio       0.000013
waistline             0.000312
triglyceride          0.001256
hear_left             0.004148
urine_protein         0.004417
SBP                   0.004509
hear_right            0.004788
height                0.006382
Liver_Damage_Score    0.006998
tot_chole             0.007029
sight_right           0.008135
Hearing_State         0.010254
sight_left            0.013955
BLDS                  0.025119
Liver_Enzyme_Ratio    0.033161
weight                0.058072
serum_creatinine      0.060025
DBP                   0.101870
gamma_GTP             0.150221
HDL_chole             0.299609
hemoglobin            0.504621
C(sex)[Male]          0.522945
dtype: float64