In [1]:
import statsmodels.api as sm
import pandas as pd 
from sklearn.feature_selection import VarianceThreshold

# In order to generate the model for the `e-cigarette use`, the dataset was further processed by dropping redundant features, and renaming the columns for easy referencing 

In [2]:
df = pd.read_csv("cleaned_data/final_combined_datasets.csv")

df.drop(["SEQN", "DMDHHSZA", "DMDHHSZB", "DMDHRGND", "DMDHRAGE", "DMDHRGND", "DMDHRBR4", "DMDHREDU", "DMDHRMAR",
        "DMDHSEDU", "MCQ010", "MCQ053", "SDDSRVYR", "RIDSTATR", "RIDRETH1", "AIALANGA", "RIDEXMON", "WTINT2YR",
        "DMDHHSZE", "WTMEC2YR", "SDMVPSU", "SDMVSTRA", "SMQ849", "INDHHIN2", "INDFMPIR", ], 
        inplace=True, axis=1)

df.rename(columns={"DMQMILIZ": "veteran_status", 
                   "RIAGENDR": "gender", 
                   "RIDAGEYR": "age(year)", 
                   "RIDRETH3": "race_and_hispanic_origin", 
                   "DMDBORN4": "country_of_birth", 
                   "DMDCITZN": "usa_citizenship", 
                   "DMDEDUC2": "highest_education_grade_received", 
                   "DMDMARTL": "marital_status", 
                   "DMDHHSIZ": "no_of_people_in_the_household", 
                   "DMDFMSIZ": "family_size", 
                   "INDFMIN2": "total_family_income",
                   "MCQ220": "presence_of_cancer/malignancy", 
#                    "SMQ849": "no_of_times_smoked_e_cigarette_last_5_days", 
#                    "INDHHIN2": "total_household_income",
#                    "INDFMPIR": "ratio_of_family_income_to_guideline" 
#                    "DMDHHSZE": "no_of_adults_>=_60yrs_in_the_household", 
#                    "WTINT2YR": "full_sample_2_year_interview_weight", 
#                    "WTMEC2YR": "full_sample_2_year_MEC_exam_weight",  
#                    "SDMVPSU": "pseudo-PSU variance", 
#                    "SDMVSTRA": "pseudo_stratum_variance", 
                  }, inplace = True)

In [3]:
df = df.sample(frac=1)
df

Unnamed: 0,gender,age(year),race_and_hispanic_origin,country_of_birth,usa_citizenship,highest_education_grade_received,marital_status,no_of_people_in_the_household,family_size,total_family_income,veteran_status,presence_of_cancer/malignancy,e_cigarette_use
4782,2.0,48.0,3.0,1.0,1.0,3.0,1.0,4.0,4.0,5.0,2.0,2.0,Did not smoke
4498,1.0,16.0,4.0,1.0,1.0,4.0,1.0,4.0,4.0,7.0,2.0,2.0,Did not smoke
9574,1.0,64.0,4.0,1.0,1.0,4.0,1.0,3.0,3.0,7.0,2.0,2.0,Did not smoke
7083,1.0,61.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,4.0,1.0,2.0,Did not smoke
4609,1.0,31.0,4.0,1.0,1.0,4.0,5.0,4.0,1.0,7.0,1.0,2.0,Smoked
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6129,1.0,34.0,3.0,2.0,1.0,2.0,5.0,2.0,2.0,3.0,2.0,2.0,Did not smoke
1966,2.0,67.0,6.0,2.0,1.0,3.0,1.0,6.0,6.0,15.0,2.0,2.0,Did not smoke
288,2.0,36.0,6.0,2.0,1.0,3.0,5.0,5.0,5.0,7.0,2.0,2.0,Smoked
9961,1.0,45.0,3.0,2.0,1.0,5.0,1.0,3.0,3.0,99.0,2.0,2.0,Did not smoke


In [5]:
# df.to_csv("../e_cigarette_model_bulding/prepared_data.csv", index=False)

In [4]:
codified_e_cigarette_use = []
for name in df["e_cigarette_use"]:
    if name == "Smoked":
        codified_e_cigarette_use.append(1)
    else:
        codified_e_cigarette_use.append(0)
        
df["e_cigarette_use"] = codified_e_cigarette_use

In [5]:
df["e_cigarette_use"].value_counts()

0    13508
1      215
Name: e_cigarette_use, dtype: int64

In [6]:
df.head(3)

Unnamed: 0,gender,age(year),race_and_hispanic_origin,country_of_birth,usa_citizenship,highest_education_grade_received,marital_status,no_of_people_in_the_household,family_size,total_family_income,veteran_status,presence_of_cancer/malignancy,e_cigarette_use
26,2.0,50.0,4.0,2.0,1.0,4.0,6.0,2.0,1.0,6.0,1.0,2.0,0
5448,2.0,74.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,77.0,2.0,1.0,0
10391,1.0,66.0,2.0,2.0,1.0,5.0,1.0,3.0,3.0,15.0,2.0,2.0,0


# Feature Selection was performed to remove low variance features that could otherwise affect the results of the machine learning model

In [7]:
def variance_threshold_selector(data, threshold=0.00):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

In [8]:
df = variance_threshold_selector(df)
df.to_csv("cleaned_data/low_variance_data.csv", index=False)

In [9]:
df.head(2)

Unnamed: 0,gender,age(year),race_and_hispanic_origin,country_of_birth,usa_citizenship,highest_education_grade_received,marital_status,no_of_people_in_the_household,family_size,total_family_income,veteran_status,presence_of_cancer/malignancy,e_cigarette_use
26,2.0,50.0,4.0,2.0,1.0,4.0,6.0,2.0,1.0,6.0,1.0,2.0,0
5448,2.0,74.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,77.0,2.0,1.0,0


In [10]:
df.head(2)

Unnamed: 0,gender,age(year),race_and_hispanic_origin,country_of_birth,usa_citizenship,highest_education_grade_received,marital_status,no_of_people_in_the_household,family_size,total_family_income,veteran_status,presence_of_cancer/malignancy,e_cigarette_use
26,2.0,50.0,4.0,2.0,1.0,4.0,6.0,2.0,1.0,6.0,1.0,2.0,0
5448,2.0,74.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,77.0,2.0,1.0,0


In [11]:
independent_variable = df.pop("e_cigarette_use")


dependent_variables = df.copy() 

# Now, let's check the value counts of participants that smoked E-cigarette in the last 5 days and those that did not smoke

In [12]:
independent_variable.value_counts()

0    13508
1      215
Name: e_cigarette_use, dtype: int64

# Model Creation

In [13]:
results = sm.Logit(independent_variable, dependent_variables).fit()

Optimization terminated successfully.
         Current function value: 0.076673
         Iterations 10


In [14]:
print(results.summary())

                           Logit Regression Results                           
Dep. Variable:        e_cigarette_use   No. Observations:                13723
Model:                          Logit   Df Residuals:                    13711
Method:                           MLE   Df Model:                           11
Date:                Fri, 31 Mar 2023   Pseudo R-squ.:                 0.04942
Time:                        19:10:57   Log-Likelihood:                -1052.2
converged:                       True   LL-Null:                       -1106.9
Covariance Type:            nonrobust   LLR p-value:                 2.410e-18
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
gender                              -0.4472      0.145     -3.087      0.002      -0.731      -0.163
age(year)                           -0.0189      0.004     -5.150

From the Logistic regression performed `gender`, `age(year)`, `usa_citizenship`, `highest_education_grade_received`, `no_of_people_in_the_household`, `family_size`,  all have p-value less than 0.05 thus are statistically significant in accounting for e-cigarette use while `country_of_birth`, `race_and_hispanic_origin`, `marital_status`, `no_of_people_in_the_household`, `no_of_adults_>=_60yrs_in_the_household`, `full_sample_2_year_interview_weight`, `full_sample_2_year_MEC_exam_weight`, `pseudo-PSU variance`, `pseudo_stratum_variance`, `total_household_income`, `total_family_income`, and `ratio_of_family_income_to_guideline` all have pvalue greater than 0.05 and thus are statistically insignificant in accounting for e-cigarette use.

**Based on the statistically significant variable, the e-cigarette use model can be expressed as:**

`e_ciggarette_use` = `-3.0935(gender)` + `0.0751(age)` - `1.8116(country_of_birth)` - `2.1672(usa_citizenship)` + `0.3428(highest_education_grade_received)` + `0.1432(no_of_people_in_the_household)` - `0.1680(no_of_adults_>=_60yrs_in_the_household)`