In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import RocCurveDisplay
from sklearn import metrics

import pickle
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Read the prepared survey table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="final_data.csv")
df.head(n=5)

Unnamed: 0,gender,age(year),race_and_hispanic_origin,country_of_birth,usa_citizenship,highest_education_grade_received,marital_status,no_of_people_in_the_household,family_size,total_family_income,veteran_status,e_cigarette_use
0,2.0,48.0,3.0,1.0,1.0,3.0,1.0,4.0,4.0,5.0,2.0,Did not smoke
1,1.0,16.0,4.0,1.0,1.0,4.0,1.0,4.0,4.0,7.0,2.0,Did not smoke
2,1.0,64.0,4.0,1.0,1.0,4.0,1.0,3.0,3.0,7.0,2.0,Did not smoke
3,1.0,61.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,4.0,1.0,Did not smoke
4,1.0,31.0,4.0,1.0,1.0,4.0,5.0,4.0,1.0,7.0,1.0,Smoked


In [3]:
# Optional class-balance check on the target column (currently disabled):
# df["e_cigarette_use"].value_counts()

In [4]:
# Feature matrix: every column except the target and "Unnamed: 0".
# "Unnamed: 0" is the name pandas gives a header-less first CSV column; in the
# preview it holds 0, 1, 2, ... i.e. the saved row index, not a survey variable,
# so it must not be fed to the models as a feature.
# errors="ignore" keeps this cell working if the CSV is re-exported without it.
X = df.drop(columns=["e_cigarette_use", "Unnamed: 0"], errors="ignore")

# Binary target: 1 = "Smoked", 0 = "Did not smoke".
y = df["e_cigarette_use"].map({"Smoked": 1, "Did not smoke": 0})

# Series.map turns any label missing from the mapping into NaN; stop here with a
# clear message instead of letting NaN targets reach the split / fit steps.
if y.isna().any():
    raise ValueError(
        "e_cigarette_use contains labels other than 'Smoked' / 'Did not smoke'"
    )

In [5]:
# Hold out 20% of the rows for testing; the split is seeded and stratified on y
# so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Base estimators handed to the grid searches. Apart from the raised iteration
# cap on the logistic regression, both use their constructor defaults.
model_svc = SVC()
model_log_rex = LogisticRegression(max_iter=1000)

In [7]:
# Candidate values for the regularisation parameter C, one per decade.
# Each value is listed once: a repeated entry is a separate grid candidate and
# costs another 10 fits (cv=10) without being able to change the selected model.
C_VALUES = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Separate list objects so editing one grid later cannot silently alter the other.
param_grid_svc = {'C': list(C_VALUES)}

param_grid_log_rex = {'C': list(C_VALUES)}

# 10-fold cross-validated searches over C for each base estimator.
svc = GridSearchCV(model_svc, param_grid_svc, cv=10)
logrex = GridSearchCV(model_log_rex, param_grid_log_rex, cv=10)

In [8]:
# Run the logistic-regression grid search on the training split.
logrex.fit(X=X_train, y=y_train)

In [None]:
# Run the SVC grid search on the training split.
svc.fit(X=X_train, y=y_train)

In [None]:
# Pull the winning hyper-parameters and the corresponding fitted estimator out of
# the logistic-regression grid search.
best_params_log_rex = logrex.best_params_
best_model_log_rex = logrex.best_estimator_

# Evaluate that estimator on the held-out test split. This must use
# best_model_log_rex: a bare `best_model` is not assigned by any visible cell,
# so referencing it raises NameError on a fresh kernel.
test_accuracy_log_rex = best_model_log_rex.score(X_test, y_test)

print(best_params_log_rex)
print(test_accuracy_log_rex)

In [None]:
# Pull the winning hyper-parameters and the corresponding fitted estimator out of
# the SVC grid search.
best_params_svc = svc.best_params_
best_model_svc = svc.best_estimator_

# Evaluate that estimator on the held-out test split. This must use
# best_model_svc: a bare `best_model` is not assigned by any visible cell, and if
# such a name did linger in the kernel it would score the wrong model.
test_accuracy_svc = best_model_svc.score(X_test, y_test)

print(best_params_svc)
print(test_accuracy_svc)

In [None]:
# Persist both selected estimators. Both files are opened before anything is
# written, and each model is serialised into its own pickle.
with open("log_rex_model.pkl", "wb") as log_rex_file:
    with open("svc_model.pkl", "wb") as svc_file:
        pickle.dump(best_model_log_rex, log_rex_file)
        pickle.dump(best_model_svc, svc_file)

In [None]:
# Disabled round-trip check: reload the pickled logistic-regression model and
# re-score it on the test split.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles written by this notebook.
# with open("log_rex_model.pkl", mode="rb") as file:
#     my_model = pickle.load(file)
    
# my_model.score(X_test, y_test)

In [None]:
# Disabled manual ROC plot for `my_model`, which is only created by the
# commented-out reload cell.
# NOTE(review): roc_curve is given hard predict() labels here, which produces a
# single-threshold curve; pass decision_function / predict_proba scores instead
# if this cell is ever re-enabled.
# predictions = my_model.predict(X_test)
# predictions

# fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions)
# roc_auc = metrics.auc(fpr, tpr)
# display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Logistic Regression')
# display.plot()
# plt.show()

In [None]:
# ROC curve of the selected SVC on the held-out test split.
fig, ax = plt.subplots(figsize=(6, 4))
RocCurveDisplay.from_estimator(best_model_svc, X_test, y_test, ax=ax)
ax.set(xlabel="False Positive Rate", ylabel="True Positive Rate")
fig.tight_layout()

In [None]:
# ROC curve of the selected logistic-regression model on the held-out test split.
fig, ax = plt.subplots(figsize=(6, 4))
RocCurveDisplay.from_estimator(best_model_log_rex, X_test, y_test, ax=ax)
ax.set(xlabel="False Positive Rate", ylabel="True Positive Rate")
fig.tight_layout()