In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report
from sklearn.exceptions import ConvergenceWarning
# Silence solver-convergence warnings raised by scikit-learn.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# NOTE(review): this catch-all suppresses *every* warning (deprecations
# included), which makes the targeted filter above redundant. It is kept so
# the notebook output stays as quiet as before; consider removing it.
warnings.filterwarnings("ignore")
import random
import pickle

# Seed both random number generators. random.seed() only affects Python's
# own `random` module; scikit-learn estimators created without an explicit
# random_state draw from NumPy's global RNG, so that one is seeded as well.
random.seed(100)
np.random.seed(100)

In [2]:
# Read the down-sampled, feature-selected table; it holds both targets.
df = pd.read_csv("datasets/downsampled_dataset_after_feature_selection.csv")

# Pull out the smoking and drinking targets; every remaining column is a feature.
y_smoke = df['SMK_stat_type_cd']
y_drink = df['DRK_YN']
X = df.drop(columns=['SMK_stat_type_cd', 'DRK_YN'])

# Hold out 20% of the rows as a test set, once per target. Both calls use
# the same test_size and random_state.
x_smk_train, x_smk_test, y_smk_train, y_smk_test = train_test_split(
    X, y_smoke, random_state=42, test_size=0.2
)
x_dk_train, x_dk_test, y_dk_train, y_dk_test = train_test_split(
    X, y_drink, random_state=42, test_size=0.2
)

In [9]:
# Fixed seed for the tree/boosting ensembles, whose fitting involves
# randomness. Python's random.seed() does not reach scikit-learn, so without
# an explicit random_state these models (and the pickles written from them)
# would differ from run to run.
MODEL_RANDOM_STATE = 100


def fit_for_both_targets(make_model):
    """Fit one fresh estimator per target and return the pair.

    Parameters
    ----------
    make_model : callable
        Zero-argument factory returning a new, unfitted estimator. It is
        called twice so the two targets never share an estimator instance.

    Returns
    -------
    tuple
        ``(drinking_model, smoking_model)``, fitted on the drinking and the
        smoking training split respectively.
    """
    drinking_model = make_model().fit(x_dk_train, y_dk_train)
    smoking_model = make_model().fit(x_smk_train, y_smk_train)
    return drinking_model, smoking_model


# Logistic Regression (generous iteration cap so the solver can converge)
logistic_dk_model, logistic_smk_model = fit_for_both_targets(
    lambda: LogisticRegression(max_iter=10000)
)

# GradientBoosting
gb_dk_model, gb_smk_model = fit_for_both_targets(
    lambda: GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE)
)

# SVM
svc_dk_model, svc_smk_model = fit_for_both_targets(lambda: SVC())

# RandomForest
rf_dk_model, rf_smk_model = fit_for_both_targets(
    lambda: RandomForestClassifier(random_state=MODEL_RANDOM_STATE)
)

# AdaBoost
ada_dk_model, ada_smk_model = fit_for_both_targets(
    lambda: AdaBoostClassifier(random_state=MODEL_RANDOM_STATE)
)


In [13]:
# Directory that receives one pickle per fitted model.
MODEL_DIR = './saved_stacked_models/'

# (file name, fitted model) pairs to persist.
pkl_filename = [("LogisticPickleDrinking.pkl",logistic_dk_model),("LogisticPickleSmoking.pkl",logistic_smk_model),
                ("GradientBoostPickleDrinking.pkl",gb_dk_model),("GradientBoostPickleSmoking.pkl",gb_smk_model),
                ("SVMPickleDrinking.pkl",svc_dk_model),("SVMPickleSmoking.pkl",svc_smk_model),
                ("RandomForestPickleDrinking.pkl", rf_dk_model), ("RandomForestPickleSmoking.pkl", rf_smk_model),
                ("AdaBoostPickleDrinking.pkl",ada_dk_model),("AdaBoostPickleSmoking.pkl",ada_smk_model)]

# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError on the first open().
os.makedirs(MODEL_DIR, exist_ok=True)

# Iterate over the pairs directly instead of a hard-coded range(10), so the
# loop stays correct when models are added to or removed from the list.
for filename, model in pkl_filename:
    with open(MODEL_DIR + filename, 'wb') as file:
        pickle.dump(model, file)
    # Report success only after the pickle has actually been written.
    print(f"{filename}", "saved.")

LogisticPickleDrinking.pkl saved.
LogisticPickleSmoking.pkl saved.
GradientBoostPickleDrinking.pkl saved.
GradientBoostPickleSmoking.pkl saved.
SVMPickleDrinking.pkl saved.
SVMPickleSmoking.pkl saved.
RandomForestPickleDrinking.pkl saved.
RandomForestPickleSmoking.pkl saved.
AdaBoostPickleDrinking.pkl saved.
AdaBoostPickleSmoking.pkl saved.


In [3]:
# Load from file
# NOTE: pickle.load can execute arbitrary code while deserialising; only
# load pickles written by this notebook or another trusted source.
with open('./saved_stacked_models/AdaBoostPickleDrinking.pkl', 'rb') as file:
    pickle_model = pickle.load(file)

# Predict once and derive the accuracy from those predictions. A classifier's
# score() computes the same accuracy but runs predict() internally, so calling
# score() and then predict() would run inference on the test set twice.
dk_y_predict = pickle_model.predict(x_dk_test)
dk_score = accuracy_score(y_dk_test, dk_y_predict)
print("Test score: {0:.2f} %".format(100 * dk_score))

Test score: 67.06 %
