In [1]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer

In [2]:
#Import data
data_path = "../data/preprocessed"
train_path = os.path.join(data_path, "train.csv")
test_path = os.path.join(data_path, "test.csv")

#Load data mappings
mappings = json.load(open("data_mapping.json", "r"))

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

df = pd.concat([train_df, test_df], axis=0)
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = mappings['drugs']),
                                     df[mappings['drugs']],
                                     test_size=0.2,
                                     random_state=522)

    

In [3]:
#column transformer
preprocessor =  make_column_transformer(
        (StandardScaler(), mappings['numerical']),
        (OrdinalEncoder(categories = [
                                        list(mappings['categories']["Age"].values()), 
                                        list(mappings['categories']["Education"].values()),
                                        list(mappings['categories']["Impulsiveness"].values()),
                                        list(mappings['categories']["SensationSeeking"].values()) 
                                        ]), mappings['ordinal']),
        (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), mappings['categorical']),
        ("drop", mappings['drop'])
    )

# Custom transformations

## Make custom scoring functions

In [4]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
scorers = {
            'f1_score': make_scorer(f1_score, average='micro'),
            'precision_score': make_scorer(precision_score, average='micro'),
            'recall_score': make_scorer(recall_score, average='micro'),
            # 'accuracy_score': make_scorer(accuracy_score)
          }

## DummyClassifier

In [5]:
dummy_cv_results = {}
# Get the mean accuracy for each drug
for drug in mappings['drugs']: 
    dc = DummyClassifier(strategy="stratified")
    # dc.fit(X_train, y_train[drug])
    dummy_cv_results[drug] = pd.DataFrame(
                                cross_validate(dc, 
                                               X_train,
                                               y_train[drug],
                                               return_train_score = True,
                                               scoring=scorers)).mean().round(4)

class_results = pd.DataFrame(dummy_cv_results)
class_results = class_results.drop(index = ["fit_time", "score_time"]).T
class_results = class_results.reset_index()
class_results = class_results.rename(columns = {"index": "target_drug"})
class_results = class_results.set_index("target_drug").T



In [6]:
class_results

target_drug,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
test_f1_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
train_f1_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041
test_precision_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
train_precision_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041
test_recall_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
train_recall_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041


## Creation of SVC pipeline

In [7]:
svc_pipe =  make_pipeline(
    preprocessor, 
    SVC()
)

param_dist = {
    "svc__class_weight": ["balanced", None],
    "svc__gamma": 10.0 ** np.arange(-4, 4),
    "svc__C": 10.0 ** np.arange(-4, 4)
}

# Save the best model and score for each drug
svc_best_estimator = {}
svc_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(svc_pipe, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 10,
                                       cv = 5, 
                                       return_train_score = True,
                                       scoring=scorers,
                                       refit="f1_score",
                                       random_state = 522
                    )
    random_search.fit(X_train, y_train[drug])
    svc_best_estimator[drug] = random_search.best_estimator_
    svc_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                    ]
    
score_by_drug = pd.DataFrame(svc_best_score_by_drug).T
score_by_drug = score_by_drug.reset_index()
score_by_drug = score_by_drug.rename(columns = {"index": "target_drug",
                                                0: "svc_train_f1_score",
                                                1: 'svc_test_f1_score',
                                                2: 'svc_train_recall_score',
                                                3: 'svc_test_recall_score',
                                                4: 'svc_train_precision_score',
                                                5: 'svc_test_precision_score'
                                }).set_index("target_drug").T
class_results = pd.concat( [class_results,score_by_drug] ,axis=0)



## Creation of Logistic Regression pipeline

In [8]:
lr_pipe =  make_pipeline(
    preprocessor, 
    LogisticRegression(random_state=522, 
                       max_iter=10000,
                       multi_class="multinomial")
)

param_dist = {
    "logisticregression__class_weight": ["balanced", None],
    "logisticregression__C": 10.0 ** np.arange(-4, 4)
}

# Save the best model and score for each drug
lr_best_estimator = {}
lr_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(lr_pipe, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 10,
                                       cv = 5, 
                                       return_train_score = True,
                                       random_state = 522,
                                       scoring=scorers,
                                       refit='f1_score'
                    )
    random_search.fit(X_train, y_train[drug])
    lr_best_estimator[drug] = random_search.best_estimator_
    lr_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                ]
    
lr_score_by_drug = pd.DataFrame(lr_best_score_by_drug).T
lr_score_by_drug = lr_score_by_drug.reset_index()
lr_score_by_drug = lr_score_by_drug.rename(columns = {"index": "target_drug",
                                                0: "lr_train_f1_score",
                                                1: 'lr_test_f1_score',
                                                2: 'lr_train_recall_score',
                                                3: 'lr_test_recall_score',
                                                4: 'lr_train_precision_score',
                                                5: 'lr_test_precision_score'
                                }).set_index("target_drug").T
class_results = pd.concat( [class_results, lr_score_by_drug] ,axis=0)



## Creation of KNN pipeline

In [9]:
knn_pipe =  make_pipeline(
    preprocessor, 
    KNeighborsClassifier(n_jobs=-1)
)

param_dist = {
    "kneighborsclassifier__weights": ["uniform", "distance"],
    "kneighborsclassifier__n_neighbors": [i for i in range(5, 26, 2)]
}

# Save the best model and score for each drug
knn_best_estimator = {}
knn_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(knn_pipe, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 20,
                                       cv = 5, 
                                       return_train_score = True,
                                       random_state = 522,
                                       scoring=scorers,
                                       refit='f1_score'
                    )
    random_search.fit(X_train, y_train[drug])
    knn_best_estimator[drug] = random_search.best_estimator_
    knn_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                ]
    
knn_score_by_drug = pd.DataFrame(knn_best_score_by_drug).T
knn_score_by_drug = knn_score_by_drug.reset_index()
knn_score_by_drug = knn_score_by_drug.rename(columns = {"index": "target_drug",
                                                0: "knn_train_f1_score",
                                                1: 'knn_test_f1_score',
                                                2: 'knn_train_recall_score',
                                                3: 'knn_test_recall_score',
                                                4: 'knn_train_precision_score',
                                                5: 'knn_test_precision_score'
                                }).set_index("target_drug").T

class_results = pd.concat( [class_results, knn_score_by_drug] ,axis=0)



## Decision tree classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dt_pipe =  make_pipeline(
    preprocessor, 
    DecisionTreeClassifier(random_state=522,
                           )
)

param_dist = {
    "decisiontreeclassifier__max_depth": [i for i in range(3, 20)],
}

# Save the best model and score for each drug
dt_best_estimator = {}
dt_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = GridSearchCV(dt_pipe, 
                                    #    param_distributions = param_dist,
                                    param_grid=param_dist,
                                       n_jobs = -1,
                                    #    n_iter = 20,
                                       cv = 5, 
                                       return_train_score = True,
                                    #    random_state = 522,
                                       scoring=scorers,
                                       refit='f1_score'
                    )
    random_search.fit(X_train, y_train[drug])
    dt_best_estimator[drug] = random_search.best_estimator_
    dt_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                ]
    
dt_score_by_drug = pd.DataFrame(dt_best_score_by_drug).T
dt_score_by_drug = dt_score_by_drug.reset_index()
dt_score_by_drug = dt_score_by_drug.rename(columns = {"index": "target_drug",
                                                0: "dt_train_f1_score",
                                                1: 'dt_test_f1_score',
                                                2: 'dt_train_recall_score',
                                                3: 'dt_test_recall_score',
                                                4: 'dt_train_precision_score',
                                                5: 'dt_test_precision_score'
                                }).set_index("target_drug").T

class_results = pd.concat( [class_results, dt_score_by_drug] ,axis=0)



## NN classifier

In [11]:
from sklearn.neural_network import MLPClassifier

#Creation of NN pipeline
nn_pipe =  make_pipeline(
    preprocessor, 
    MLPClassifier((32,32,16),
                 learning_rate="invscaling",
                 random_state=522,
                 max_iter=5000,
                 batch_size=8,
                 learning_rate_init=0.00001,
                 early_stopping=True
                 )
)

# Save the best model and score for each drug
nn_best_estimator = {}
nn_best_score_by_drug = {}

for drug in mappings['drugs']:
    print(f"Drug: {drug}")
    nn_pipe.fit(X_train, y_train[drug])
    
    nn_best_estimator[drug] = nn_pipe
    nn_best_score_by_drug[drug] = [round(f1_score(y_train[drug], nn_pipe.predict(X_train), average='micro'), 4), 
                                    round(f1_score(y_test[drug], nn_pipe.predict(X_test), average='micro'), 4),
                                    round(recall_score(y_train[drug], nn_pipe.predict(X_train), average='micro'), 4),
                                    round(recall_score(y_test[drug], nn_pipe.predict(X_test), average='micro'), 4),
                                    round(precision_score(y_train[drug], nn_pipe.predict(X_train), average='micro'), 4),
                                    round(precision_score(y_test[drug], nn_pipe.predict(X_test), average='micro'), 4)
                                ]
    
score_by_nn = pd.DataFrame(nn_best_score_by_drug).T
score_by_nn = score_by_nn.reset_index()
score_by_nn = score_by_nn.rename(columns ={"index": "target_drug",
                                                0: "nn_train_f1_score",
                                                1: 'nn_test_f1_score',
                                                2: 'nn_train_recall_score',
                                                3: 'nn_test_recall_score',
                                                4: 'nn_train_precision_score',
                                                5: 'nn_test_precision_score'
                                   }).set_index("target_drug").T

class_results = pd.concat( [class_results, score_by_nn] ,axis=0)

Drug: Alcohol
Drug: Cannabis
Drug: Chocolate
Drug: Caffeine
Drug: Cocaine
Drug: Mushrooms
Drug: Nicotine


In [12]:
from sklearn.metrics import confusion_matrix  # Recommended method in sklearn 1.0

cm = pd.DataFrame(confusion_matrix(
    y_test['Alcohol'], dt_best_estimator['Alcohol'].predict(X_test), labels=[f"CL{i}" for i in range(0,7)]
))
cm.style.background_gradient(cmap='PuBu', low=0, high=1)

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,0,0,0,8,0
1,0,0,0,0,0,8,0
2,0,0,0,0,0,14,0
3,0,0,0,0,0,34,0
4,0,0,0,0,1,51,0
5,0,0,0,0,0,149,2
6,0,0,0,0,5,104,1


In [13]:
class_results.reset_index().style.background_gradient(cmap='PuBu', low=0, high=1)


target_drug,index,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
0,test_f1_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
1,train_f1_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041
2,test_precision_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
3,train_precision_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041
4,test_recall_score,0.2745,0.1777,0.3508,0.569,0.3647,0.3296,0.2069
5,train_recall_score,0.275,0.1577,0.3299,0.5612,0.3412,0.3256,0.2041
6,svc_train_f1_score,0.5052,0.5153,0.4695,0.5649,0.5824,0.569,0.5004
7,svc_test_f1_score,0.3034,0.264,0.2757,0.4799,0.4327,0.4052,0.2803
8,svc_train_recall_score,0.5052,0.5153,0.4695,0.5649,0.5824,0.569,0.5004
9,svc_test_recall_score,0.3034,0.264,0.2757,0.4799,0.4327,0.4052,0.2803


# Treating it as a regression problem

In [14]:
ranges = [i/7 for i in range(1,8)]
y_train_reg = y_train.replace({f"CL{i}": ranges[i] for i in range(7)})
y_test_reg = y_test.replace({f"CL{i}": ranges[i] for i in range(7)})

In [15]:
from sklearn.metrics import mean_squared_error
def get_class(num):
    intervals = [i/7 for i in range(1,8)]
    if num <= intervals[0]:
        return "CL0"
    elif num <= intervals[1]:
        return "CL1"
    elif num <= intervals[2]:
        return "CL2"
    elif num <= intervals[3]:
        return "CL3"
    elif num <= intervals[4]:
        return "CL4"
    elif num <= intervals[5]:
        return "CL5"
    else:
        return "CL6"
    
def get_regression_accuracy(estimator):
    #Tries to get a classification accuracy based on the quantitative metric predicted
    accuracy = {}
    error = make_scorer(mean_squared_error, greater_is_better=False)
    for drug in mappings['drugs']:
        preds = np.array(list(map(get_class, estimator[drug].predict(X_train))))
        res = (y_train[drug] == preds).value_counts()
        accuracy[drug] = res[True]/res.sum()
    return pd.DataFrame({"target_drug": list(accuracy.keys()),
                         "accuracy": list(accuracy.values()),
                         "train_error": [error(estimator[drug], X_train, y_train_reg[drug]) 
                                         for drug in mappings['drugs']],
                         "test_error": [error(estimator[drug], X_test, y_test_reg[drug]) 
                                         for drug in mappings['drugs']]
                        }), preds

## Dummy regressor

In [16]:
dummy_r_results = {}
dr_estimator = {}

# Get the mean accuracy for each drug
for drug in mappings['drugs']:
    dr = DummyRegressor(strategy='median')
    dr.fit(X_train, y_train_reg[drug])
    # dummy_r_results[drug] = dr.predict(X_train)
    dr_estimator[drug] = dr
    
output = get_regression_accuracy(dr_estimator)
reg_scores =  pd.DataFrame({"target_drug": mappings['drugs'],
                            # "dummy_accuracy": output[0]['accuracy'],
                            "dummy_train_error":abs( output[0]['train_error']),
                            "dummy_test_error": abs(output[0]['test_error'])}).set_index('target_drug').T
reg_scores

target_drug,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
dummy_train_error,0.038407,0.106412,0.024035,0.032453,0.07469,0.072768,0.120284
dummy_test_error,0.040708,0.107995,0.026038,0.024089,0.072322,0.072051,0.117252


## Creation of Ridge Regression pipeline

In [17]:
ridge_pipe =  make_pipeline(
    preprocessor, 
    Ridge(max_iter=1000)
)

param_dist = {"ridge__alpha": 10.0 ** np.arange(-5, 5, 1)}

# Save the best model and score for each drug
ridge_best_estimator = {}
ridge_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(ridge_pipe, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 10,
                                       cv = 5, 
                                       return_train_score = True,
                                       random_state = 522
                    )
    random_search.fit(X_train, y_train_reg[drug])
    ridge_best_estimator[drug] = random_search.best_estimator_
    
output = get_regression_accuracy(ridge_best_estimator)
ridge_results = pd.DataFrame({"target_drug": mappings['drugs'],
                            # "ridge_accuracy": output[0]['accuracy'],
                            "ridge_train_error": abs(output[0]['train_error']),
                            "ridge_test_error": abs(output[0]['test_error'])}).set_index("target_drug").T
reg_scores = pd.concat( [reg_scores, ridge_results] ,axis=0)

## NN regressor

In [20]:
from sklearn.neural_network import MLPRegressor
#Creation of NN pipeline
nn_pipe =  make_pipeline(
    preprocessor, 
    MLPRegressor((32,32,16),
                 learning_rate="invscaling",
                 random_state=522,
                 max_iter=5000,
                 batch_size=16,
                 learning_rate_init=0.0001)
)

# Save the best model and score for each drug
nn_best_estimator = {}
nn_best_score_by_drug = {}

for drug in mappings['drugs']: 
    nn_pipe.fit(X_train, y_train_reg[drug])
    nn_best_score_by_drug[drug] = [
                                ]
    nn_best_estimator[drug] = nn_pipe
    
output = get_regression_accuracy(nn_best_estimator)
nn_results = pd.DataFrame({"target_drug": mappings['drugs'],
                            # "ridge_accuracy": output[0]['accuracy'],
                            "nn_train_error": abs(output[0]['train_error']),
                            "nn_test_error": abs(output[0]['test_error'])}).set_index("target_drug").T
reg_scores = pd.concat( [reg_scores, nn_results] ,axis=0)


In [24]:
x_enc = preprocessor.fit_transform(X_train)
pd.DataFrame(x_enc, columns=preprocessor.get_feature_names_out()).corr().style.background_gradient(cmap = "PuOr")

Unnamed: 0,standardscaler__Neuroticism,standardscaler__Extraversion,standardscaler__Openness,standardscaler__Agreeableness,standardscaler__Conscientiousness,ordinalencoder__Age,ordinalencoder__Education,ordinalencoder__Impulsiveness,ordinalencoder__SensationSeeking,onehotencoder__Gender_Male
standardscaler__Neuroticism,1.0,-0.419846,-0.002448,-0.230351,-0.39329,-0.145798,-0.067213,0.17447,0.078918,-0.090144
standardscaler__Extraversion,-0.419846,1.0,0.265184,0.14532,0.307213,-0.02815,0.104393,0.127137,0.223371,-0.051465
standardscaler__Openness,-0.002448,0.265184,1.0,0.031497,-0.052453,-0.218567,0.099613,0.268531,0.419225,0.10969
standardscaler__Agreeableness,-0.230351,0.14532,0.031497,1.0,0.252305,0.071504,0.074752,-0.224166,-0.210723,-0.225166
standardscaler__Conscientiousness,-0.39329,0.307213,-0.052453,0.252305,1.0,0.17506,0.196166,-0.35395,-0.230692,-0.177939
ordinalencoder__Age,-0.145798,-0.02815,-0.218567,0.071504,0.17506,1.0,0.09538,-0.197012,-0.344975,-0.094555
ordinalencoder__Education,-0.067213,0.104393,0.099613,0.074752,0.196166,0.09538,1.0,-0.117474,-0.099501,-0.193472
ordinalencoder__Impulsiveness,0.17447,0.127137,0.268531,-0.224166,-0.35395,-0.197012,-0.117474,1.0,0.629139,0.168423
ordinalencoder__SensationSeeking,0.078918,0.223371,0.419225,-0.210723,-0.230692,-0.344975,-0.099501,0.629139,1.0,0.244582
onehotencoder__Gender_Male,-0.090144,-0.051465,0.10969,-0.225166,-0.177939,-0.094555,-0.193472,0.168423,0.244582,1.0


## Support Vector Regression

In [25]:
from sklearn.svm import SVR

svr_pipe =  make_pipeline(
    preprocessor, 
    SVR()
)

param_dist = {
    "svr__gamma": 10.0 ** np.arange(-4, 4),
    "svr__C": 10.0 ** np.arange(-4, 4)
}

# Save the best model and score for each drug
svr_best_estimator = {}
svr_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(svr_pipe, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 10,
                                       cv = 5, 
                                       return_train_score = True,
                                       random_state = 522
                    )
    random_search.fit(X_train, y_train_reg[drug])
    svr_best_estimator[drug] = random_search.best_estimator_
    
output = get_regression_accuracy(svr_best_estimator)
svr_results = pd.DataFrame({"target_drug": mappings['drugs'],
                            # "ridge_accuracy": output[0]['accuracy'],
                            "svr_train_error": abs(output[0]['train_error']),
                            "svr_test_error": abs(output[0]['test_error'])}).set_index("target_drug").T
reg_scores = pd.concat( [reg_scores, svr_results] ,axis=0)


## Decision Tree Regressor

In [27]:
from sklearn.tree import DecisionTreeRegressor

dr_pipe =  make_pipeline(
    preprocessor, 
    DecisionTreeRegressor(random_state=522)
)

param_dist = {
    "decisiontreeregressor__max_depth": [i for i in range(3, 20)],
}

# Save the best model and score for each drug
dr_best_estimator = {}
dr_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = GridSearchCV(dr_pipe, 
                                    #    param_distributions = param_dist,
                                    param_grid=param_dist,
                                       n_jobs = -1,
                                    #    n_iter = 20,
                                       cv = 5, 
                                       return_train_score = True
                    )
    random_search.fit(X_train, y_train_reg[drug])
    dr_best_estimator[drug] = random_search.best_estimator_
    
output = get_regression_accuracy(dr_best_estimator)
dr_results = pd.DataFrame({"target_drug": mappings['drugs'],
                            # "ridge_accuracy": output[0]['accuracy'],
                            "dr_train_error": abs(output[0]['train_error']),
                            "dr_test_error": abs(output[0]['test_error'])}).set_index("target_drug").T
reg_scores = pd.concat( [reg_scores, dr_results] ,axis=0)

In [28]:
reg_scores.style.background_gradient(cmap='BuPu_r', low=1,high=0)

target_drug,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
dummy_train_error,0.038407,0.106412,0.024035,0.032453,0.07469,0.072768,0.120284
dummy_test_error,0.040708,0.107995,0.026038,0.024089,0.072322,0.072051,0.117252
ridge_train_error,0.034105,0.060842,0.023724,0.026593,0.039098,0.03161,0.097838
ridge_test_error,0.037698,0.059446,0.025831,0.019697,0.038104,0.031011,0.095532
nn_train_error,0.103095,0.073253,0.126404,0.150067,0.149931,0.142792,0.079503
nn_test_error,0.121137,0.079613,0.136292,0.170755,0.149549,0.13574,0.105529
svr_train_error,0.034778,0.060095,0.023718,0.027341,0.039286,0.030821,0.095057
svr_test_error,0.037751,0.058992,0.026528,0.020657,0.038765,0.030633,0.098902
dr_train_error,0.033662,0.06903,0.022803,0.026303,0.039163,0.031343,0.100041
dr_test_error,0.037849,0.072815,0.026258,0.019955,0.039654,0.033366,0.098131


# Training a model with the orginal numbers from the dataset.

In [39]:
data_path = "../data/raw"

#Load data mappings
mappings = json.load(open("data_mapping.json", "r"))


df = pd.read_csv(os.path.join(data_path, "drug_consumption.data"), names=mappings['column_headers'])
df = df.drop(columns=mappings['drop'])

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(df.drop(columns = mappings['drugs']),
                                     df[mappings['drugs']],
                                     test_size=0.2)
   

## Dummy Classifier

In [40]:
dummy_results = {}
# Get the mean accuracy for each drug
for drug in mappings['drugs']: 
    dc_v2 = DummyClassifier(strategy="stratified")
    # dc_v2.fit(X_train_raw, y_train_raw[drug])
    dummy_results[drug] = pd.DataFrame(
                                cross_validate(dc_v2, 
                                               X_train_raw,
                                               y_train_raw[drug],
                                               cv = 5,
                                               return_train_score = True,
                                               scoring=scorers
                                )).mean().round(4)


raw_results = pd.DataFrame(dummy_results)

raw_results = raw_results.drop(index = ["fit_time", "score_time"]).T
raw_results = raw_results.reset_index()
raw_results = raw_results.rename(columns = {"index": "target_drug"})
raw_results = raw_results.set_index("target_drug").T

raw_results



target_drug,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
test_f1_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
train_f1_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024
test_precision_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
train_precision_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024
test_recall_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
train_recall_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024


## SVC Classifier

In [41]:
#column transformer
#Not using "drop" here as we dropped the columns after importing the dataset
preprocessor_v2 =  make_column_transformer(
        (StandardScaler(), mappings['numerical'] + mappings['ordinal']),
        (OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore'), mappings['categorical'])
    )

svc_pipe_v2 =  make_pipeline(
    preprocessor_v2, 
    SVC()
)

param_dist = {
    "svc__class_weight": ["balanced", None],
    "svc__gamma": 10.0 ** np.arange(-4, 4),
    "svc__C": 10.0 ** np.arange(-4, 4)
}

# Save the best model and score for each drug
raw_svc_best_estimator = {}
raw_svc_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(svc_pipe_v2, 
                                       param_distributions = param_dist,
                                       n_jobs = -1,
                                       n_iter = 10,
                                       cv = 3, 
                                       return_train_score = True,
                                       random_state = 522,
                                       scoring=scorers,
                                       refit='f1_score'
                    )
    random_search.fit(X_train_raw, y_train_raw[drug])
    raw_svc_best_estimator[drug] = random_search.best_estimator_
    raw_svc_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                    ]
    
svc_results = pd.DataFrame(raw_svc_best_score_by_drug).T
svc_results = svc_results.reset_index()
svc_results = svc_results.rename(columns = {"index": "target_drug",
                                                0: "svc_train_f1_score",
                                                1: 'svc_test_f1_score',
                                                2: 'svc_train_recall_score',
                                                3: 'svc_test_recall_score',
                                                4: 'svc_train_precision_score',
                                                5: 'svc_test_precision_score'
                                }).set_index("target_drug").T

raw_results = pd.concat( [raw_results, svc_results] ,axis=0)



## Decision Tree classifier

In [45]:
dt_pipe_v2 =  make_pipeline(
    preprocessor_v2, 
    DecisionTreeClassifier(random_state=522)
)

param_dist = {
    "decisiontreeclassifier__max_depth": [i for i in range(3, 20)],
}

# Save the best model and score for each drug
dt_best_estimator = {}
dt_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = GridSearchCV(dt_pipe_v2, 
                                    param_grid=param_dist,
                                       n_jobs = -1,
                                       cv = 5, 
                                       return_train_score = True,
                                       scoring=scorers,
                                       refit='f1_score'
                    )
    random_search.fit(X_train_raw, y_train_raw[drug])
    dt_best_estimator[drug] = random_search.best_estimator_
    dt_best_score_by_drug[drug] = [round(random_search.cv_results_['mean_train_f1_score'].mean(), 4), 
                                    round(random_search.cv_results_['mean_test_f1_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_recall_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_train_precision_score'].mean(), 4),
                                    round(random_search.cv_results_['mean_test_precision_score'].mean(), 4)
                                ]
    
dt_score_by_drug = pd.DataFrame(dt_best_score_by_drug).T
dt_score_by_drug = dt_score_by_drug.reset_index()
dt_score_by_drug = dt_score_by_drug.rename(columns = {"index": "target_drug",
                                                0: "dt_train_f1_score",
                                                1: 'dt_test_f1_score',
                                                2: 'dt_train_recall_score',
                                                3: 'dt_test_recall_score',
                                                4: 'dt_train_precision_score',
                                                5: 'dt_test_precision_score'
                                }).set_index("target_drug").T

raw_results = pd.concat( [raw_results, dt_score_by_drug] ,axis=0)



## NN classification

In [50]:
from sklearn.neural_network import MLPClassifier

#Creation of NN pipeline
nn_pipe =  make_pipeline(
    preprocessor_v2, 
    MLPClassifier((32,32,16),
                 learning_rate="invscaling",
                 random_state=522,
                 max_iter=5000,
                 batch_size=16,
    )
)

# Save the best model and score for each drug
nn_best_estimator = {}
nn_best_score_by_drug = {}

for drug in mappings['drugs']: 
    nn_pipe.fit(X_train_raw, y_train_raw[drug])
    
    nn_best_estimator[drug] = nn_pipe
    nn_best_score_by_drug[drug] = [round(f1_score(y_train_raw[drug], nn_pipe.predict(X_train_raw), average='weighted'), 4), 
                                    round(f1_score(y_test_raw[drug], nn_pipe.predict(X_test_raw), average='weighted'), 4),
                                    round(recall_score(y_train_raw[drug], nn_pipe.predict(X_train_raw), average='weighted'), 4),
                                    round(recall_score(y_test_raw[drug], nn_pipe.predict(X_test_raw), average='weighted'), 4),
                                    round(precision_score(y_train_raw[drug], nn_pipe.predict(X_train_raw), average='weighted', ), 4),
                                    round(precision_score(y_test_raw[drug], nn_pipe.predict(X_test_raw), average='weighted'), 4)
                                ]
    
score_by_nn = pd.DataFrame(nn_best_score_by_drug).T
score_by_nn = score_by_nn.reset_index()
score_by_nn = score_by_nn.rename(columns ={"index": "target_drug",
                                                0: "nn_train_f1_score",
                                                1: 'nn_test_f1_score',
                                                2: 'nn_train_recall_score',
                                                3: 'nn_test_recall_score',
                                                4: 'nn_train_precision_score',
                                                5: 'nn_test_precision_score'
                                   }).set_index("target_drug").T

raw_results = pd.concat( [raw_results, score_by_nn] ,axis=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
raw_results.reset_index().style.background_gradient(cmap='BuPu', low=0, high=1)

target_drug,index,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
0,test_f1_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
1,train_f1_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024
2,test_precision_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
3,train_precision_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024
4,test_recall_score,0.2752,0.1691,0.3468,0.5557,0.3793,0.3402,0.2109
5,train_recall_score,0.2681,0.1751,0.3471,0.5638,0.3621,0.3394,0.2024
6,svc_train_f1_score,0.4904,0.4981,0.5039,0.5956,0.5429,0.5482,0.4892
7,svc_test_f1_score,0.301,0.254,0.3172,0.5069,0.399,0.3961,0.272
8,svc_train_recall_score,0.4904,0.4981,0.5039,0.5956,0.5429,0.5482,0.4892
9,svc_test_recall_score,0.301,0.254,0.3172,0.5069,0.399,0.3961,0.272


In [59]:
x_enc = preprocessor_v2.fit_transform(X_train_raw)
pd.DataFrame(x_enc, columns=preprocessor_v2.get_feature_names_out()).corr().style.background_gradient(cmap = "BrBG")

Unnamed: 0,standardscaler__Neuroticism,standardscaler__Extraversion,standardscaler__Openness,standardscaler__Agreeableness,standardscaler__Conscientiousness,standardscaler__Age,standardscaler__Education,standardscaler__Impulsiveness,standardscaler__SensationSeeking,onehotencoder__Gender_0.48246
standardscaler__Neuroticism,1.0,-0.441351,0.002069,-0.204002,-0.380731,-0.109714,-0.098402,0.160635,0.06657,0.069124
standardscaler__Extraversion,-0.441351,1.0,0.245108,0.145109,0.299102,-0.051831,0.102547,0.111876,0.205145,0.057626
standardscaler__Openness,0.002069,0.245108,1.0,0.020134,-0.069453,-0.219905,0.055995,0.283078,0.427611,-0.115196
standardscaler__Agreeableness,-0.204002,0.145109,0.020134,1.0,0.221047,0.065368,0.097323,-0.221888,-0.215456,0.229081
standardscaler__Conscientiousness,-0.380731,0.299102,-0.069453,0.221047,1.0,0.162775,0.226963,-0.330749,-0.214537,0.179481
standardscaler__Age,-0.109714,-0.051831,-0.219905,0.065368,0.162775,1.0,0.139478,-0.171551,-0.328774,0.102709
standardscaler__Education,-0.098402,0.102547,0.055995,0.097323,0.226963,0.139478,1.0,-0.142943,-0.132109,0.222263
standardscaler__Impulsiveness,0.160635,0.111876,0.283078,-0.221888,-0.330749,-0.171551,-0.142943,1.0,0.619181,-0.160997
standardscaler__SensationSeeking,0.06657,0.205145,0.427611,-0.215456,-0.214537,-0.328774,-0.132109,0.619181,1.0,-0.230672
onehotencoder__Gender_0.48246,0.069124,0.057626,-0.115196,0.229081,0.179481,0.102709,0.222263,-0.160997,-0.230672,1.0


## KNN classification