# Random Forest Classifier

In [120]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix  # Recommended method in sklearn 1.0
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [124]:
data_path = "../data/processed"

#Load data mappings
mappings = json.load(open("data_mapping.json", "r"))

train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
# df = pd.read_csv(os.path.join(data_path, "drug_consumption.data"), names=mappings['column_headers'])
# df = df.drop(columns=mappings['drop'])
# for drug in mappings['drugs']:
#     df[drug] = df[drug].replace({"CL0": "C0",                           
#                                 "CL1": "C0",
#                                 "CL2": "C0",
#                                 "CL3": "C1",
#                                 "CL4": "C1",
#                                 "CL5": "C2",
#                                 "CL6": "C2"})

# X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = mappings['drugs']),
#                                      df[mappings['drugs']],
#                                      test_size=0.2)
X_train = train_df.drop(columns=mappings['drugs'])
y_train = train_df[mappings['drugs']]
X_test = test_df.drop(columns=mappings['drugs'])
y_test = test_df[mappings['drugs']]


In [125]:
preprocessor =  make_column_transformer(
        (make_pipeline(
            PolynomialFeatures(degree=3),
            StandardScaler()
        ), mappings['numerical'] + mappings['ordinal']),
        (OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore'), mappings['categorical'])
    )

In [126]:
#Scoring functions to use
scorers = {
            'f1_score': make_scorer(f1_score, average='weighted', zero_division=True),
            # 'precision_score': make_scorer(precision_score, average='weighted',zero_division=True),
            # 'recall_score': make_scorer(recall_score, average='weighted', zero_division=True),
            # 'accuracy_score': make_scorer(accuracy_score)
          }

In [127]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)
    mean_scores = pd.DataFrame(scores).round(4).mean()
    std_scores = pd.DataFrame(scores).round(4).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [129]:
results = {}
dummy_cv_results = {}
# Get the mean accuracy for each drug
for drug in mappings['drugs']: 
    results[drug] = {}
    dc = DummyClassifier(strategy="most_frequent")
    # dc.fit(X_train, y_train[drug])
    dummy_cv_results[drug] = mean_std_cross_val_scores( dc, 
                                                        X_train,
                                                        y_train[drug],
                                                        return_train_score = True,
                                                        n_jobs=-1,
                                                        scoring=scorers) 
results = pd.DataFrame(dummy_cv_results)
results = results.drop(index = ["fit_time", "score_time"]).T
results = results.reset_index()
results = results.rename(columns = {"index": "target_drug"})

results = results.set_index("target_drug").T

In [130]:
results

target_drug,Alcohol,Cannabis,Chocolate,Caffeine,Cocaine,Mushrooms,Nicotine
test_f1_score,0.542 (+/- 0.003),0.305 (+/- 0.002),0.701 (+/- 0.002),0.835 (+/- 0.003),0.681 (+/- 0.002),0.675 (+/- 0.003),0.263 (+/- 0.002)
train_f1_score,0.542 (+/- 0.001),0.305 (+/- 0.001),0.701 (+/- 0.000),0.835 (+/- 0.001),0.681 (+/- 0.001),0.675 (+/- 0.001),0.263 (+/- 0.001)


In [132]:
rf_pipe =  make_pipeline(
    # preprocessor,
    RandomForestClassifier(min_samples_leaf=2,
                           n_jobs=-1,
                           random_state=522)
)

param_dist = {
    "randomforestclassifier__max_depth": [i for i in range(3, 35, 2)],
    "randomforestclassifier__max_features": [i for i in range(6, 15, 2)],
    "randomforestclassifier__class_weight": ["balanced", "balanced_subsample", None],
    "randomforestclassifier__n_estimators": [i for i in range(50, 250, 10)]
}

# Save the best model and score for each drug
rf_best_estimator = {}
rf_best_score_by_drug = {}

for drug in mappings['drugs']: 
    random_search = RandomizedSearchCV(rf_pipe, 
                                param_distributions=param_dist,
                                n_jobs = -1,
                                n_iter=30,
                                cv = 3, 
                                return_train_score = True,
                                scoring=scorers,
                                refit='f1_score')
    
    random_search.fit(X_train, y_train[drug])
    rf_best_estimator[drug] = random_search.best_estimator_
    
    rf_best_score_by_drug[drug] = mean_std_cross_val_scores(random_search,
                              X_train,
                              y_train[drug],
                              cv=3,
                              n_jobs=-1,
                              return_train_score=True)
    
    
rf_score_by_drug = pd.DataFrame(rf_best_score_by_drug)
rf_score_by_drug = rf_score_by_drug.drop(index = ["fit_time", "score_time"]).T
rf_score_by_drug = rf_score_by_drug.reset_index()
rf_score_by_drug = rf_score_by_drug.rename(columns = {"index": "target_drug"})
rf_score_by_drug = rf_score_by_drug.set_index("target_drug").T
rf_score_by_drug.index = ['rf_test_f1_score', 'rf_train_f1_score']

    
# results = pd.concat([results, rf_score_by_drug])

In [109]:
import json
from sklearn.model_selection import train_test_split
mappings = json.load(open("data_mapping.json", "r"))
for key, values in mappings["categories"].items():
    #The float values are keys and are henced saved as string values in the JSON object
    #Hence we are casting them back to float here
    values = {float(k):v for k,v in values.items()}
    df[key] = df.replace({key:values})[key]

train_df, test_df = train_test_split(df, train_size=0.8, random_state=522)    

In [110]:
train_df.to_csv(os.path.join("t.csv"), index=False)

In [111]:
preprocessor =  make_column_transformer(
                        (StandardScaler(), mappings['numerical']),
                        (OrdinalEncoder(categories = [
                            list(mappings['categories']['Age'].values()),
                            list(mappings['categories']['Education'].values()),
                            list(mappings['categories']['Impulsiveness'].values()),
                            list(mappings['categories']['SensationSeeking'].values()),
                        ]), mappings['ordinal']),
                        (OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore'), mappings['categorical']),
                        ("drop", mappings['drop']))

In [112]:
preprocessor.fit(train_df)
X_test_enc = pd.DataFrame(preprocessor.transform(test_df), 
                          columns=preprocessor.get_feature_names_out())
y_train = train_df[mappings['drugs']]
y_test = test_df[mappings['drugs']]

In [113]:
y_test.replace({"CL0": "C0",
                "CL1": "C0",
                "CL2": "C0",
                "CL3": "C1",
                "CL4": "C1",
                "CL5": "C2",
                "CL6": "C2"},
                inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.replace({"CL0": "C0",
