# Conversion Rate Project

## Project description

# Imports

In [106]:
# Data
import pandas as pd
import numpy as np

# Technical
import os
import typing
from IPython.display import clear_output
import time

# Plotting
import plotly.graph_objects as go
import plotly.express as px

# ML
import optuna

In [107]:
# Read our files into 3 separate dataframes: the labelled training set, the
# test features and the test labels.
# Paths are built with os.path.join because a literal backslash is only a
# path separator on Windows; the joined form resolves on any OS.
train_data = pd.read_csv(os.path.join("data", "conversion_data_train.csv"))

test_data_x = pd.read_csv(os.path.join("data", "conversion_data_test.csv"))
test_data_y = pd.read_csv(os.path.join("data", "conversion_data_test_labels.csv"))

# EDA

In [108]:
# Preview the first five training rows
train_data.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [109]:
# Number of rows per value of the `converted` target
converted_counts = train_data['converted'].value_counts()
fig = px.bar(converted_counts)
fig.show()

In [110]:
# Mean of the `converted` column, shown as a one-column frame
converted_mean = train_data[['converted']].mean()
converted_mean.to_frame()

Unnamed: 0,0
converted,0.032258


In [111]:
# Summary statistics for every column, numeric and categorical alike
summary_stats = train_data.describe(include='all')
summary_stats

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,284580,284580.0,284580.0,284580,284580.0,284580.0
unique,4,,,3,,
top,US,,,Seo,,
freq,160124,,,139477,,
mean,,30.564203,0.685452,,4.873252,0.032258
std,,8.266789,0.464336,,3.341995,0.176685
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0


In [112]:
# One small bar chart of value counts per column
for col, column_values in train_data.items():
    fig = px.bar(
        column_values.value_counts(),
        title=col,
        width=600,
        height=300,
    )
    fig.show()

In [113]:
# Plot pairwise relationships on a random subset of rows.
# The sample is seeded so the figure is reproducible across runs, and capped
# at the frame length so a smaller dataset does not make `.sample` raise.
scatter_rows = train_data.sample(n=min(10000, len(train_data)), random_state=0)
px.scatter_matrix(scatter_rows)

In [114]:
# Count missing values per column
missing_per_column = train_data.isnull().sum()
missing_per_column

country                0
age                    0
new_user               0
source                 0
total_pages_visited    0
converted              0
dtype: int64

Looks like we might have some age outliers. Let's take a look and deal with them.

In [115]:
# Show the five highest ages in the training data
ages_descending = train_data['age'].sort_values(ascending=False)
display(ages_descending.head().to_frame())

# Keep only the rows whose age is below 100 (train_data itself is not modified)
is_plausible_age = train_data['age'] < 100
train_data_x = train_data.loc[is_plausible_age].reset_index(drop=True)

Unnamed: 0,age
233196,123
11331,111
230590,79
268311,77
175251,73


In [116]:
# Split train data into x/y.
# The age < 100 filter is applied here: splitting straight from `train_data`
# would put the outlier rows (ages 111 and 123) back into the training set.
train_data_clean = train_data[train_data['age'] < 100].reset_index(drop=True)
train_data_x = train_data_clean.drop('converted', axis=1)
train_data_y = train_data_clean[['converted']]

# Preprocessing

In [117]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer

# Column names split by dtype: numeric columns vs. object (string) columns
n_feats = list(train_data_x.select_dtypes(include=np.number).columns)
c_feats = list(train_data_x.select_dtypes(include=object).columns)

# Numeric columns: standard scaling only
n_trans = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encoding, dropping the first level of each column
c_trans = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', n_trans, n_feats),
    ('cat', c_trans, c_feats),
])

# The preprocessor is fitted on the training features only and then reused
# unchanged on the test features; labels are flattened to 1-D arrays.
x_train = preprocessor.fit_transform(train_data_x)
y_train = np.ravel(train_data_y.copy().to_numpy())
x_test = preprocessor.transform(test_data_x)
y_test = np.ravel(test_data_y.copy().to_numpy())

# Model evaluation

### What models should we look at?
- Logistic Regression
- SVMs
- Random Forests
- Ensemble models (AdaBoost, Gradient Boost, Voting Classifiers and Stacking Classifiers)

### How are we going to evaluate our models?

For each model, we will run an Optuna study (Optuna is a hyperparameter-optimization library) over its key hyperparameters. We will evaluate the models using their F1 scores on the test set.

In [118]:
from sklearn.metrics import f1_score

# Number of trials without progress to tolerate before a study is stopped.
# Used as the default `trial_thresh` of ModelOptimizer and passed to
# StopWhenNoProgress, which stops a study once its stall count exceeds this value.
stall_thresh = 100

## Optuna Hyperparameter Optimization

### Model Optimizer

In [119]:
def random_sample(arr: np.ndarray, n: int, seed: int = 0, random: bool = False) -> np.ndarray:
    """Return `n` entries of `arr` drawn without replacement.

    The draw uses a private ``RandomState`` rather than reseeding NumPy's
    global generator, so calling this function no longer changes the random
    stream seen by the rest of the notebook. For a given `seed` the selected
    indices are the same as with ``np.random.seed(seed)`` followed by
    ``np.random.choice``.

    Args:
        arr: Array to sample from; indexed along its first axis.
        n: Number of entries to draw. ``0`` means "all of them", which
            returns every entry of `arr` in a shuffled order.
        seed: Seed used when `random` is False. Two calls with the same seed
            and the same `len(arr)`/`n` pick the same indices, which is what
            keeps separately sampled features and labels aligned.
        random: If True, ignore `seed` and seed from fresh entropy. Separate
            calls then pick different indices, so do not use this to sample
            features and labels independently.

    Returns:
        The selected entries of `arr`.

    Raises:
        ValueError: If `n` is larger than ``len(arr)`` (raised by ``choice``).
    """
    # RandomState(None) seeds from OS entropy, mirroring np.random.seed()
    rng = np.random.RandomState(None if random else seed)

    if n == 0:
        n = len(arr)

    return arr[rng.choice(len(arr), size=n, replace=False)]

def objective(trial, model, params, sample_size=0):
    """Optuna objective: build `model` from suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to suggest each hyperparameter value.
        model: Callable (estimator class) invoked with the suggested
            hyperparameters as keyword arguments.
        params: Iterable of ``(name, kind, spec)`` triples. `kind` is
            ``'int'`` or ``'float'`` (`spec` is a ``(low, high)`` pair; int
            bounds are cast with ``int``) or ``'cat'`` (`spec` is the list of
            choices).
        sample_size: Number of training rows to fit on, forwarded to
            ``random_sample`` (``0`` selects all rows).

    Returns:
        F1 score of the fitted model on the module-level ``x_test``/``y_test``.
        NOTE(review): this means hyperparameters are selected against the test set.

    Raises:
        ValueError: If a triple has an unsupported `kind`.
    """
    def suggest_value(name, kind, spec):
        # One guard clause per supported kind; anything else is rejected
        if kind == 'int':
            return trial.suggest_int(name, int(spec[0]), int(spec[1]))
        if kind == 'float':
            return trial.suggest_float(name, spec[0], spec[1])
        if kind == 'cat':
            return trial.suggest_categorical(name, spec)
        raise ValueError(f"Unsupported parameter type: {kind}")

    model_params = {}
    for name, param_type, param_range in params:
        model_params[name] = suggest_value(name, param_type, param_range)

    # Features and labels are sampled by two separate calls to random_sample
    # with identical arguments apart from the array
    x_fit = random_sample(x_train, sample_size)
    y_fit = random_sample(y_train, sample_size)

    clf = model(**model_params)
    clf.fit(x_fit, y_fit)

    return f1_score(y_test, clf.predict(x_test))

# It is hard to pick a good fixed number of trials for every model, so this optuna callback
# stops a study once more than `threshold` trials have passed without progress.
class StopWhenNoProgress:
    """Optuna callback that stops a study when it stops improving.

    Higher trial values are treated as better, so this is meant for studies
    that maximise their objective.

    A trial counts as progress when its value is at least the study's best
    value and differs from the value that triggered the previous reset. Every
    other trial is a stall: a worse value, a tie with a best value that has
    already been counted, or a trial without a value. Counting repeated ties
    as stalls is what lets a study that keeps returning the same score stop.

    Args:
        threshold: Number of stalled trials to tolerate; the study is stopped
            once the stall count exceeds it.
    """

    def __init__(self, threshold: int):
        self.threshold = threshold
        self._consecutive_stall_count = 0
        # Best value that last reset the counter; None until the first reset
        self._last_reset_value = None

    def __call__(self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial") -> None:
        value = trial.value

        # `value is not None` is checked first so that a trial without a value
        # never reaches the comparisons or the `study.best_value` lookup
        made_progress = (
            value is not None
            and value >= study.best_value
            and value != self._last_reset_value
        )

        if made_progress:
            self._consecutive_stall_count = 0
            self._last_reset_value = study.best_value
        else:
            self._consecutive_stall_count += 1

        # Stop once over threshold
        if self._consecutive_stall_count > self.threshold:
            study.stop()

In [120]:
class ModelOptimizer():
    """Tune one estimator class with an optuna study, then refit and score it.

    Intended call order: ``find_best`` -> ``eval_best`` -> ``log_best``.

    Args:
        model: Estimator class (called with hyperparameters as keyword
            arguments to build an estimator).
        trial_thresh: Stall threshold handed to ``StopWhenNoProgress``.
    """

    def __init__(self, model, trial_thresh=stall_thresh):
        self.model = model
        # Classes expose their bare name as __name__; other callables fall
        # back to the last dotted component of their string form.
        self.model_name = (
            getattr(model, "__name__", None)
            or str(model).split(".")[-1].replace(">", "").replace("'", '')
        )
        self._trial_thresh = trial_thresh

    def find_best(self, model_params, sample_size=0):
        """Run a maximising optuna study over `model_params`.

        Args:
            model_params: Search space as ``(name, kind, spec)`` triples,
                forwarded to ``objective``.
            sample_size: Training sample size forwarded to ``objective``.

        Stores the study in ``self.study`` and its best parameters in
        ``self.best_params``, and prints a one-line summary.
        """
        self.study = optuna.create_study(direction='maximize')
        self.study.optimize(
            lambda trial: objective(
                trial,
                self.model,
                model_params,
                sample_size),
            callbacks=[StopWhenNoProgress(self._trial_thresh)]
        )

        self.best_params = self.study.best_params
        print(f"Achieved best value {round(self.study.best_value, 6)} at trial {self.study.best_trial.number} with paramaters {self.study.best_params}.")

    def eval_best(self) -> float:
        """Refit the model with the best parameters on all of ``x_train``/``y_train``.

        Returns:
            F1 score of the refitted model on ``x_test``/``y_test``; also
            stored in ``self.best_score``.

        Raises:
            AttributeError: If ``find_best`` has not been run yet.
        """
        if not hasattr(self, "best_params"):
            raise AttributeError("find_best() must be called before eval_best()")

        self.best_model = self.model(**self.best_params)
        test_pred = self.best_model.fit(x_train, y_train).predict(x_test)
        self.best_score = f1_score(y_test, test_pred)

        return self.best_score

    def log_best(self) -> list:
        """Return ``[model_name, best_score, best_params]`` for the results table.

        Raises:
            AttributeError: If ``eval_best`` has not been run yet.
        """
        if not hasattr(self, "best_score"):
            raise AttributeError("eval_best() must be called before log_best()")

        return [self.model_name, self.best_score, self.best_params]

In [121]:
# Results table: one row per evaluated model
model_scores = pd.DataFrame(columns=['model', 'f1', 'params'])

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier

# Search spaces, one list per model. Each entry is (hyperparameter name, kind, spec):
# 'int'/'float' take a (low, high) pair, 'cat' takes the list of choices.
# Single-choice 'cat' entries pin a hyperparameter to a fixed value.
lr_params = [
    ('penalty', 'cat', [None, 'l2']),
    ('C', 'float', (0.01, 1e5)),
    ('tol', 'float', (1e-6, 1e-2)),
]

rf_params = [
    ('n_estimators', 'int', (1, 1e3)),
    ('max_depth', 'int', (1, 1e3)),
    ('min_samples_split', 'int', (2, 1e3)),
]

ada_params = [
    ('estimator', 'cat', [LogisticRegression(C=2000, tol=1e-2)]),
    ('n_estimators', 'int', (1, 1e3)),
    ('learning_rate', 'float', (0.01, 10)),
    ('algorithm', 'cat', ['SAMME']),
    ('random_state', 'cat', [0]),
]

xgb_params = [
    ('objective', 'cat', ['binary:logistic']),
    ('booster', 'cat', ['gblinear']),
    ('n_estimators', 'int', (10, 1e3)),
    ('learning_rate', 'float', (0.01, 10)),
    ('scale_pos_weight', 'float', (1, 10)),
]

# Models to tune with ModelOptimizer: class -> (search space, training sample size).
# A sample size of 0 means the study fits on all training rows.
models = {
    LogisticRegression: (lr_params, 0),
    RandomForestClassifier: (rf_params, 20000),
    AdaBoostClassifier: (ada_params, 10000),
    XGBClassifier: (xgb_params, 0),
}

# Tune each model, refit it with its best parameters, and append its score row
for model, (search_space, train_sample_size) in models.items():
    model_opt = ModelOptimizer(model)
    model_opt.find_best(model_params=search_space, sample_size=train_sample_size)
    model_opt.eval_best()
    model_scores.loc[len(model_scores)] = model_opt.log_best()

display(model_scores.sort_values(by='f1', ascending=False))

[I 2024-09-22 14:41:55,164] A new study created in memory with name: no-name-9180b24e-6392-4d94-815a-47389b02712c

Setting penalty=None will ignore the C and l1_ratio parameters

[I 2024-09-22 14:41:55,342] Trial 0 finished with value: 0.7373626373626374 and parameters: {'penalty': None, 'C': 334.59693147506204, 'tol': 0.0011808967058263727}. Best is trial 0 with value: 0.7373626373626374.

Setting penalty=None will ignore the C and l1_ratio parameters

[I 2024-09-22 14:41:55,551] Trial 1 finished with value: 0.7301762114537445 and parameters: {'penalty': None, 'C': 50056.82612349168, 'tol': 0.002040273928276092}. Best is trial 0 with value: 0.7373626373626374.
[I 2024-09-22 14:41:55,699] Trial 2 finished with value: 0.7044025157232704 and parameters: {'penalty': 'l2', 'C': 75311.12048732062, 'tol': 0.003266107065462766}. Best is trial 0 with value: 0.7373626373626374.

Setting penalty=None will ignore the C and l1_ratio parameters

[I 2024-09-22 14:41:55,853] Trial 3 finished with val

Achieved best value 0.754615 at trial 12 with paramaters {'penalty': 'l2', 'C': 99402.48498709069, 'tol': 3.082633193055766e-05}.


[I 2024-09-22 14:42:30,192] A new study created in memory with name: no-name-b65449f0-9639-43aa-9f48-d876fbe26026
[I 2024-09-22 14:42:34,657] Trial 0 finished with value: 0.6509711595055915 and parameters: {'n_estimators': 935, 'max_depth': 749, 'min_samples_split': 991}. Best is trial 0 with value: 0.6509711595055915.
[I 2024-09-22 14:42:35,629] Trial 1 finished with value: 0.7105985722130698 and parameters: {'n_estimators': 191, 'max_depth': 331, 'min_samples_split': 397}. Best is trial 1 with value: 0.7105985722130698.
[I 2024-09-22 14:42:38,238] Trial 2 finished with value: 0.6931246506428173 and parameters: {'n_estimators': 537, 'max_depth': 669, 'min_samples_split': 560}. Best is trial 1 with value: 0.7105985722130698.
[I 2024-09-22 14:42:41,815] Trial 3 finished with value: 0.7128065395095368 and parameters: {'n_estimators': 694, 'max_depth': 681, 'min_samples_split': 330}. Best is trial 3 with value: 0.7128065395095368.
[I 2024-09-22 14:42:41,908] Trial 4 finished with value: 0

Achieved best value 0.74514 at trial 76 with paramaters {'n_estimators': 418, 'max_depth': 563, 'min_samples_split': 28}.


[I 2024-09-22 14:48:35,193] A new study created in memory with name: no-name-3f82a3ba-8887-4a62-b661-19c96f39a0de

Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains LogisticRegression(C=2000, tol=0.01) which is of type LogisticRegression.

[I 2024-09-22 14:48:41,285] Trial 0 finished with value: 0.0 and parameters: {'estimator': LogisticRegression(C=2000, tol=0.01), 'n_estimators': 831, 'learning_rate': 6.052701609867911, 'algorithm': 'SAMME', 'random_state': 0}. Best is trial 0 with value: 0.0.

Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains LogisticRegression(C=2000, tol=0.01) which is of type LogisticRegression.

[I 2024-09-22 14:48:48,114] Trial 1 finished with value: 0.0 and parameters: {'estimator': LogisticRegression(C=2000, tol=0.01), 'n_estimators': 907, 'learning_rate': 6.095842407153918, 'algorithm': 'SAMME', 'random_sta

Achieved best value 0.753049 at trial 185 with paramaters {'estimator': LogisticRegression(C=2000, tol=0.01), 'n_estimators': 606, 'learning_rate': 0.045413042007599456, 'algorithm': 'SAMME', 'random_state': 0}.


[I 2024-09-22 15:08:59,545] A new study created in memory with name: no-name-7b6da68e-78c1-462e-ba9f-90030605c9d9
[I 2024-09-22 15:09:04,066] Trial 0 finished with value: 0.0 and parameters: {'objective': 'binary:logistic', 'booster': 'gblinear', 'n_estimators': 377, 'learning_rate': 7.2493987454776425, 'scale_pos_weight': 1.202414769982584}. Best is trial 0 with value: 0.0.
[I 2024-09-22 15:09:05,287] Trial 1 finished with value: 0.0 and parameters: {'objective': 'binary:logistic', 'booster': 'gblinear', 'n_estimators': 95, 'learning_rate': 7.204257614301512, 'scale_pos_weight': 2.366295460172433}. Best is trial 0 with value: 0.0.
[I 2024-09-22 15:09:09,700] Trial 2 finished with value: 0.0 and parameters: {'objective': 'binary:logistic', 'booster': 'gblinear', 'n_estimators': 379, 'learning_rate': 7.055898816659709, 'scale_pos_weight': 2.9992761767235927}. Best is trial 0 with value: 0.0.
[I 2024-09-22 15:09:20,620] Trial 3 finished with value: 0.0 and parameters: {'objective': 'bina

Achieved best value 0.767551 at trial 52 with paramaters {'objective': 'binary:logistic', 'booster': 'gblinear', 'n_estimators': 683, 'learning_rate': 0.06510772560860068, 'scale_pos_weight': 1.5552333264214817}.


Unnamed: 0,model,f1,params
3,XGBClassifier,0.767551,"{'objective': 'binary:logistic', 'booster': 'g..."
0,LogisticRegression,0.754615,"{'penalty': 'l2', 'C': 99402.48498709069, 'tol..."
1,RandomForestClassifier,0.754045,"{'n_estimators': 418, 'max_depth': 563, 'min_s..."
2,AdaBoostClassifier,0.753564,"{'estimator': LogisticRegression(C=2000, tol=0..."


### Stacking Classification

In [122]:
# Tuned hyperparameters of each base learner, read from the results table
stacking_params = {
    name: model_scores[model_scores['model'] == name]['params'].iloc[0]
    for name in ('LogisticRegression', 'XGBClassifier', 'RandomForestClassifier')
}

# Stack the three tuned models (final estimator left at the StackingClassifier default)
model = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(**stacking_params['LogisticRegression'])),
        ('xgb', XGBClassifier(**stacking_params['XGBClassifier'])),
        ('rf', RandomForestClassifier(**stacking_params['RandomForestClassifier'])),
    ],
)

model.fit(x_train, y_train)
test_pred = model.predict(x_test)

# Score once and reuse the value for both the printout and the results table
stacking_f1 = f1_score(y_test, test_pred)
print(f"Final score: {stacking_f1}")

model_scores.loc[len(model_scores)] = ['StackingClassifier', stacking_f1, None]

Final score: 0.7592891760904685


In [124]:
# Save the results table with a relative path instead of a user-specific
# absolute Windows path, so the notebook can be run on any machine.
# NOTE(review): this writes to the current working directory — confirm that
# is the project folder the original absolute path pointed to.
model_scores.to_csv("model_scores.csv")

### Voting Classification

In [123]:
def voting_objective(trial):
    """Optuna objective for a VotingClassifier over the three tuned base models.

    Suggests one weight in [0, 1] per base model and the voting mode
    ('soft' or 'hard'), fits the ensemble on the training data and returns
    its F1 score on ``x_test``/``y_test``.

    Args:
        trial: Optuna trial used to suggest the weights and the voting mode.

    Returns:
        F1 score of the fitted ensemble on the test set.
    """
    def tuned_params(model_name):
        # Hyperparameters stored in the results table for `model_name`
        return model_scores[model_scores['model'] == model_name]['params'].iloc[0]

    # Suggestion order matters to the trial: weights first, voting mode after
    weights = [trial.suggest_float(key, 0, 1) for key in ('w1', 'w2', 'w3')]

    base_learners = [
        ('lr', LogisticRegression(**tuned_params('LogisticRegression'))),
        ('xgb', XGBClassifier(**tuned_params('XGBClassifier'))),
        ('rf', RandomForestClassifier(**tuned_params('RandomForestClassifier'))),
    ]
    voting_mode = trial.suggest_categorical('voting', ['soft', 'hard'])

    ensemble = VotingClassifier(
        estimators=base_learners,
        voting=voting_mode,
        weights=weights,
    )

    # 0 asks random_sample for every training row
    sample_size = 0
    ensemble = ensemble.fit(
        random_sample(x_train, sample_size),
        random_sample(y_train, sample_size),
    )

    return f1_score(y_test, ensemble.predict(x_test))

# Search voting weights and mode until the study stalls
study = optuna.create_study(direction='maximize')
study.optimize(
    voting_objective,
    callbacks=[StopWhenNoProgress(stall_thresh)],
)

best_voting_trial = study.best_trial
print(
    f"Our best F1 score was {round(best_voting_trial.value, 6)}"
    f" with these paramaters: {best_voting_trial.params}",
)

# Best weights in estimator order (lr, xgb, rf)
best_weights = [best_voting_trial.params[key] for key in ('w1', 'w2', 'w3')]

# Tuned hyperparameters of each base learner, read from the results table
voting_params = {
    name: model_scores[model_scores['model'] == name]['params'].iloc[0]
    for name in ('LogisticRegression', 'XGBClassifier', 'RandomForestClassifier')
}

# Rebuild the ensemble with the best trial's settings and fit it on all training data
model = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(**voting_params['LogisticRegression'])),
        ('xgb', XGBClassifier(**voting_params['XGBClassifier'])),
        ('rf', RandomForestClassifier(**voting_params['RandomForestClassifier'])),
    ],
    voting=best_voting_trial.params['voting'],
    weights=best_weights,
)
model.fit(x_train, y_train)
test_pred = model.predict(x_test)

# Score once and reuse the value for both the printout and the results table
voting_f1 = f1_score(y_test, test_pred)
print(f"Final score: {voting_f1}")

model_scores.loc[len(model_scores)] = [
    'VotingClassifier',
    voting_f1,
    {'weights': list(best_weights)},
]

[I 2024-09-22 15:28:56,340] A new study created in memory with name: no-name-3a7826c4-fe85-4dbd-969e-b47becee1953
[I 2024-09-22 15:29:41,089] Trial 0 finished with value: 0.7529538131041891 and parameters: {'w1': 0.7403701618377905, 'w2': 0.4512293592320388, 'w3': 0.9902192522268771, 'voting': 'soft'}. Best is trial 0 with value: 0.7529538131041891.
[I 2024-09-22 15:30:25,627] Trial 1 finished with value: 0.7568145376803849 and parameters: {'w1': 0.5093621436547283, 'w2': 0.7036673962244822, 'w3': 0.6947017625593918, 'voting': 'soft'}. Best is trial 1 with value: 0.7568145376803849.
[I 2024-09-22 15:31:09,735] Trial 2 finished with value: 0.7551240560949298 and parameters: {'w1': 0.8906482205030998, 'w2': 0.19431989760085056, 'w3': 0.212631824189633, 'voting': 'soft'}. Best is trial 1 with value: 0.7568145376803849.
[I 2024-09-22 15:31:54,343] Trial 3 finished with value: 0.7544522396114409 and parameters: {'w1': 0.11918506554704855, 'w2': 0.3155293355226507, 'w3': 0.7832781060999878, 

KeyboardInterrupt: 

In [104]:
# Final leaderboard, best F1 first
leaderboard = model_scores.sort_values('f1', ascending=False)
leaderboard

Unnamed: 0,model,f1,params
3,XGBClassifier,0.768025,"{'objective': 'binary:logistic', 'booster': 'g..."
7,VotingClassifier,0.768025,"{'weights': [0.719683742126398, 0.980334097089..."
5,VotingClassifier,0.764953,"{'weights': [0.07878693996328957, 0.7574443745..."
6,VotingClassifier,0.758289,"{'weights': [0.7044868976963848, 0.15713050290..."
4,StackingClassifier,0.75725,
0,LogisticRegression,0.754615,"{'penalty': None, 'C': 7995.340359038978, 'tol..."
1,RandomForestClassifier,0.749458,"{'n_estimators': 17, 'max_depth': 886, 'min_sa..."
2,AdaBoostClassifier,0.749367,"{'estimator': LogisticRegression(C=2000, tol=0..."


# Save the model and re-evaluate on new data (SMOTE, etc.)

Improvements:
- Better implementations of imbalanced-learning techniques
- Adjust the stall threshold according to trial duration