This jupyter notebook is testing the performance of final model

In [80]:
import dill 
import joblib 
import pandas as pd 
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from category_encoders import HashingEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import SMOTE

#Scoring metric
scoring_metric = 'roc_auc'

#Targeted actions
target_actions = ['sub_car_claim_click', 'sub_car_claim_submit_click', 'sub_open_dialog_click', 'sub_custom_question_submit_click', 'sub_call_number_click', 'sub_callback_submit_click', 'sub_submit_success', 'sub_car_request_submit_click']
def target_func(df : pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['target_action'] = 0
    df.loc[df['event_action'].isin(target_actions), 'target_action'] = 1
    df['event_action'] = df['target_action']
    return df.drop(columns = 'target_action')

#Filtering data
def filter_data(df : pd.DataFrame) -> pd.DataFrame:
    cols_to_drop = ['device_model', 'event_value', 'visit_date', 'session_id', 'event_category', 'hit_type', 'hit_time', 'utm_keyword', 'client_id', 'device_brand']
    return df.drop(columns = cols_to_drop)

#Adding new features or replacing the existing ones with more advantageous ones
def add_features(df : pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["hit_date"] = df["hit_date"].astype("datetime64[ns]").dt.month
    return df

# Dropping rows with NaNs in specific columns
def dropna_specific(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna(subset=['device_os', 'hit_referer', 'event_label'])

#Filling up missing values utm_source, utm_campaign and utm_adcontent
def corr_filling_utms(df : pd.DataFrame) -> pd.DataFrame:
    columns_to_change = ['utm_source', 'utm_adcontent', 'utm_campaign']
    df = df.copy()
    def filler_function(target_column):
        filler_col = df.groupby(['utm_medium'])[target_column].value_counts().groupby(level = 0).nlargest(1).index

        medium_col = []
        target_col = []

        for i in filler_col:
            medium_col.append(i[1])
            target_col.append(i[2])

        medium_to_target_map = dict(zip(medium_col, target_col))
        df[target_column].fillna(df['utm_medium'].map(medium_to_target_map), inplace = True)
    for col in columns_to_change:
        filler_function(col)
    return df

#reducing the number of unique value in visit_time column
def categorize_visit_time(df : pd.DataFrame) -> pd.DataFrame : 
    import pandas as pd
    def categorize_time(hour):
        if 0 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 18:
            return "Afternoon"
        else:
            return "Evening"
    df['visit_time'] = pd.to_datetime(df['visit_time'], format='%H:%M:%S')    
    df['visit_time'] = df['visit_time'].dt.hour.apply(categorize_time)

    return df


#Dropping remnant NAN values in the dataset
def dropna_final(df : pd.DataFrame) -> pd.DataFrame:  
    return df.dropna()


#Data loading
df_hits = pd.read_csv(filepath_or_buffer = "./data/ga_hits.csv", low_memory = False)
df_session = pd.read_csv(filepath_or_buffer = './data/ga_sessions.csv', low_memory = False)
df = pd.merge(left = df_hits, right = df_session, on = "session_id")

#Balancing up data distribution
def balance(x, y):
    smote = SMOTE(random_state = 123)
    x_final, y_final = smote.fit_resample(X = x, y = y)
    return [x_final, y_final]

# Preprocess the entire DataFrame first
df = target_func(df)
df = filter_data(df)
df = dropna_specific(df)
df = corr_filling_utms(df)
df = dropna_final(df)
#df = add_features(df)
#df = categorize_visit_time(df)

X = df.drop(columns = 'event_action')
y = df['event_action']

df.head()

Unnamed: 0,hit_date,hit_number,hit_referer,hit_page_path,event_action,event_label,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,device_category,device_os,device_screen_resolution,device_browser,geo_country,geo_city
387,2021-12-23,105,FwdMTcXzWAwhtsnMAbhS,podpiska.sberauto.com/,0,VppGBKuEPJVLSsOLERbm,17:18:04,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,desktop,Windows,1536x864,Chrome,Russia,Ulyanovsk
396,2021-12-23,98,FwdMTcXzWAwhtsnMAbhS,podpiska.sberauto.com/,0,OHfyUfDKdPgBdvelDlfG,17:18:04,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,desktop,Windows,1536x864,Chrome,Russia,Ulyanovsk
456,2021-12-23,104,VloVXNWduHeTjUoDkjkO,sberauto.com/cars?yzclid=6771251748358633377&r...,0,uDrwlhtkwJJbRcbyRaTW,13:50:20,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,desktop,Windows,1920x1080,Chrome,Russia,Saint Petersburg
461,2021-12-23,10,VloVXNWduHeTjUoDkjkO,sberauto.com/cars?yzclid=6771251748358633377&r...,0,hAHqGICPFQiPwtzubOzs,13:50:20,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,desktop,Windows,1920x1080,Chrome,Russia,Saint Petersburg
462,2021-12-23,53,VloVXNWduHeTjUoDkjkO,sberauto.com/cars?yzclid=6771251748358633377&r...,0,pzAKUYdRKNEUXxxBFUPX,13:50:20,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,desktop,Windows,1920x1080,Chrome,Russia,Saint Petersburg


In [86]:
df[df['event_action'] == 1].iloc[0]

hit_date                                                           2021-12-23
hit_number                                                                 18
hit_referer                                              CENJYQocGncDfpQMoNnT
hit_page_path               sberauto.com/cars/all/skoda/rapid/bf24b977?ren...
event_action                                                                1
event_label                                              EsLbNNEnCkXWoaesnKlS
visit_time                                                           23:33:28
visit_number                                                                1
utm_source                                               QxAxdyPLuQMEcrdZWdWb
utm_medium                                                                cpc
utm_campaign                                             VBmazutCflYumtDHrQYe
utm_adcontent                                            JNHcPlZPxEMWDnRiyoBf
device_category                                                 

In [81]:
numerical_features = ['hit_date', 'hit_number', 'visit_number']
ohe_cols = ['visit_time', 'device_category', 'device_os', 'device_browser']
hash_cat = ['hit_referer', 'event_label', 'hit_page_path', 'utm_source', 'utm_campaign', 'utm_adcontent', 'device_screen_resolution', 'geo_country', 'geo_city', 'utm_medium']

#Feature engineering
numerical_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])

ohe_transformation = Pipeline(steps = [
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
])
hasher = Pipeline(steps = [
    ('hasher', HashingEncoder())
])

column_transformer = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_features),
    ('ohe_transformation', ohe_transformation, ohe_cols),
    ('hashing', hasher, hash_cat)
])

#preprecessor for pipelines
preprocessor = Pipeline(steps = [
    ('utm_filler', FunctionTransformer(corr_filling_utms)),
    ('feature_add', FunctionTransformer(add_features)),
    ('visit_time_categorizer', FunctionTransformer(categorize_visit_time)),
    ('column_transformer', column_transformer)
])



In [87]:
test_dict = {
    "hit_date" : "2021-12-23",
    "hit_number" : 18,
    "hit_referer" : "CENJYQocGncDfpQMoNnT",
    "hit_page_path" : 'sberauto.com/cars/all/skoda/rapid/bf24b977?rental_page=rental_car&fbclid=paaabzcjlk3yejxtayysx8tjzlfry2oqdd5o9rvw6ccdcukolrjk361gy-of0&external_browser_redirect=true',
    "event_label" : "EsLbNNEnCkXWoaesnKlS",
    "visit_time" : "23:33:28",
    "visit_number" : 1,
    "utm_source" : "QxAxdyPLuQMEcrdZWdWb",
    "utm_medium" : "cpc",
    "utm_campaign" : "VBmazutCflYumtDHrQYe",
    "utm_adcontent" : "JNHcPlZPxEMWDnRiyoBf",
    "device_category" : "mobile",
    "device_os" : "Android",
    "device_screen_resolution" : "393x851",
    "device_browser" : "Android Webview",
    "geo_country" : "Russia",
    "geo_city" : "Saint Petersburg",
}

In [88]:
df_test_final = pd.DataFrame(test_dict, index = [0])

In [82]:
models = [
    MLPClassifier(),
    RandomForestClassifier(),
]

best_score = .0
best_pipe = None 
for model in models:
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    score = cross_val_score(pipe, X, y, cv = 4, scoring = scoring_metric, error_score = 'raise')
    print(f'model : {type(model).__name__}, {scoring_metric}_mean:{score.mean():.4f}, {scoring_metric}_std : {score.std():.4f}')
    if score.mean() > best_score : 
        best_score = score.mean()
        best_pipe = pipe

print(f'best model: {type(best_pipe.named_steps["classifier"]).__name__}, {scoring_metric}: {best_score:.4f}')

#Fitting perfect pipeline for whole dataset
best_pipe.fit(X = X, y = y)

model : MLPClassifier, roc_auc_mean:0.7964, roc_auc_std : 0.0847
model : RandomForestClassifier, roc_auc_mean:0.8048, roc_auc_std : 0.1013
best model: RandomForestClassifier, roc_auc: 0.8048


In [72]:
list(test_dict.values())

[12,
 18,
 'CENJYQocGncDfpQMoNnT',
 'sberauto.com/cars/all/skoda/rapid/bf24b977?rental_page=rental_car&fbclid=paaabzcjlk3yejxtayysx8tjzlfry2oqdd5o9rvw6ccdcukolrjk361gy-of0&external_browser_redirect=true',
 'EsLbNNEnCkXWoaesnKlS',
 'Evening',
 1,
 'QxAxdyPLuQMEcrdZWdWb',
 'cpc',
 'VBmazutCflYumtDHrQYe',
 'JNHcPlZPxEMWDnRiyoBf',
 'mobile',
 'Android',
 '393x851',
 'Android Webview',
 'Russia',
 'Saint Petersburg']

In [89]:
best_pipe.predict(df_test_final)

array([1], dtype=int64)

In [None]:
model_filename = f'./models/user_action.pkl'
dill.dump({'model' : best_pipe,
    'metadata' :{
        'name' : 'User action predictor',
        'author' : 'Umidjon Sattorov',
        'version' : 1,
        'date' : datetime.now(),
        'type' : type(best_pipe.named_steps['classifier']).__name__,
        'accuracy' : best_score
    }
}, open('./models/user_action_predictor_1.pkl', 'wb'))

print(f'Model is saved as {model_filename} in models directory')