In [1]:
import pandas as pd
import dvc.api
import mlflow
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
path = 'data\AdSmartABdata.csv'
repo = 'https://github.com/SameC137/abtest-mlops'
rev = 'v1'
data_url = dvc.api.get_url(path=path, repo=repo,rev=rev)
if rev=="v1":
    collected_data = pd.read_csv(data_url)
else:
    collected_data = pd.read_csv(data_url,index_col=0)
collected_data

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,Samsung Internet,0,0


## Select only users with a response

In [3]:
responded=collected_data.loc[(collected_data["yes"]==1) | (collected_data["no"]==1)]



# Remove auction Id

In [4]:
features= responded.drop(["auction_id","no"],axis=1, inplace=False)
features=features.reset_index(drop=True)

In [5]:
y=features["yes"]

In [6]:


# lb = LabelEncoder() 

# features["experiment"]=lb.fit_transform(features["experiment"])

# features["device_make"]=lb.fit_transform(features["device_make"])

# features["browser"]=lb.fit_transform(features["browser"])

# features["date"]=lb.fit_transform(features["date"])

# features.drop("yes",axis=1,inplace=True)

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


def encode_scale_features(df,columns):
    lb=LabelEncoder()
    norm = MinMaxScaler()
    for i in columns:
        df[i]=lb.fit_transform(df[i])   
    norm_fit = norm.fit_transform(df)
    out=pd.DataFrame(norm_fit,columns=df.columns)
    return out

In [8]:
if rev=="v2":
    feat=["experiment","device_make","date"]
else:
    feat=["experiment","device_make","browser","date"]

features.drop("yes",axis=1,inplace=True)
X=encode_scale_features(features,feat)
    

In [9]:
X

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser
0,1.0,0.285714,0.086957,0.035088,1.0,0.285714
1,1.0,0.142857,0.695652,0.114035,1.0,0.142857
2,1.0,0.428571,0.347826,0.114035,1.0,0.142857
3,0.0,0.714286,0.173913,0.377193,1.0,0.571429
4,0.0,0.000000,0.652174,0.114035,1.0,0.142857
...,...,...,...,...,...,...
1238,1.0,0.285714,0.913043,0.114035,1.0,0.142857
1239,1.0,0.142857,0.043478,0.114035,1.0,0.142857
1240,0.0,0.857143,0.304348,0.114035,1.0,0.142857
1241,0.0,1.000000,0.695652,0.114035,1.0,0.142857


In [10]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.3, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.33, random_state=1)


In [11]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import mean_squared_error, accuracy_score,log_loss
# import mlflow
# import mlflow.sklearn


# from mlflow.models.signature import infer_signature
# from mlflow.utils.environment import _mlflow_conda_env


# model = LogisticRegression()
# with mlflow.start_run() as run:
#     model.fit(X_train y_train)
#     pred = cls.predict(X_test)
#     mlflow.log_metric(f"accuracy", kfold_scores.mean())
#     mlflow.log_metric(f"std_accuracy", kfold_scores.std())
#     print(mean_squared_error(y_test, pred))
#     print("Logged data and model in run {}".format(run.info.run_id))
    

In [12]:
# def eval_metrics(actual, pred):
#     rmse = np.sqrt(mean_squared_error(actual, pred))
#     mae = mean_absolute_error(actual, pred)
#     r2 = r2_score(actual, pred)
#     return rmse, mae, r2
    

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import mlflow
import mlflow.sklearn

from urllib.parse import urlparse

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import numpy as np


with mlflow.start_run(run_name='untuned_linear_regression'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="LinearRegressionModel")
    else:
        mlflow.sklearn.log_model(model, "model")






In [14]:

feature_importances = pd.DataFrame((model.coef_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)
# X_train.columns.tolist()

Unnamed: 0,importance
platform_os,0.53948
device_make,0.459148
experiment,0.149235
hour,-0.016258
date,-0.062368
browser,-0.081871


In [15]:
from xgboost import XGBClassifier


with mlflow.start_run(run_name='untuned_kgboost'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = XGBClassifier()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="XGBoost")
    else:
        mlflow.sklearn.log_model(model, "model")










In [16]:

from sklearn.tree import DecisionTreeClassifier


with mlflow.start_run(run_name='untuned_decisiontree'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="DecisionTreeClassifier")
    else:
        mlflow.sklearn.log_model(model, "model")

In [17]:
feature_importances = pd.DataFrame((model.feature_importances_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)
# X_train.columns.tolist()

Unnamed: 0,importance
hour,0.356108
device_make,0.237602
date,0.205865
experiment,0.107381
browser,0.093044
platform_os,0.0


In [18]:
class CreateModel:
    def __init__(self, X_train, X_test, y_train, y_test,data_version,name,model):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
        self.model=NULL
        self.featureImportance=[]
        self.name=name
        self.data_version=data_version
    def train(self,params):
        with mlflow.start_run(run_name=self.name):
            mlflow.log_param('data_version', this.data_version)
            feature_cols=pd.DataFrame(list(self.X_train.columns))
            feature_cols.to_csv('features.csv',header=False,index=False)
            mlflow.log_artifact("features.csv")

            model = self.model(**params)
            model.fit(self.X_train, self.y_train)

            pred = model.predict(self.X_test)
            rmse = np.sqrt(mean_squared_error(self.y_test, pred))
            loss=log_loss(self.y_test,pred)
            acc = accuracy_score(self.y_test, pred)


            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("log_loss", loss)

            mlflow.log_metric("accuracy", acc)

            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(model, "model", registered_model_name=self.name)
            else:
                mlflow.sklearn.log_model(model, "model")
    def trainKFold(self,folds,params):
        with mlflow.start_run(run_name=self.name):
            mlflow.log_param('data_version', this.data_version)
            feature_cols=pd.DataFrame(list(self.X_train.columns))
            feature_cols.to_csv('features.csv',header=False,index=False)
            mlflow.log_artifact("features.csv")
            
            kf=KFold(n_splits=folds, random_state=None)
            model = self.model(**params)
            model.fit(self.X_train, self.y_train)

            pred = model.predict(self.X_test)
            rmse = np.sqrt(mean_squared_error(self.y_test, pred))
            loss=log_loss(self.y_test,pred)
            acc = accuracy_score(self.y_test, pred)

            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("log_loss", loss)

            mlflow.log_metric("accuracy", acc)

            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(model, "model", registered_model_name=self.name)
            else:
                mlflow.sklearn.log_model(model, "model")
    
    def getFeatureImportance(self):
        feature_importances = pd.DataFrame((self.model.feature_importances_).transpose() , index=self.X_train.columns.tolist(), columns=['importance'])
        return feature_importances.sort_values('importance', ascending=False)

In [19]:
X_Cross, X_test, y_Cross, y_test = train_test_split(
   X, y, test_size=0.1, random_state=1)

In [20]:

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

with mlflow.start_run(run_name='kfold_decisiontree'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_Cross.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")
    
    kf = KFold(n_splits=5)

    model = DecisionTreeClassifier()
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))
    # model.fit(X_train, y_train)
    

    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)


    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="KfoldDecisionTreeClassifier")
    else:
        mlflow.sklearn.log_model(model, "model")

[17.577952180753382, 17.423739796916365, 14.494068606414288, 16.10790282589968, 16.417753691945173]


In [21]:
from xgboost import XGBClassifier


with mlflow.start_run(run_name='kfold_kgboost'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    kf = KFold(n_splits=5)

    model = XGBClassifier(use_label_encoder=False)
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))

    
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.xgboost.log_model(model, "model", registered_model_name="KfoldXGBoost")
    else:
        mlflow.xgboost.log_model(model, "model")

[17.42376478433606, 17.26957381828735, 14.648259572462996, 15.333487213469256, 15.643287880572949]


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import mlflow
import mlflow.sklearn

from urllib.parse import urlparse

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import numpy as np


with mlflow.start_run(run_name='kfold_linear_regression'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    kf = KFold(n_splits=5)

    model = LogisticRegression()
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))

    
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="KfoldLinearRegressionModel")
    else:
        mlflow.sklearn.log_model(model, "model")



[16.190069283271306, 16.4985511650475, 15.727532081439028, 16.107816770570867, 13.939459509915352]


In [23]:
# import findspark
# findspark.init()

# import pyspark

# from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
# from hyperopt.pyll import scope
# from math import exp
# import numpy as np
# import xgboost as xgb
 
# search_space = {
#   'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
#   'learning_rate': hp.loguniform('learning_rate', -3, 0),
#   'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
#   'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
#   'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
#   'objective': 'binary:logistic',
#   'seed': 123, # Set a seed for deterministic training
# }
 
# def train_model(params):
#   # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
#   mlflow.xgboost.autolog()
#   with mlflow.start_run(nested=True):
#     train = xgb.DMatrix(data=X_train, label=y_train)
#     test = xgb.DMatrix(data=X_test, label=y_test)
#     # Pass in the test set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric
#     # is no longer improving.
#     booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\
#                         evals=[(test, "test")], early_stopping_rounds=50)
   
#     pred = booster.predict(test)
#     rmse = np.sqrt(mean_squared_error(y_test, pred))
#     loss=log_loss(y_test,pred)
#     acc = accuracy_score(y_test, pred)
    
#     mlflow.log_metric("rmse", rmse)
#     mlflow.log_metric("log_loss", loss)
    
#     mlflow.log_metric("accuracy", acc)

 
#     signature = infer_signature(X_train, booster.predict(train))
#     mlflow.xgboost.log_model(booster, "model", registered_model_name="HyperXGBoost")
    
#     # Set the loss to log loss so fmin min the loss
#     return {'status': STATUS_OK, 'loss': -1*loss, 'booster': booster.attributes()}
 
# # Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. 
# # A reasonable value for parallelism is the square root of max_evals.
# spark_trials = SparkTrials(parallelism=10)
 
# # Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent
# # run called "xgboost_models" .
# with mlflow.start_run(run_name='hyper_xgboost'):
#   best_params = fmin(
#     fn=train_model, 
#     space=search_space, 
#     algo=tpe.suggest, 
#     max_evals=96,
#     trials=spark_trials, 
#     rstate=np.random.RandomState(123)
#   )

In [32]:

from sklearn.model_selection import GridSearchCV

mlflow.xgboost.autolog()

params = {
        'min_child_weight': [ 5,6,7,8,9 ,10],
        'gamma': [ 5,6,7,8,9],
        'subsample': [1.0, 1.2, 1.3],
        'colsample_bytree': [ 0.8, 0.9,1.0],
        'max_depth': [2,3,4]
        }

model = XGBClassifier(learning_rate=0.1, n_estimators=200, objective='binary:logistic', eval_metric="logloss",
                    silent=True, nthread=2)

cvFold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1,  cv=cvFold, scoring="neg_log_loss")
with mlflow.start_run(run_name='hyperparam_xgboost') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=log_loss(y_test,pred)
        acc = accuracy_score(y_test, pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("log_loss", loss)

        mlflow.log_metric("accuracy", acc)



bestModel = searchResults.best_estimator_



              colsample_bynode=None, colsample_bytree=None,
              eval_metric='logloss', gamma=None, gpu_id=None,
              importance_type='gain', interaction_constr...`
 -0.69017963         nan         nan -0.69017963         nan         nan
 -0.69017963         nan         nan -0.69017963         nan         nan
 -0.68957294         nan         nan -0.68952651         nan         nan
 -0.68966313         nan         nan -0.68965646         nan         nan
 -0.68965646         nan         nan -0.68965646         nan         nan
 -0.69010603         nan         nan -0.68982155         nan         nan
 -0.68971938         nan         nan -0.68971271         nan         nan
 -0.68971271         nan         nan -0.68971271         nan         nan
 -0.69058112         nan         nan -0.69058112         nan         nan
 -0.69062605         nan         nan -0.69062605         nan         nan
 -0.69062605         nan         nan -0.69062605         nan         nan
 -0.69047353  

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021/07/24 18:47:03 INFO mlflow.sklearn.utils: Logging the 5 best runs, 805 runs will be omitted.


In [25]:

mlflow.sklearn.autolog()

params = {
        'criterion': ['gini','entropy'],
        'max_depth':[4,5,6,7,8,9,10]
        }

model = DecisionTreeClassifier()

cvFold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1,  cv=cvFold, scoring="neg_log_loss")
with mlflow.start_run(run_name='hyperparam_decision_tree') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=log_loss(y_test,pred)
        acc = accuracy_score(y_test, pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("log_loss", loss)

        mlflow.log_metric("accuracy", acc)



bestModel = searchResults.best_estimator_
# print("Log Loss: {:.2f}".format(loss))

# mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")


# tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
# if tracking_url_type_store != "file":
#         mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")
# else:
#         mlflow.xgboost.log_model(model, "model")



2021/07/24 18:31:09 INFO mlflow.sklearn.utils: Logging the 5 best runs, 9 runs will be omitted.


In [26]:

mlflow.sklearn.autolog()

params = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}

model = LogisticRegression()

cvFold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1,  cv=cvFold, scoring="neg_log_loss")
with mlflow.start_run(run_name='hyperparam_logistic_regression') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=log_loss(y_test,pred)
        acc = accuracy_score(y_test, pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("log_loss", loss)

        mlflow.log_metric("accuracy", acc)



bestModel = searchResults.best_estimator_
# print("Log Loss: {:.2f}".format(loss))

# mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")


# tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
# if tracking_url_type_store != "file":
#         mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")
# else:
#         mlflow.xgboost.log_model(model, "model")



         nan -0.69506129         nan -0.69594087         nan -0.69609918
         nan -0.69611652]
2021/07/24 18:31:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, 9 runs will be omitted.


In [34]:
logged_model = 'runs:/c352bb377e77478c83d7b4367a1f6ce2/best_estimator'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Predict on a Pandas DataFrame.
# loaded_model.predict(pd.DataFrame(data))


feature_importances = pd.DataFrame((loaded_model.coef_).transpose() , index=X.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
hour,0.004296
experiment,0.003759
device_make,0.002148
platform_os,0.001155
date,0.000813
browser,-0.003061


In [30]:
#Decision Tree
logged_model = 'runs:/aaebc2dc3fdc4a2181345f81eebb7ace/best_estimator'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Predict on a Pandas DataFrame.
# loaded_model.predict(pd.DataFrame(data))


feature_importances = pd.DataFrame((loaded_model.feature_importances_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
device_make,0.428198
browser,0.294645
hour,0.240644
experiment,0.036513
date,0.0
platform_os,0.0


In [30]:
#Decision Tree
logged_model = 'runs:/aaebc2dc3fdc4a2181345f81eebb7ace/best_estimator'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

# Predict on a Pandas DataFrame.
# loaded_model.predict(pd.DataFrame(data))


feature_importances = pd.DataFrame((loaded_model.feature_importances_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
device_make,0.428198
browser,0.294645
hour,0.240644
experiment,0.036513
date,0.0
platform_os,0.0


In [33]:
logged_model = 'runs:/514249d44d01476db05cd9819ad12228/best_estimator'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)



feature_importances = pd.DataFrame((loaded_model.feature_importances_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
hour,0.438062
browser,0.384917
device_make,0.177021
experiment,0.0
date,0.0
platform_os,0.0
