In [16]:
import pandas as pd
import dvc.api
import mlflow
import matplotlib.pyplot as plt

import seaborn as sns

In [71]:
path = 'data\AdSmartABdata.csv'
repo = 'https://github.com/SameC137/abtest-mlops'
rev = 'v2'
data_url = dvc.api.get_url(path=path, repo=repo,rev=rev)

collected_data = pd.read_csv(data_url,index_col=0)
collected_data

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,0,0
...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,0,0


## Select only users with a response

In [73]:
responded=collected_data.loc[(collected_data["yes"]==1) | (collected_data["no"]==1)]



# Remove auction Id

In [74]:
features= responded.drop(["auction_id","no"],axis=1, inplace=False)
features=features.reset_index(drop=True)

In [75]:
y=features["yes"]

In [51]:


# lb = LabelEncoder() 

# features["experiment"]=lb.fit_transform(features["experiment"])

# features["device_make"]=lb.fit_transform(features["device_make"])

# features["browser"]=lb.fit_transform(features["browser"])

# features["date"]=lb.fit_transform(features["date"])

# features.drop("yes",axis=1,inplace=True)

In [79]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


def encode_scale_features(df,columns):
    lb=LabelEncoder()
    norm = MinMaxScaler()
    for i in columns:
        df[i]=lb.fit_transform(df[i])   
    norm_fit = norm.fit_transform(df)
    out=pd.DataFrame(norm_fit,columns=df.columns)
    return out

In [82]:
if rev=="v2":
    feat=["experiment","device_make","date"]
elif rev=="v3":
    feat=["experiment","device_make","browser","date"]
else:
    feat=["experiment","device_make","browser","plaform_os","date"]

features=encode_scale_features(features,feat)
    

In [52]:


# creating scaler scale var.
# norm = MinMaxScaler()
# fit the scal
# norm_fit = norm.fit_transform(features)

# X=pd.DataFrame(norm_fit,columns=features.columns)

In [83]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.3, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.33, random_state=1)


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import mean_squared_error, accuracy_score,log_loss
# import mlflow
# import mlflow.sklearn


# from mlflow.models.signature import infer_signature
# from mlflow.utils.environment import _mlflow_conda_env


# model = LogisticRegression()
# with mlflow.start_run() as run:
#     model.fit(X_train y_train)
#     pred = cls.predict(X_test)
#     mlflow.log_metric(f"accuracy", kfold_scores.mean())
#     mlflow.log_metric(f"std_accuracy", kfold_scores.std())
#     print(mean_squared_error(y_test, pred))
#     print("Logged data and model in run {}".format(run.info.run_id))
    

In [None]:
# def eval_metrics(actual, pred):
#     rmse = np.sqrt(mean_squared_error(actual, pred))
#     mae = mean_absolute_error(actual, pred)
#     r2 = r2_score(actual, pred)
#     return rmse, mae, r2
    

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import mlflow
import mlflow.sklearn

from urllib.parse import urlparse

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import numpy as np


with mlflow.start_run(run_name='untuned_linear_regression'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.
        (model, "model", registered_model_name="LinearRegressionModel")
    else:
        mlflow.sklearn.log_model(model, "model")






In [67]:

feature_importances = pd.DataFrame((model.coef_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)
# X_train.columns.tolist()

Unnamed: 0,importance
device_make,0.364569
experiment,0.154695
hour,-0.00377
browser,-0.062687
date,-0.063641


In [84]:
from xgboost import XGBClassifier


with mlflow.start_run(run_name='untuned_kgboost'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = XGBClassifier()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="XGBoost")
    else:
        mlflow.sklearn.log_model(model, "model")










In [89]:

from sklearn.tree import DecisionTreeClassifier


with mlflow.start_run(run_name='untuned_decisiontree'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    loss=log_loss(y_val,pred)
    acc = accuracy_score(y_val, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="DecisionTreeClassifier")
    else:
        mlflow.sklearn.log_model(model, "model")

In [87]:
feature_importances = pd.DataFrame((model.feature_importances_).transpose() , index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)
# X_train.columns.tolist()

Unnamed: 0,importance
hour,0.345331
device_make,0.23705
date,0.211202
experiment,0.106343
browser,0.100074


In [88]:
class CreateModel:
    def __init__(self, X_train, X_test, y_train, y_test,data_version,name,model):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
        self.model=NULL
        self.featureImportance=[]
        self.name=name
        self.data_version=data_version
    def train(self,params):
        with mlflow.start_run(run_name=self.name):
            mlflow.log_param('data_version', this.data_version)
            feature_cols=pd.DataFrame(list(self.X_train.columns))
            feature_cols.to_csv('features.csv',header=False,index=False)
            mlflow.log_artifact("features.csv")

            model = self.model(**params)
            model.fit(self.X_train, self.y_train)

            pred = model.predict(self.X_test)
            rmse = np.sqrt(mean_squared_error(self.y_test, pred))
            loss=log_loss(self.y_test,pred)
            acc = accuracy_score(self.y_test, pred)


            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("log_loss", loss)

            mlflow.log_metric("accuracy", acc)

            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(model, "model", registered_model_name=self.name)
            else:
                mlflow.sklearn.log_model(model, "model")
    def trainKFold(self,folds,params):
        with mlflow.start_run(run_name=self.name):
            mlflow.log_param('data_version', this.data_version)
            feature_cols=pd.DataFrame(list(self.X_train.columns))
            feature_cols.to_csv('features.csv',header=False,index=False)
            mlflow.log_artifact("features.csv")
            
            kf=KFold(n_splits=folds, random_state=None)
            model = self.model(**params)
            model.fit(self.X_train, self.y_train)

            pred = model.predict(self.X_test)
            rmse = np.sqrt(mean_squared_error(self.y_test, pred))
            loss=log_loss(self.y_test,pred)
            acc = accuracy_score(self.y_test, pred)

            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("log_loss", loss)

            mlflow.log_metric("accuracy", acc)

            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(model, "model", registered_model_name=self.name)
            else:
                mlflow.sklearn.log_model(model, "model")
    
    def getFeatureImportance(self):
        feature_importances = pd.DataFrame((self.model.feature_importances_).transpose() , index=self.X_train.columns.tolist(), columns=['importance'])
        return feature_importances.sort_values('importance', ascending=False)

SyntaxError: invalid syntax (<ipython-input-88-e5323cbb8e77>, line 1)

In [95]:
X_Cross, X_test, y_Cross, y_test = train_test_split(
   X, y, test_size=0.1, random_state=1)



In [100]:

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

with mlflow.start_run(run_name='kfold_decisiontree'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_Cross.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")
    
    kf = KFold(n_splits=5)

    model = DecisionTreeClassifier()
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))
    # model.fit(X_train, y_train)
    

    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)


    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="KfoldDecisionTreeClassifier")
    else:
        mlflow.sklearn.log_model(model, "model")

[17.423754075441902, 17.269548830867656, 14.648259572462996, 15.798130843905588, 16.262871286586833]


In [105]:
from xgboost import XGBClassifier


with mlflow.start_run(run_name='kfold_kgboost'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    kf = KFold(n_splits=5)

    model = XGBClassifier(use_label_encoder=False)
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))

    
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.xgboost.log_model(model, "model", registered_model_name="KfoldXGBoost")
    else:
        mlflow.xgboost.log_model(model, "model")

[17.42376478433606, 17.26957381828735, 14.185679535054103, 15.333487213469256, 15.488412646492005]


In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import mlflow
import mlflow.sklearn

from urllib.parse import urlparse

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import numpy as np


with mlflow.start_run(run_name='kfold_linear_regression'):
    
    mlflow.log_param('data_version', rev)
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    kf = KFold(n_splits=5)

    model = LogisticRegression()
    scores=[]
    for train_index, test_index in kf.split(X_Cross):
        X_train, X_val, y_train, y_val = X_Cross.iloc[train_index], X_Cross.iloc[test_index], y_Cross.iloc[train_index], y_Cross.iloc[test_index]
        model.fit(X_train, y_train)
        predict_valid=model.predict(X_val)
        valid_loss=log_loss(y_val,predict_valid)
        scores.append(valid_loss)
    print(scores)
    
    mlflow.log_metric("avergae_validation_log_loss",np.mean( scores))

    
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)

    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)
    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="KfoldLinearRegressionModel")
    else:
        mlflow.sklearn.log_model(model, "model")



[16.498451215368725, 16.498544025784728, 15.727532081439028, 16.107816770570867, 15.023654275617263]


In [116]:
import findspark
findspark.init()

import pyspark

from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
from math import exp
import numpy as np
import xgboost as xgb
 
search_space = {
  'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
  'learning_rate': hp.loguniform('learning_rate', -3, 0),
  'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
  'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
  'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
  'objective': 'binary:logistic',
  'seed': 123, # Set a seed for deterministic training
}
 
def train_model(params):
  # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
  mlflow.xgboost.autolog()
  with mlflow.start_run(nested=True):
    train = xgb.DMatrix(data=X_train, label=y_train)
    test = xgb.DMatrix(data=X_test, label=y_test)
    # Pass in the test set so xgb can track an evaluation metric. XGBoost terminates training when the evaluation metric
    # is no longer improving.
    booster = xgb.train(params=params, dtrain=train, num_boost_round=1000,\
                        evals=[(test, "test")], early_stopping_rounds=50)
   
    pred = booster.predict(test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    loss=log_loss(y_test,pred)
    acc = accuracy_score(y_test, pred)
    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)
    
    mlflow.log_metric("accuracy", acc)

 
    signature = infer_signature(X_train, booster.predict(train))
    mlflow.xgboost.log_model(booster, "model", registered_model_name="HyperXGBoost")
    
    # Set the loss to log loss so fmin min the loss
    return {'status': STATUS_OK, 'loss': -1*loss, 'booster': booster.attributes()}
 
# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep. 
# A reasonable value for parallelism is the square root of max_evals.
spark_trials = SparkTrials(parallelism=10)
 
# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent
# run called "xgboost_models" .
with mlflow.start_run(run_name='hyper_xgboost'):
  best_params = fmin(
    fn=train_model, 
    space=search_space, 
    algo=tpe.suggest, 
    max_evals=96,
    trials=spark_trials, 
    rstate=np.random.RandomState(123)
  )

ValueError: Couldn't find Spark, make sure SPARK_HOME env is set or Spark is in an expected location (e.g. from homebrew installation).

In [127]:

from sklearn.model_selection import GridSearchCV

mlflow.xgboost.autolog()

params = {
        'min_child_weight': [ 5,6,7,8,9 ,10],
        'gamma': [ 5,6,7,8,9],
        'subsample': [1.0, 1.2, 1.3],
        'colsample_bytree': [ 0.8, 0.9,1.0],
        'max_depth': [2,3,4]
        }

model = XGBClassifier(learning_rate=0.1, n_estimators=200, objective='binary:logistic', eval_metric="logloss",
                    silent=True, nthread=2)

cvFold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1,  cv=cvFold, scoring="neg_log_loss")
with mlflow.start_run(run_name='hyperparam_xgboost') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=log_loss(y_test,pred)
        acc = accuracy_score(y_test, pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("log_loss", loss)

        mlflow.log_metric("accuracy", acc)



bestModel = searchResults.best_estimator_
# print("Log Loss: {:.2f}".format(loss))

# mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")


# tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
# if tracking_url_type_store != "file":
#         mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")
# else:
#         mlflow.xgboost.log_model(model, "model")



              colsample_bynode=None, colsample_bytree=None,
              eval_metric='logloss', gamma=None, gpu_id=None,
              importance_type='gain', interaction_constr...`
 -0.69021544         nan         nan -0.69021544         nan         nan
 -0.69021544         nan         nan -0.69021544         nan         nan
 -0.68948511         nan         nan -0.68944185         nan         nan
 -0.68957847         nan         nan -0.68957847         nan         nan
 -0.68957847         nan         nan -0.68957847         nan         nan
 -0.68999959         nan         nan -0.68972647         nan         nan
 -0.68964839         nan         nan -0.68962378         nan         nan
 -0.68962378         nan         nan -0.68962378         nan         nan
 -0.6907612          nan         nan -0.6907612          nan         nan
 -0.6907612          nan         nan -0.6907612          nan         nan
 -0.6907612          nan         nan -0.6907612          nan         nan
 -0.69047353  

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021/07/24 12:01:37 INFO mlflow.sklearn.utils: Logging the 5 best runs, 805 runs will be omitted.


In [129]:

mlflow.sklearn.autolog()

params = {
        'criterion': ['gini','entropy'],
        'max_depth':[4,5,6,7,8,9,10]
        }

model = DecisionTreeClassifier()

cvFold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1,  cv=cvFold, scoring="neg_log_loss")
with mlflow.start_run(run_name='hyperparam_decision_tree') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=log_loss(y_test,pred)
        acc = accuracy_score(y_test, pred)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("log_loss", loss)

        mlflow.log_metric("accuracy", acc)



bestModel = searchResults.best_estimator_
# print("Log Loss: {:.2f}".format(loss))

# mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")


# tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
# if tracking_url_type_store != "file":
#         mlflow.xgboost.log_model(bestModel, "model", registered_model_name="HyperXGBoost")
# else:
#         mlflow.xgboost.log_model(model, "model")



2021/07/24 12:14:28 INFO mlflow.sklearn.utils: Logging the 5 best runs, 9 runs will be omitted.
