In [16]:
import pandas as pd
import dvc.api
import mlflow
import matplotlib.pyplot as plt

import seaborn as sns

In [43]:
# Location of the DVC-tracked CSV inside the repository. A forward slash is
# used because the previous 'data\AdSmartABdata.csv' contained the invalid
# escape sequence "\A" and only resolved on Windows.
path = 'data/AdSmartABdata.csv'
repo = 'https://github.com/SameC137/abtest-mlops'
# Data revision to load; it is also logged as the `data_version` run parameter
# by the training cell.
rev = 'v3'
# Ask DVC for the storage URL of the file at the requested revision.
data_url = dvc.api.get_url(path=path, repo=repo,rev=rev)

# The first CSV column is used as the row index.
collected_data = pd.read_csv(data_url,index_col=0)
collected_data

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,Samsung Internet,0,0


## Select only users with a response

In [45]:
# Keep only the rows where the user actually answered (either "yes" or "no" is 1).
responded = collected_data.loc[
    collected_data["yes"].eq(1) | collected_data["no"].eq(1)
]



# Remove the auction ID and the `no` column

In [50]:
# Drop the identifier and the "no" column, then renumber the rows from 0 so
# the index no longer carries the gaps left by the response filter.
features = (
    responded
    .drop(columns=["auction_id", "no"])
    .reset_index(drop=True)
)

In [47]:
# Target vector: the "yes" response column of the feature frame.
y = features.loc[:, "yes"]

In [51]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()

# Replace each non-numeric column with integer codes. The same encoder object
# is refit for every column, so after the loop `lb` only holds the classes of
# the last column ("date").
for column in ["experiment", "device_make", "browser", "date"]:
    features[column] = lb.fit_transform(features[column])

# Remove the target from the inputs. Rebinding with errors="ignore" (instead
# of an in-place drop) lets this cell be re-run without a KeyError once "yes"
# has already been removed.
features = features.drop(columns="yes", errors="ignore")

In [55]:
def encode_scale_features(df,columns):
    """Label-encode the given columns of ``df`` and return the frame.

    Each listed column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``. The frame is modified in place and the
    same object is returned.

    NOTE(review): despite the name, no scaling is performed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be encoded.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the listed columns encoded.
    """
    lb=LabelEncoder()
    for column in columns:
        # Select by column name: `df.loc[column]` would look up a *row* label
        # and either raise KeyError or encode the wrong axis.
        df[column]=lb.fit_transform(df[column])
    return df

In [52]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler used to rescale every feature column.
norm = MinMaxScaler()
# Learn the per-column ranges and apply them in one step.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling — consider fitting on
# the training split only.
norm_fit = norm.fit_transform(features)

# The scaler output carries no column labels; restore them from `features`.
X = pd.DataFrame(data=norm_fit, columns=features.columns)

In [53]:
from sklearn.model_selection import train_test_split


# First hold out 30% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
# ... then carve 33% of the remaining rows off as a validation set
# (roughly 47% train / 23% validation / 30% test overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=1,
)


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import mean_squared_error, accuracy_score,log_loss
# import mlflow
# import mlflow.sklearn


# from mlflow.models.signature import infer_signature
# from mlflow.utils.environment import _mlflow_conda_env


# model = LogisticRegression()
# with mlflow.start_run() as run:
#     model.fit(X_train y_train)
#     pred = cls.predict(X_test)
#     mlflow.log_metric(f"accuracy", kfold_scores.mean())
#     mlflow.log_metric(f"std_accuracy", kfold_scores.std())
#     print(mean_squared_error(y_test, pred))
#     print("Logged data and model in run {}".format(run.info.run_id))
    

In [None]:
# def eval_metrics(actual, pred):
#     rmse = np.sqrt(mean_squared_error(actual, pred))
#     mae = mean_absolute_error(actual, pred)
#     r2 = r2_score(actual, pred)
#     return rmse, mae, r2
    

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import mlflow
import mlflow.sklearn

from urllib.parse import urlparse

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import numpy as np


# Train a default-parameter LogisticRegression and record the data version,
# feature list, metrics and fitted model in a single MLflow run.
# NOTE(review): the run name and registered model name say "linear
# regression" although the estimator is LogisticRegression; the strings are
# left unchanged so existing runs / registry entries keep matching.
with mlflow.start_run(run_name='untuned_linear_regression'):

    mlflow.log_param('data_version', rev)
    # Persist the ordered feature names as an artifact of the run.
    feature_cols=pd.DataFrame(list(X_train.columns))
    feature_cols.to_csv('features.csv',header=False,index=False)
    mlflow.log_artifact("features.csv")

    model = LogisticRegression()
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    # Log loss must be computed from predicted probabilities; feeding it the
    # hard 0/1 predictions only yields a clipped restatement of the error rate.
    proba = model.predict_proba(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, pred))
    # `labels` pins the column order of `proba` to the classes the model learned.
    loss=log_loss(y_test, proba, labels=model.classes_)
    acc = accuracy_score(y_test, pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("log_loss", loss)

    mlflow.log_metric("accuracy", acc)

    # Only request a registry entry when the tracking URI is not a plain
    # local file store.
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(model, "model", registered_model_name="LinearRegressionModel")
    else:
        mlflow.sklearn.log_model(model, "model")




