# Importing Libraries

In [2]:
import os
import warnings
import sys

#import dvc.api
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from fast_ml.model_development import train_valid_test_split
from sklearn.tree import DecisionTreeClassifier
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import logging

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [4]:
# Load the AdSmart CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/AdSmartABdata.csv')

# Exploring The Categorical Columns

In [6]:
# Keep only the rows where one of the two response flags is set
# (`yes` equals 1 or `no` equals 1); every other row is left out.
response_filter = 'yes == 1 | no == 1'
relevant_rows = df.query(expr=response_filter)

In [7]:
# Count how many rows carry each platform_os value (value_counts orders the
# result from most to least frequent) and keep the two leading values.
platform_counts = relevant_rows['platform_os'].value_counts()
top2_platform = platform_counts.index[:2].to_list()
top2_platform

[6, 5]

In [8]:
# Build one subset of relevant_rows per entry of top2_platform, in the same
# order, so position 0 holds the most frequent platform and position 1 the next.
platform_list = [
    relevant_rows.loc[relevant_rows['platform_os'] == os_code]
    for os_code in top2_platform
]
# The names mirror the codes these positions held when the notebook was run
# (6 first, then 5); the assignment is positional, not a lookup by code.
platform_six, platform_five = platform_list[0], platform_list[1]


In [9]:
# Write each platform subset to its own CSV file under ../data. to_csv is
# called with its defaults, so the row index is written as the first column.
for subset, csv_path in (
    (platform_six, '../data/platform_six.csv'),
    (platform_five, '../data/platform_five.csv'),
):
    subset.to_csv(csv_path)

In [10]:
# Count how many rows carry each browser value (value_counts orders the
# result from most to least frequent) and keep the two leading values.
browser_counts = relevant_rows['browser'].value_counts()
top2_browser = browser_counts.index[:2].to_list()
top2_browser

['Chrome Mobile', 'Chrome Mobile WebView']

In [11]:
# Build one subset of relevant_rows per entry of top2_browser, in the same
# order, so position 0 holds the most frequent browser and position 1 the next.
browser_list = [
    relevant_rows.loc[relevant_rows['browser'] == browser_name]
    for browser_name in top2_browser
]
# The names mirror the browsers these positions held when the notebook was run
# ('Chrome Mobile' first, then 'Chrome Mobile WebView'); the assignment is
# positional, not a lookup by browser name.
chrome_mobile_data, chrome_mobile_webview_data = browser_list[0], browser_list[1]

In [12]:
# Write each browser subset to its own CSV file under ../data. to_csv is
# called with its defaults, so the row index is written as the first column.
for subset, csv_path in (
    (chrome_mobile_data, '../data/chrome_mobile_data.csv'),
    (chrome_mobile_webview_data, '../data/chrome_mobile_webview_data.csv'),
):
    subset.to_csv(csv_path)


In [18]:
# Work on a copy of relevant_rows without the `no` column, which leaves
# `yes` as the only response column.
df2 = relevant_rows.drop(columns='no')

In [19]:
# Display the frame to confirm that the `no` column is no longer present.
df2

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0
...,...,...,...,...,...,...,...,...
8059,ffa08ff9-a132-4051-aef5-01a9c79367bc,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1
8063,ffb176df-ecd2-45d3-b05f-05b173a093a7,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1
8064,ffb79718-6f25-4896-b6b3-e58b80a6e147,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0
8069,ffca1153-c182-4f32-9e90-2a6008417497,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0


In [20]:
# Rename the `yes` column to `clicked_or_not`. The renamed frame is re-bound
# to df2 instead of being mutated with inplace=True, so the object shown by
# earlier cells is never changed behind the reader's back and the statement
# can be chained with further transformations.
df2 = df2.rename(columns={'yes': 'clicked_or_not'})

In [21]:
# Display the frame to confirm that the response column now appears as
# `clicked_or_not`.
df2

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,clicked_or_not
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0
...,...,...,...,...,...,...,...,...
8059,ffa08ff9-a132-4051-aef5-01a9c79367bc,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1
8063,ffb176df-ecd2-45d3-b05f-05b173a093a7,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1
8064,ffb79718-6f25-4896-b6b3-e58b80a6e147,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0
8069,ffca1153-c182-4f32-9e90-2a6008417497,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0


# Splitting Numerical and Categorical Variables

In [22]:
# Partition the columns of df2 by dtype. Numeric dtypes are selected
# explicitly and every other column is treated as categorical, so the two
# lists always cover each column exactly once. Selecting on "object" instead
# would file datetime/bool/category columns under "numerical" and would miss
# text columns stored with pandas' dedicated string dtype.
numerical_column = df2.select_dtypes(include="number").columns.tolist()
categorical_column = df2.select_dtypes(exclude="number").columns.tolist()
print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

Numerical Columns: ['hour', 'platform_os', 'clicked_or_not']
****************
Categorical Columns: ['auction_id', 'experiment', 'date', 'device_make', 'browser']
