# Importing Libraries

In [2]:
import os
import warnings
import sys

#import dvc.api
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from fast_ml.model_development import train_valid_test_split
from sklearn.tree import DecisionTreeClassifier
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import logging

ModuleNotFoundError: No module named 'fast_ml'

In [None]:
# Suppress every warning raised from this point on so that cell output
# stays readable.
import warnings

warnings.filterwarnings(action='ignore')

# Loading Data

In [None]:
# Load the raw AdSmart CSV; the path is resolved relative to the current
# working directory.
raw_data_path = '../data/AdSmartABdata.csv'
df = pd.read_csv(raw_data_path)

# Exploring the Categorical Columns

In [None]:
# Keep only the rows where the user actually answered, i.e. either `yes`
# or `no` is set.  The earlier condition `yes == 1 | no == 0` kept rows
# with yes == 0 and no == 0 (no answer given) and dropped every row with
# no == 1, which is the opposite of what the later target column needs.
# NOTE(review): assumes `yes` and `no` are 0/1 response flags — confirm
# against the dataset description.
relevant_rows = df.query('yes == 1 | no == 1')
# relevant_rows

In [None]:
# The two most frequent `platform_os` values, most frequent first
# (value_counts orders by descending count).
top2_platform = (
    relevant_rows['platform_os']
    .value_counts()
    .head(2)
    .index
    .to_list()
)
top2_platform

In [None]:
# One filtered frame per entry of `top2_platform`, in the same order.
platform_list = [
    relevant_rows[relevant_rows['platform_os'] == os_value]
    for os_value in top2_platform
]
# NOTE(review): the variable names assume the two most common platforms
# are 6 and 5, in that order — confirm against `top2_platform`.
platform_six, platform_five = platform_list[0], platform_list[1]


In [None]:
# Write each per-platform subset to its own CSV file under ../data.
for platform_subset, platform_csv in (
    (platform_six, '../data/platform_six.csv'),
    (platform_five, '../data/platform_five.csv'),
):
    platform_subset.to_csv(platform_csv)

In [None]:
# The two most frequent `browser` values, most frequent first
# (value_counts orders by descending count).
top2_browser = (
    relevant_rows['browser']
    .value_counts()
    .head(2)
    .index
    .to_list()
)
top2_browser

In [None]:
# One filtered frame per entry of `top2_browser`, in the same order.
browser_list = [
    relevant_rows[relevant_rows['browser'] == browser_name]
    for browser_name in top2_browser
]
# NOTE(review): the variable names assume the two most common browsers are
# "Chrome Mobile" and "Chrome Mobile WebView", in that order — confirm
# against `top2_browser`.
chrome_mobile_data, chrome_mobile_webview_data = browser_list[0], browser_list[1]

In [None]:
# Write each per-browser subset to its own CSV file under ../data.
for browser_subset, browser_csv in (
    (chrome_mobile_data, '../data/chrome_mobile_data.csv'),
    (chrome_mobile_webview_data, '../data/chrome_mobile_webview_data.csv'),
):
    browser_subset.to_csv(browser_csv)


In [None]:
# New frame holding the filtered rows without the `no` column;
# `relevant_rows` itself is left untouched.
df2 = relevant_rows.drop(columns=['no'])

In [None]:
# From here on the `yes` flag is known as `clicked_or_not`.
df2 = df2.rename(columns={'yes': 'clicked_or_not'})

In [None]:
# Display the frame after dropping `no` and renaming `yes`.
df2

# Splitting Numerical and categorical variables

In [None]:
# Partition the column names by dtype: "object" columns count as
# categorical, all remaining columns as numerical.
categorical_column = df2.select_dtypes(include="object").columns.tolist()
numerical_column = df2.select_dtypes(exclude="object").columns.tolist()

print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

# Splitting the columns for one hot encoding and label encoding

In [None]:
# Categorical columns with 3 to 10 distinct values are one-hot encoded.
to_one_hot_encoding = [
    name for name in categorical_column
    if 2 < df2[name].nunique() <= 10
]

# Every other categorical column (2 or fewer, or more than 10 distinct
# values) is label encoded instead.
to_label_encoding = [
    name for name in categorical_column
    if name not in to_one_hot_encoding
]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)


In [None]:
# pandas' built-in `get_dummies()` expands each column listed in
# `to_one_hot_encoding` into one indicator column per distinct value.
one_hot_encoded_columns = pd.get_dummies(df2.loc[:, to_one_hot_encoding])
# one_hot_encoded_columns

In [None]:
# Replace each label-encoded column of df2 with its integer codes.  A single
# encoder object is re-fitted for every column, so afterwards `le` only
# remembers the classes of the last column processed.
le = LabelEncoder()
for label_col in to_label_encoding:
    df2[label_col] = le.fit_transform(df2[label_col])
# df2

In [None]:
# Label Encoding

# Build one single-column DataFrame per label-encoded column, each fitted
# with its own fresh LabelEncoder, and join them side by side.
# NOTE(review): the frames are built from bare arrays, so the result has a
# fresh 0..n-1 index rather than df2's index, and it is not referenced by
# the later cells visible here — confirm whether it is still needed.
label_encoded_columns = pd.concat(
    [
        pd.DataFrame({name: LabelEncoder().fit_transform(df2[name])})
        for name in to_label_encoding
    ],
    axis=1,
)
# label_encoded_columns

In [None]:
# Feature frame: df2 without the raw `date` and `browser` columns, with the
# one-hot indicator columns appended on the right.  `drop` returns a new
# frame, so df2 itself is not modified.
X = pd.concat(
    [df2.drop(columns=['date', 'browser']), one_hot_encoded_columns],
    axis=1,
)
print("All columns:", X.columns.tolist())
# X

In [None]:
# Define y (the value we will predict) and remove it from the features.
# The target column was renamed from "yes" to "clicked_or_not" when df2 was
# prepared, so looking it up as "yes" raised a KeyError.  `pop` returns the
# column and deletes it from X in one step.
y = X.pop("clicked_or_not")
# X

In [None]:
# Hold out 30% of the rows for testing, then take 23% of the remaining 70%
# (about 16% of all rows) as the validation set.  A fixed seed makes both
# splits, and therefore every metric computed from them, reproducible
# between runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.23, random_state=SPLIT_SEED
)

# Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the training split
# (`fit` returns the estimator itself), then predict the test split.
log = LogisticRegression().fit(X_train, y_train)
pred_log = log.predict(X_test)

# Show the first 10 predictions alongside their actual values.
print("Predicted:", pred_log[:10])
print("Actual:", y_test[:10])

In [None]:
from matplotlib import pyplot

# Use the first row of the fitted model's coefficients as a per-feature
# importance score.
importance = log.coef_[0]

# Print one line per feature: its position and its coefficient.
for position, weight in enumerate(importance):
    print(f'Feature: {position:d}, Score: {weight:.5f}')

# Bar chart of the same coefficients, one bar per feature position.
pyplot.bar(list(range(len(importance))), importance)
pyplot.show()

# Confusion Matrix

In [None]:
# `confusion_matrix` is not among the names imported in the notebook's
# import cell, so it is imported here to avoid a NameError.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test labels against the logistic-regression
# predictions.
cm_log = confusion_matrix(y_test, pred_log)
print("Logistic Regression:\n", cm_log)

# Accuracy Score

In [None]:
# Logistic Regression
# `accuracy_score` is not among the names imported in the notebook's import
# cell, so it is imported here to avoid a NameError.
from sklearn.metrics import accuracy_score

# Accuracy of the logistic-regression predictions on the test split.
acc_log = accuracy_score(y_test, pred_log)
print("Logistic Regression accuracy:", acc_log)