In [None]:
%%writefile train.py

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from google.colab import drive
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report, roc_auc_score, roc_curve, confusion_matrix, roc_auc_score, auc, log_loss
from imblearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
import multiprocessing as mp
#from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense
from statsmodels.stats.proportion import proportion_confint
from keras.wrappers.scikit_learn import KerasClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
import random
import gzip
from datetime import datetime
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
import pickle


warnings.filterwarnings("ignore")

types_train = {
    'id': np.dtype(int),
    'click': np.dtype(int),
    'hour': np.dtype(int),
    'C1': np.dtype(int),
    'banner_pos': np.dtype(int),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str), 
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(int),
    'device_conn_type': np.dtype(int),
    'C14': np.dtype(int),
    'C15': np.dtype(int),
    'C16': np.dtype(int),
    'C17': np.dtype(int),
    'C18': np.dtype(int),
    'C19': np.dtype(int),
    'C20': np.dtype(int),
    'C21':np.dtype(int)
}

types_test = {
    'id': np.dtype(int),
    'hour': np.dtype(int),
    'C1': np.dtype(int),
    'banner_pos': np.dtype(int),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str), 
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(int),
    'device_conn_type': np.dtype(int),
    'C14': np.dtype(int),
    'C15': np.dtype(int),
    'C16': np.dtype(int),
    'C17': np.dtype(int),
    'C18': np.dtype(int),
    'C19': np.dtype(int),
    'C20': np.dtype(int),
    'C21':np.dtype(int)
}

# Code to mount google drive in case you are loading the data from your google drive
from google.colab import drive
drive.mount('/gdrive')


n = 40428967  #total number of records in the clickstream data 
sample_size = 200000
skip_values = sorted(random.sample(range(1,n), n-sample_size)) 

parse_date = lambda val : datetime.strptime(val, '%y%m%d%H')

with gzip.open('/gdrive/My Drive/Diploma Project/avazu-ctr-prediction - Kaggle Dataset/train.gz') as f:
    df = pd.read_csv(f, parse_dates = ['hour'], date_parser = parse_date, dtype=types_train, skiprows = skip_values)

#Feature Engineering
df['hour_of_day'] = df["hour"].apply(lambda x: str(x.time())[:5])
#the feature hour_of_day only has hours to represent and not the minutes
df["hour_of_day"] = df["hour_of_day"].apply(lambda x: int(x.split(":")[0]))
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df["day_of_week"] = df["hour"].apply(lambda x: days[x.weekday()])

# drop unwanted columns using subjective analysis and also drop highly correlated columns
cols = list(df.columns)
if any(col in cols for col in ["id", "hour","C17", "device_type"]):
  df = df.drop(["id", "hour","C17", "device_type"], axis=1)

def convert_obj_to_int(fm):
    
    object_list_columns = fm.columns
    object_list_dtypes = fm.dtypes
    print(object_list_columns)
    print(object_list_dtypes)
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            fm[object_list_columns[index]] = fm[object_list_columns[index]].apply(lambda x: hash(x))
    return fm

df_hashed = convert_obj_to_int(df)
print(df_hashed.loc[0,:])
print(df_hashed.dtypes)

#ends

def create_train_valid_test_split(dF, test_percent, shuffle=True):

  if shuffle:
    dF = dF.sample(frac = 1).reset_index().drop("index", axis=1)

  cols = list(dF.columns)
  y = dF["click"].to_numpy()
  cols.remove('click')
  X = dF.loc[:, cols].to_numpy()
  print("Data shape before splitting: {}".format(X.shape))
  print("Labels shape before splitting: {}".format(y.shape))

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, random_state=1)
  # X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.12, random_state=1)

  print("Training data shape: {}".format(X_train.shape))
  print("Training labels shapre: {}".format(y_train.shape))
  # print("Validation data shape: {}".format(X_valid.shape))
  # print("Validation labels shape: {}".format(y_valid.shape))
  print("Test data shape: {}".format(X_test.shape))
  print("Test labels shape: {}".format(y_test.shape))
  
  return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = create_train_valid_test_split(df_hashed, 0.10)

#iso = IsolationForest(contamination=0.1)
#yhat = iso.fit_predict(X_train)
#mask = yhat != -1
#X_iso, y_iso = X_train[mask, :], y_train[mask]

#model = DecisionTreeClassifier()
#over = SMOTE(random_state=2, sampling_strategy=0.4, k_neighbors=1)
#under = RandomUnderSampler(sampling_strategy=0.5)
#steps = [('o', over), ('u', under)]
#pipeline = Pipeline(steps=steps)
#Xn, yn = pipeline.fit_resample(X_iso, y_iso.ravel())
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#scores = cross_val_score(model, Xn, yn, scoring='roc_auc', cv=cv, n_jobs=-1)
#score = np.mean(scores)
#print("k={}, Mean ROC AUC: {:.3f}".format(3, score))

#X_train = np.copy(Xn)
#y_train = np.copy(yn)

params = {}
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
# params['num_iterations'] = 100 # default
params['feature_fraction'] = 1.
params['bagging_fraction'] = 1.
params['nthreads'] = 8
# params['scale_pos_weight'] = 1 #positive_class_fraction
params['is_unbalance'] = False
params['max_bin'] = 2^12
params['n_estimators'] = 300
        
# parameter grid to use with cross-validation
param_grid = {}
param_grid['classifier__min_data_in_leaf'] = [30] 
param_grid['classifier__max_depth'] = [-1] 
param_grid['classifier__learning_rate'] = [0.03]
param_grid['classifier__min_data_per_group'] = [5]
param_grid['classifier__num_leaves'] = [100] # <= 2**max_depth
param_grid['classifier__regression_l2'] = [0.]
       

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('fselect', SelectKBest(score_func=f_classif, k=15)),
    ('classifier',LGBMClassifier(**params))
])

model = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='neg_log_loss')
print(X_train[0])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

import pickle
pickle.dump(model,open('pipeline.pkl','wb'))

# estimate log_loss
logloss = log_loss((y_test+1), probs)
print(logloss)

print('Best parameters set found on development set\n')
print(model.best_params_)

xgb_roc_auc = roc_auc_score(y_test, y_pred)
print(xgb_roc_auc)

Writing train.py


In [None]:
%run train.py

Mounted at /gdrive
Index(['click', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
       'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
       'device_model', 'device_conn_type', 'C14', 'C15', 'C16', 'C18', 'C19',
       'C20', 'C21', 'hour_of_day', 'day_of_week'],
      dtype='object')
click                int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
hour_of_day          int64
day_of_week         object
dtype: object
click                                 0
C1                           

In [None]:
!pip install -q streamlit

[K     |████████████████████████████████| 9.2 MB 5.2 MB/s 
[K     |████████████████████████████████| 182 kB 36.8 MB/s 
[K     |████████████████████████████████| 164 kB 25.3 MB/s 
[K     |████████████████████████████████| 78 kB 5.8 MB/s 
[K     |████████████████████████████████| 237 kB 49.9 MB/s 
[K     |████████████████████████████████| 4.7 MB 45.3 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 51 kB 6.3 MB/s 
[?25h  Building wheel for validators (setup.py) ... [?25l[?25hdone


In [None]:
import streamlit as st

In [None]:
!./ngrok authtokens 2FxqAY9VppUhMg9bAq3gCATq6YN_2EaqhKDJeKvQ1qx9WSm33

/bin/bash: ./ngrok: No such file or directory


In [None]:
!pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok
  Downloading pyngrok-5.1.0.tar.gz (745 kB)
[K     |████████████████████████████████| 745 kB 5.1 MB/s 
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-5.1.0-py3-none-any.whl size=19007 sha256=1930721b72092a012f5c6ecd228ddca799f1afaf7b689deb74793b8075384f81
  Stored in directory: /root/.cache/pip/wheels/bf/e6/af/ccf6598ecefecd44104069371795cb9b3afbcd16987f6ccfb3
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-5.1.0


In [None]:
from pyngrok import ngrok 

In [None]:
public_url = ngrok.connect(port='8501')
public_url

INFO:pyngrok.ngrok:Opening tunnel named: http-80-64a16626-32b8-43bb-bdb2-ce12d6cfe7b8
2022-10-13 10:28:38.184 INFO    pyngrok.ngrok: Opening tunnel named: http-80-64a16626-32b8-43bb-bdb2-ce12d6cfe7b8




INFO:pyngrok.process.ngrok:t=2022-10-13T10:28:41+0000 lvl=info msg="no configuration paths supplied"
2022-10-13 10:28:41.218 INFO    pyngrok.process.ngrok: t=2022-10-13T10:28:41+0000 lvl=info msg="no configuration paths supplied"
INFO:pyngrok.process.ngrok:t=2022-10-13T10:28:41+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
2022-10-13 10:28:41.230 INFO    pyngrok.process.ngrok: t=2022-10-13T10:28:41+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
INFO:pyngrok.process.ngrok:t=2022-10-13T10:28:41+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
2022-10-13 10:28:41.248 INFO    pyngrok.process.ngrok: t=2022-10-13T10:28:41+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
INFO:pyngrok.process.ngrok:t=2022-10-13T10:28:41+0000 lvl=info msg="starting web service" obj=web addr=127.0.0.1:4040
2022-10-13 10:28:41.258 INFO    pyngrok.process.ngrok: t=2022-10-13T

<NgrokTunnel: "http://4c83-35-223-104-98.ngrok.io" -> "http://localhost:80">

2022-10-13 10:28:41.480 INFO    pyngrok.process.ngrok: t=2022-10-13T10:28:41+0000 lvl=info msg="started tunnel" obj=tunnels name="http-80-64a16626-32b8-43bb-bdb2-ce12d6cfe7b8 (http)" addr=http://localhost:80 url=http://4c83-35-223-104-98.ngrok.io
INFO:pyngrok.process.ngrok:t=2022-10-13T10:28:41+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-64a16626-32b8-43bb-bdb2-ce12d6cfe7b8 addr=http://localhost:80 url=https://4c83-35-223-104-98.ngrok.io
2022-10-13 10:28:41.491 INFO    pyngrok.process.ngrok: t=2022-10-13T10:28:41+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-64a16626-32b8-43bb-bdb2-ce12d6cfe7b8 addr=http://localhost:80 url=https://4c83-35-223-104-98.ngrok.io


In [18]:
%%writefile streamlit_app.py 

import streamlit as st
import pandas as pd
import pickle

#Loading up the Regression model we created

model = pickle.load(open('pipeline.pkl', 'rb'))

#Caching the model for faster loading
#@st.cache


# Define the prediction function
def predict(C1,banner_pos,site_id,site_domain,site_category, app_id, app_domain, app_category,device_id, device_ip, device_model, device_conn_type, C14, C15, C16, C18, C19, C20, C21, hour, day):
    def convert_obj_to_int(fm):
      object_list_columns = fm.columns
      object_list_dtypes = fm.dtypes
      print(object_list_columns)
      print(object_list_dtypes)
      for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            fm[object_list_columns[index]] = fm[object_list_columns[index]].apply(lambda x: hash(x))
      return fm

    df=pd.DataFrame([[C1,banner_pos,site_id,site_domain,site_category, app_id, app_domain, app_category,device_id, device_ip, device_model, device_conn_type, C14, C15, C16, C18, C19, C20, C21, hour, day]], columns=['C1','banner_pos','site_id','site_domain','site_category', 'app_id', 'app_domain', 'app_category','device_id', 'device_ip', 'device_model', 'device_conn_type', 'C14', 'C15', 'C16', 'C18', 'C19','C20', 'C21', 'hour', 'day'])
    df_hashed = convert_obj_to_int(df)
    print(df_hashed.loc[0,:])
    print(df_hashed.dtypes)
    X = df_hashed.loc[:,:].to_numpy()
    print(X[0])
    prediction = model.predict(X)
    return prediction


st.title('Check if your ad will be clicked or not')
#st.image("""""")
st.header('Enter the characteristics of your ad:')
day = st.selectbox('Day of week:', ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday','Sunday'])
hour = st.number_input('Hour of day:', min_value=0, max_value=23, value=1)
C1 = st.selectbox('C1:', [1005, 1002, 1010, 1007, 1001, 1008, 1012])
banner_pos = st.selectbox('Banner_Position:', [0, 1, 5, 2, 4, 7, 3])

site_id = st.text_input('Site Id')
site_domain = st.text_input('Site Domain')
site_category = st.text_input('Site Category')
app_id = st.text_input('App Id')
app_domain = st.text_input('App Domain')
app_category = st.text_input('App Category')
device_id = st.text_input('Device Id')
device_ip = st.text_input('Device Ip')
device_model = st.text_input('Device Model')


device_conn_type = st.selectbox('Device Connection Type:', [0, 1, 5, 2, 4, 7, 3])
C15 = st.selectbox('C15:', [320,  300,  728,  216,  480, 1024,  768,  120])
C21 = st.number_input('C21',value=1)

C14 = st.number_input('C14',value=1)
C16 = st.number_input('C16',value=1)
C18 = st.number_input('C18',value=1)
C19 = st.number_input('C19',value=1)
C20 = st.number_input('C20',value=1)

if st.button('Check Click/Not click'):
    result = predict(C1,banner_pos,site_id,site_domain,site_category, app_id, app_domain, app_category,device_id, device_ip, device_model, device_conn_type, C14, C15, C16, C18, C19, C20, C21, hour, day)
    st.success('Prediction success', icon="✅")
    if result[0]==0:
      st.write('your ad will not be clicked')
    else:
      st.write('your ad will be clicked')
    

Overwriting streamlit_app.py


In [21]:
!streamlit run /content/streamlit_app.py & npx localtunnel --port 8501

2022-10-13 12:47:59.792 INFO    numexpr.utils: NumExpr defaulting to 2 threads.
[##................] | fetchMetadata: sill resolveWithNewModule ansi-styles@4.3[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.223.104.98:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.572s
your url is: https://heavy-cobras-dig-35-223-104-98.loca.lt
[34m  Stopping...[0m
^C
