In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
import seaborn as sns
import csv
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [2]:
def get_data(path='data/ad_data/train.csv', nrows=10000, test_size=1/4., random_state=0):
    """Load the ad click data, one-hot encode it and split it into train/test sets.

    Every column except ``id``, ``click``, ``hour``, ``device_ip`` and
    ``device_id`` is one-hot encoded. Those five columns are then removed from
    the feature matrix; ``click`` is used as the target.

    Parameters
    ----------
    path : str
        CSV file to read.
    nrows : int or None
        Number of rows to read from the start of the file (``None`` reads all).
    test_size : float
        Fraction of the rows held out for the test split.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.

    Side effects
    ------------
    Prints the shape of the full encoded feature matrix.
    """
    data_frame = pd.read_csv(path, nrows=nrows)

    # Columns that are never one-hot encoded and never used as features.
    passthrough = ['id', 'click', 'hour', 'device_ip', 'device_id']
    categorical = [col for col in data_frame.columns if col not in passthrough]

    # Encode all categorical columns in one call. Dropping and joining one
    # column at a time copies the whole (ever wider) frame on each iteration.
    # The resulting column names and order are the same: untouched columns
    # first, then "<column>_<value>" dummies in the original column order.
    encoded = pd.get_dummies(data_frame, columns=categorical)

    X = encoded.drop(passthrough, axis=1)
    Y = encoded['click']
    print(X.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    return X_train, X_test, Y_train, Y_test

# Build the encoded train/test splits once; later cells reuse X_test / Y_test.
X_train, X_test, Y_train, Y_test = get_data()

(10000, 2820)


In [10]:
try:
    # Standalone package; sklearn.externals.joblib was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Older scikit-learn releases only ship the vendored copy.
    from sklearn.externals import joblib

# Hand joblib the path so it opens the file itself (in binary mode) and closes
# it afterwards. The earlier open(..., 'r') used text mode, which breaks
# unpickling on Python 3, and never closed the handle.
# NOTE(review): joblib.load unpickles its input - only load trusted model files.
model = joblib.load('models/ad_model.pkl')

# Take the first five held-out rows (and their labels), one bid per row.
ad_data, click_labels = X_test.iloc[:5], Y_test.iloc[:5]
bids = [10, 20, 5, 12, 2]
X_test.shape

(2500, 2820)

In [11]:
def rank_ads(model, ads, bids):
    """Rank ads by predicted click-through rate multiplied by bid.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(ads)`` that returns one prediction per ad.
    ads : array-like
        Feature rows, passed to ``model.predict`` unchanged.
    bids : array-like
        One bid per ad, in the same order as ``ads``.

    Returns
    -------
    idx : numpy.ndarray
        Ad positions ordered from highest to lowest rank score. Ads with equal
        scores keep their original relative order.
    ctr_preds
        The predictions exactly as returned by ``model.predict``.
    rank_scores : numpy.ndarray
        Element-wise product of the predictions and the bids.
    """
    ctr_preds = model.predict(ads)
    # Convert both operands so plain Python lists multiply element-wise instead
    # of raising TypeError (list * list is not defined).
    rank_scores = np.asarray(ctr_preds) * np.asarray(bids)
    # 'mergesort' is numpy's stable sort: tied scores are ranked
    # deterministically instead of in an implementation-defined order.
    idx = np.argsort(-rank_scores, kind='mergesort')
    return idx, ctr_preds, rank_scores

In [12]:
# Score the five sample ads against their bids and keep all three results for inspection.
ad_rankings, ctr_preds, rank_scores = rank_ads(model, ad_data, bids)

In [13]:
# Positions of the sample ads, ordered from highest to lowest rank score.
ad_rankings

array([3, 0, 2, 4, 1])

In [14]:
# Rank score per ad: the model's prediction multiplied by that ad's bid.
rank_scores

array([ 1.88584557, -0.41718496,  0.66664189,  2.57251385,  0.64767247])

In [15]:
# Raw model predictions, one per ad. The saved output includes a negative value,
# so these predictions are not confined to the [0, 1] range of a probability.
ctr_preds

array([ 0.18858456, -0.02085925,  0.13332838,  0.21437615,  0.32383623])

In [None]:
# Predictions reordered to follow the ranking (best-ranked ad first).
ctr_preds[ad_rankings]