In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Location of the bidding CSV. Set the BIDDINGS_CSV environment variable to
# point at your own copy; when it is unset the original local path is used.
input_path = os.environ.get('BIDDINGS_CSV', '/Users/ytu/data/biddings.csv')

In [3]:
# Read the whole CSV into a DataFrame, then print its (rows, columns) shape
# as a quick sanity check on what was loaded.
data = pd.read_csv(input_path)
print(data.shape)

(1000000, 89)


### Explore input data

Only ~0.2% of rows have `convert = true`, so the two classes are heavily imbalanced.

In [8]:
# Percentage of rows whose `convert` label is 1. Uses the Series method
# because the top-level pd.value_counts helper is deprecated in recent pandas.
data['convert'].value_counts()[1] / len(data) * 100

0.1908

#### Down-sample so that `convert = false` and `convert = true` have the same number of rows

In [60]:
def down_sampling(minority_ratio=1, minority_to_majority=1, frame=None, random_state=None):
    """Build a training sample by down-sampling the majority (`convert == 0`) class.

    Parameters
    ----------
    minority_ratio : float, default 1
        Fraction of the `convert == 1` rows to keep. Values above 1 are
        rejected by pandas because rows are drawn without replacement.
    minority_to_majority : float, default 1
        Number of `convert == 0` rows drawn for every kept `convert == 1` row.
        The product is rounded to a whole row count.
    frame : pandas.DataFrame, optional
        Table to sample from; it must contain a `convert` column. When omitted,
        the notebook-level `data` frame is used.
    random_state : int, optional
        Seed forwarded to every `DataFrame.sample` call so the result can be
        reproduced. `None` (the default) leaves the sampling unseeded.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        `features` (every column except `convert`) and `labels` (the `convert`
        column), with rows in the same shuffled order.

    Raises
    ------
    ValueError
        Raised by pandas when more rows are requested than are available.
    """
    source = data if frame is None else frame

    # Keep a *fraction* of the minority rows. Multiplying the frame by the
    # ratio would rescale every cell (labels included) instead of sub-sampling.
    minority_rows = source.loc[source['convert'] == 1]
    convert1_sample = minority_rows.sample(frac=minority_ratio, random_state=random_state)

    # DataFrame.sample needs an integer row count even for fractional ratios.
    majority_rows = source.loc[source['convert'] == 0]
    n_majority = int(round(minority_to_majority * len(convert1_sample)))
    convert0_sample = majority_rows.sample(n=n_majority, random_state=random_state)

    # shuffle so the two classes are interleaved
    sample = pd.concat([convert0_sample, convert1_sample]).sample(frac=1, random_state=random_state)

    features = sample.drop('convert', axis=1)
    labels = sample.convert

    print("features: {}, lables = {}".format(features.shape, labels.shape))

    return (np.array(features), np.array(labels))


# Draw a sample with the default ratios and keep both arrays for the model cells below.
features, labels = down_sampling()

features: (3816, 88), lables = (3816,)


#### Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an explicit L2 penalty; every other hyperparameter
# is left at the library default.
regression = LogisticRegression(penalty='l2')

Cross-validation with mean accuracy as the test metric

In [96]:
from sklearn.model_selection import KFold

def train_kfold_accuracy(n_splits, random_state=None):
    """Return the mean held-out accuracy of `regression` over a shuffled K-fold split.

    Reads the notebook-level `features`, `labels` and `regression` objects.
    `regression` is refit in place on every fold, so after the call it holds
    the model trained on the last fold.

    Parameters
    ----------
    n_splits : int
        Number of folds handed to `KFold`.
    random_state : int, optional
        Seed for the fold shuffle. `None` (the default) leaves the shuffle
        unseeded, so repeated calls may produce different splits.

    Returns
    -------
    float
        Mean of the per-fold `score` values.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # A plain list avoids re-allocating a NumPy array on every fold.
    fold_accuracies = []
    for train_index, test_index in kf.split(features):
        feature_train, feature_test = features[train_index], features[test_index]
        label_train, label_test = labels[train_index], labels[test_index]

        model = regression.fit(feature_train, label_train)
        fold_accuracies.append(model.score(feature_test, label_test))

    return np.mean(fold_accuracies)

In [102]:
# Print the mean cross-validated accuracy for each fold count from 2 through 7.
for fold_count in range(2, 8):
    mean_accuracy = train_kfold_accuracy(fold_count)
    print(mean_accuracy)

0.6318134171907757
0.6370545073375262
0.6433438155136268
0.6386244021601147
0.6360062893081762
0.6388868885592346
