In [2]:
%config IPCompleter.use_jedi=False
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.preprocessing
import matplotlib.pyplot as plt

from Utils import print_memory_usage

path_train = "Data/Train.parquet"
path_val = "Data/Validation.parquet"

# Logistic Regression

The first model I want to try is a simple logistic regression model.
Let's start with a simple test model.

## Variable Encodings

One question here is how to choose a proper encoding for some of the variables.

1. Hour of the day: One approach is to convert it into a cyclic variable; another is to use fixed "binned" time intervals like morning, midday, evening. We could also convert each hour into its own category, but this does not make much sense.
2. Numerical features: Should they be scaled to be comparable? Scaling mainly matters for gradient-based optimization, but we might at least want zero mean, and it also affects regularization, which is on by default. Conclusion: yes, we should standardize (this is also mentioned in *The Elements of Statistical Learning*).

In [3]:
def hour_to_coordinate(h):
    """Place an hour of the day on the unit circle, using a 24-hour period.

    Parameters
    ----------
    h : scalar or array-like
        Hour(s) of the day.

    Returns
    -------
    tuple
        ``(sin(angle), cos(angle))`` with ``angle = 2*pi*h/24``, so that
        hour 24 lands on the same point as hour 0.
    """
    angle = 2*np.pi*(h)/24
    return np.sin(angle), np.cos(angle)

In [4]:
def categorize_tod(data,dummies=False):
    """Bin the hour of ``data["starttime"]`` into four time-of-day categories.

    The bins are right-inclusive on the hour: category 0 covers hours 0-6,
    category 1 hours 7-10, category 2 hours 11-15 and category 3 hours 16-23.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime-like ``"starttime"`` column.
    dummies : bool, default False
        If True, return one-hot columns ``tod_1``, ``tod_2``, ``tod_3``
        (category 0 is the dropped reference level). Otherwise return the
        category codes as an ``int64`` Series.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Indexed like ``data``.
    """
    #Categories based on plots in Analysis Notebook
    hours = data["starttime"].dt.hour
    bins = [-1,6,10,15,24]
    names = [0,1,2,3]
    tod = pd.cut(hours,bins,labels=names)
    # Always do the integer cast so that missing start times still fail here
    # instead of silently becoming all-zero dummy rows.
    codes = tod.astype("int64")
    if dummies:
        # Encode from the categorical (not the int codes): the full category
        # set is kept, so every tod_* column exists even when a bin does not
        # occur in `data`, and train/validation frames get the same columns.
        return pd.get_dummies(tod,prefix="tod",drop_first=True)
    return codes

In [5]:
def evaluate_model(model,X_train,Y_train, X_val,Y_val):
    """Print summary statistics for a fitted classifier.

    Prints, in this order: training accuracy, validation accuracy, the
    row-normalised (``normalize="true"``) training and validation confusion
    matrices, and - for models exposing ``intercept_`` and ``coef_`` - the
    intercept and coefficients labelled with the training column names.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X_train, X_val : pandas.DataFrame
        Feature matrices; ``X_train.columns`` labels the coefficients.
    Y_train, Y_val : array-like
        True labels for the two splits.

    Returns
    -------
    None
    """
    #TODO: Add uncertainty estimates about these
    Y_train_pred = model.predict(X_train)
    Y_val_pred = model.predict(X_val)

    training_acc = accuracy_score(Y_train,Y_train_pred)
    print(f"Training accuracy: {training_acc}")
    val_acc = accuracy_score(Y_val,Y_val_pred)
    print(f"Val accuracy: {val_acc}")

    confusion_train = confusion_matrix(Y_train,Y_train_pred,normalize="true")
    print("Training confusion: ")
    print(confusion_train)
    confusion_val = confusion_matrix(Y_val,Y_val_pred,normalize="true")
    print("Validation confusion: ")
    print(confusion_val)

    # Report the coefficients of the model that was passed in, labelled with
    # the columns it was evaluated on (previously this read the notebook
    # globals `clf` and `X`). Models without linear coefficients, such as a
    # dummy baseline, simply skip this part.
    if hasattr(model,"coef_") and hasattr(model,"intercept_"):
        coefficients = pd.Series(
            [model.intercept_[0]]+list(model.coef_[0]),
            index=["intercept"]+list(X_train.columns),
        )
        print("Coefficients: ")
        print(coefficients)

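Note that we cannot encode the station id as a one-hot vector because it takes too much memory.
Instead I encode each start station by a cheaper summary: either whether it is one of the top-k customer stations (`top_stations`, default k=20), or the number of customer trips that started at that station (`encode_stations_by_customercount`).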

In [14]:
def top_stations(data,k=20):
    """Return the ids of the ``k`` start stations with the most customer trips.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``"usertype"`` and ``"start station id"`` columns.
    k : int, default 20
        Number of stations to return.

    Returns
    -------
    list
        Station ids ordered by decreasing number of trips whose ``usertype``
        is ``"Customer"``.
    """
    # Count on the frame that was passed in (previously this read the
    # notebook global `data_train` and ignored the argument).
    grouped = data.groupby("usertype")["start station id"].value_counts()
    # value_counts is sorted descending within each group; .iloc makes the
    # top-k slice explicitly positional even though the index holds integers.
    return grouped["Customer"].iloc[:k].index.tolist()

In [51]:
def encode_stations_by_customercount(X,data,counts=None):
    """Add a ``"start customercount"`` column: customer trips per start station.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is modified in place and also returned.
    data : pandas.DataFrame
        Must contain ``"start station id"``; also ``"usertype"`` when
        ``counts`` is not given.
    counts : pandas.Series, optional
        Customer trip counts indexed by station id. When omitted they are
        computed from ``data`` itself. Because that uses the labels of
        ``data``, pass counts computed on the training set when encoding
        validation or test data to avoid label leakage.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the new integer column; stations missing from ``counts``
        get 0.
    """
    if counts is None:
        grouped = data.groupby("usertype")["start station id"].value_counts()
        counts = grouped["Customer"]
    X["start customercount"] = data["start station id"].map(counts).fillna(0).astype(int)
    return X

TODO: Proper data preparation pipeline

TODO: Take log of some features that span multiple orders of magnitude?

In [52]:
#Station ids are not one-hot encoded; the start station enters only through its customer count
def preprocess(data,scaler=None): #TODO: Use pipelines instead
    """Turn a raw trip frame into a feature matrix and a boolean target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the base feature columns listed below plus
        ``"usertype"``, ``"starttime"`` and ``"start station id"``.
    scaler : fitted scaler, optional
        Applied to the numerical columns. When omitted (or falsy) a new
        ``MinMaxScaler`` is fitted on this frame.

    Returns
    -------
    X : pandas.DataFrame
        Base features, time-of-day dummies, business-day x time-of-day
        interactions and the start-station customer count, with the
        numerical columns scaled.
    Y : pandas.Series of bool
        True where ``usertype == "Customer"``.
    scaler
        The scaler that was used, so it can be passed to a later call.
    """
    label = "usertype"
    features = ["tripduration", "summer","business day", "haversine distance", "roundtrip", "speed"]
    numerical = ["haversine distance","tripduration","speed", "start customercount"]

    # Dropping everything else (instead of selecting) keeps the frame's own column order
    dropped = [col for col in data.columns if col not in features]
    X = data.drop(columns=dropped)
    Y = data[label] == "Customer"

    # Time-of-day dummies and their interaction with the business-day flag
    tod_dummies = categorize_tod(data,dummies=True)
    business_x_tod = tod_dummies.mul(X["business day"],axis=0)
    business_x_tod.columns = ["business x " + name for name in tod_dummies.columns]
    X = pd.concat([X,tod_dummies,business_x_tod],axis=1,copy=False)

    #Alternative tried: encoding start station as categorical
    #topstations = top_stations(data,k=20)
    #X["topstation"] = data["start station id"].isin(topstations)

    #Current choice: encode station by customer count
    X = encode_stations_by_customercount(X,data)

    #Further features tried:
    #X["birth year"] = data["birth year"]-data["birth year"].min() #scale to smaller integer range
    #data["gender_male"] = data["gender"] == 1
    #data["gender_unknown"] = data["gender"] == 0

    if not scaler:
        #MinMaxScaler or StandardScaler does not seem to matter. MinMaxScaler has advantage of preserving speed = 0 values for roundtrips
        scaler = sklearn.preprocessing.MinMaxScaler().fit(X[numerical])
    X[numerical] = scaler.transform(X[numerical])
    return X,Y,scaler

In [8]:
# NOTE(review): load_data is neither defined nor imported in the cells visible here
# (only print_memory_usage is imported from Utils) - presumably it lives in Utils; confirm,
# otherwise this cell fails on a fresh kernel.
data_train = load_data(path_train)

In [53]:
# No scaler is passed, so preprocess fits a new one on the training data and returns it for reuse
X,Y,scaler = preprocess(data_train)

In [54]:
# Feature-independent baseline: its training accuracy is the bar the logistic regression has to clear
baseline = DummyClassifier()
baseline = baseline.fit(X,Y)
acc_base = baseline.score(X,Y)  # accuracy on the training data
acc_base

0.8905596285361598

In [55]:
# Logistic regression on the preprocessed training features, with the iteration limit set to 300
clf = LogisticRegression(max_iter=300) #Might make sense to use balanced class weights here
clf=clf.fit(X,Y)

In [None]:
# NOTE(review): this cell has no execution count although later cells use data_val - re-run in order.
# load_data is also not defined or imported in the cells visible here; confirm where it comes from.
data_val = load_data(path_val)

In [56]:
# Reuse the scaler fitted on the training data so both splits are scaled identically.
# NOTE(review): preprocess still recomputes the start-station customer counts from data_val
# itself, i.e. from the validation labels - consider reusing the training counts instead.
X_val, Y_val,scaler = preprocess(data_val,scaler=scaler)

In [57]:
# Print accuracies, row-normalised confusion matrices and coefficients for the fitted model
evaluate_model(clf,X,Y,X_val,Y_val)

Training accuracy: 0.9012529724987639
Val accuracy: 0.8951307947455558
Training confusion: 
[[0.98221255 0.01778745]
 [0.75754713 0.24245287]]
Validation confusion: 
[[0.9909919  0.0090081 ]
 [0.88141488 0.11858512]]
Coefficients: 
intercept              -0.459020
tripduration            2.048347
roundtrip              -1.098812
haversine distance      7.207567
business day           -0.685669
speed                 -15.702991
summer                  0.624983
tod_1                   0.082217
tod_2                   0.377071
tod_3                   0.239407
business x tod_1       -0.486538
business x tod_2        0.133096
business x tod_3       -0.298165
start customercount     2.733849
dtype: float64


Notes: By using gender=unknown and gender=male as categorical features one can easily get about 94% training accuracy, and about 98% on subscribers. Using balanced class weights gives 79% accuracy overall, but about 82% on customers instead of 20% for the unbalanced model.

Scaling data with the min-max scaler seems to have no effect.
Using a categorical tod encoding does not seem much different from using an ordinal encoding or raw hours, at least if we don't use interactions.
Adding interaction terms between tod and business day does not help much.

Adding top_20_customer_start_station as a feature increases accuracy from about 89.5% to 90%.
Instead using the customer count of each start_station as a feature gives 90% training but only 89.5% validation accuracy.

In [23]:
# NOTE(review): execution count 23 predates the In [57] evaluation above, and the stored output
# differs from the validation accuracy printed there - re-run to refresh.
clf.score(X_val,Y_val)

0.894888232649087

In [24]:
# Row-normalised validation confusion matrix.
# NOTE(review): stored output differs from the matrix printed by In [57] - likely stale, re-run.
Y_pred = clf.predict(X_val)
confusion_matrix(Y_val,Y_pred,normalize="true")

array([[0.98162453, 0.01837547],
       [0.80773977, 0.19226023]])

In [25]:
# Baseline accuracy on the validation set, for comparison with clf.score above
baseline.score(X_val,Y_val)

0.8901187982192281