In [None]:
%config IPCompleter.use_jedi=False
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.preprocessing
import matplotlib.pyplot as plt

from Utils import print_memory_usage, frequency_encoding_by_usertype, frequency_encode_stations, evaluate_model

# Locations of the training and validation parquet files (relative paths).
path_train = "Data/Train.parquet"
path_val = "Data/Validation.parquet"

# Logistic Regression

The first model I want to try is a simple logistic regression model.
Let's start with a simple test model.

## Variable Encodings

One question here is how to choose a proper encoding for some of the variables.

1. Hour of the day: One approach is to convert it into a cyclic variable, and another is to use fixed "binned" time intervals like morning, midday, evening. We could also convert each hour into a category, but this does not make much sense.
2. Numerical features: should they be scaled to be comparable? Scaling is mainly relevant for gradient-based optimization, but we might also want to center the features to zero mean. It also affects regularization, which is on by default, so yes, the features should be standardized (this is also mentioned in The Elements of Statistical Learning).

In [2]:
def hour_to_coordinate(h):
    """Encode an hour of the day as a point on the unit circle.

    The hour is mapped to an angle with a 24-hour period, so hour 0 and
    hour 24 land on the same point.

    Parameters
    ----------
    h : scalar or array-like
        Hour of the day.

    Returns
    -------
    tuple
        ``(sin(angle), cos(angle))`` where ``angle = 2*pi*h/24``.
    """
    angle = 2*np.pi*(h)/24
    return np.sin(angle), np.cos(angle)

In [3]:
def categorize_tod(data,dummies=False):
    """Bin the start hour of each trip into four time-of-day categories.

    The hour ranges are based on the plots in the Analysis notebook:
    0 = hours 0-6, 1 = hours 7-10, 2 = hours 11-15, 3 = hours 16-23.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime column "starttime".
    dummies : bool, default False
        If True, return one-hot encoded columns instead of ordinal codes.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        With ``dummies=False``: an int64 Series holding the codes 0-3.
        With ``dummies=True``: a DataFrame that always has the columns
        "tod_1", "tod_2" and "tod_3" (category 0 is the dropped reference
        level), regardless of which categories occur in ``data``.

    Raises
    ------
    ValueError
        If a start hour is missing, since the codes cannot be cast to int64.
    """
    hours = data["starttime"].dt.hour
    # Right-inclusive bin edges: (-1,6], (6,10], (10,15], (15,24]
    bins = [-1, 6, 10, 15, 24]
    names = [0, 1, 2, 3]
    tod = pd.cut(hours, bins, labels=names)
    # The cast also acts as a check: it fails if any hour is missing.
    codes = tod.astype("int64")
    if dummies:
        # Encode the categorical rather than the int codes. Dummies built from
        # the codes only cover categories present in this data set, and
        # drop_first would drop the first *observed* level, so two data sets
        # could end up with different feature columns.
        return pd.get_dummies(tod, prefix="tod", drop_first=True)
    return codes

In [10]:
def print_model_coefficients(model,X):
    """Print the intercept and the per-feature coefficients of a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``intercept_`` and ``coef_``; the first entry
        of each is used.
    X : pandas.DataFrame
        Feature matrix whose column names label the coefficients.
    """
    labels = ["intercept", *X.columns]
    values = [model.intercept_[0], *model.coef_[0]]
    coefficients = pd.Series(values, index=labels)
    print("Coefficients: ")
    print(coefficients)

Note that we cannot encode the station id as a one-hot vector because it takes too much memory.
Instead I will just encode whether the station is one of the top customer stations or not.

In [13]:
def top_stations(data,k=20):
    """Return the ids of the k start stations most used by customers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "usertype" and "start station id".
    k : int, default 20
        Maximum number of station ids to return.

    Returns
    -------
    list
        Start station ids of "Customer" trips, ordered by descending trip
        count; shorter than ``k`` if fewer stations exist.

    Raises
    ------
    KeyError
        If ``data`` contains no "Customer" trips.
    """
    # Count on the frame that was passed in, not on a notebook-level global.
    grouped = data.groupby("usertype")["start station id"].value_counts()
    # value_counts sorts by count within each group. The station ids form an
    # integer index, so use .iloc to make the slice explicitly positional.
    return grouped["Customer"].iloc[:k].index.tolist()

TODO: Proper data preparation pipeline

TODO: Take log of some features that span multiple orders of magnitude?

In [33]:
#TODO: Optimize by not copying any data
def load_and_preprocess(path,scaler=None): #TODO: Use pipelines instead
    """Load a parquet data set and build the model's feature matrix and labels.

    Parameters
    ----------
    path : str
        Path of the parquet file to read.
    scaler : fitted scaler or None, default None
        Scaler applied to the numerical columns. If None, a new
        ``MinMaxScaler`` is fitted on this data set. Pass the scaler returned
        for the training set when preprocessing validation data.

    Returns
    -------
    X : pandas.DataFrame
        Base features, time-of-day dummies, their interactions with
        "business day", and whatever ``frequency_encode_stations`` adds.
    Y : pandas.Series
        Boolean labels, True where "usertype" equals "Customer".
    scaler
        The scaler that was used (newly fitted or the one passed in).
    """
    data = pd.read_parquet(path,engine="pyarrow")
    numerical = ["haversine distance","tripduration","speed"] #features to be scaled in the end
    features=["tripduration", "summer","business day", "haversine distance", "is_roundtrip", "speed"]
    unused = [c for c in data.columns if c not in features]
    label="usertype"
    X = data.drop(columns=unused)
    Y = data[label].copy()

    #TOD and interaction terms
    tod=categorize_tod(data,dummies=True)
    X = pd.concat([X,tod],axis=1,copy=False)
    interaction = tod.mul(X["business day"],axis=0)
    interaction.columns = ["business x " + c for c in tod.columns]
    X = pd.concat([X,interaction],axis=1,copy=False)

    #Try encoding start station as categorical
    #topstations = top_stations(data,k=20)
    #X["topstation"] = data["start station id"].isin(topstations)

    #Try encoding station by customer count
    # NOTE(review): helper from Utils; presumably adds station frequency
    # columns computed from `data` -- confirm it does not use the labels of
    # the data set being encoded when this runs on validation data.
    X = frequency_encode_stations(X,data)

    #Gender
    #dum = pd.get_dummies(data,columns=["gender"],drop_first=True,prefix="gender")

    Y=(Y=="Customer")
    # Compare against None explicitly: only a missing scaler should trigger a
    # refit, not a scaler object that happens to evaluate as falsy.
    if scaler is None:
        scaler = sklearn.preprocessing.MinMaxScaler() #MinMaxScaler or StandardScaler does not seem to matter. MinMaxScaler has advantage of preserving speed = 0 values for roundtrips
        scaler = scaler.fit(X[numerical])
    X[numerical] = scaler.transform(X[numerical])
    return X,Y,scaler

In [None]:
def train(datapath):
    """Fit a logistic regression on the data set stored at `datapath`.

    Parameters
    ----------
    datapath : str
        Path of the parquet file holding the training data.

    Returns
    -------
    clf : LogisticRegression
        The fitted classifier.
    scaler
        The scaler fitted on the training data by ``load_and_preprocess``;
        reuse it when preprocessing other data sets.
    """
    # Load from the path that was passed in, not the notebook-level path_train.
    X_train,Y_train,scaler = load_and_preprocess(datapath)
    clf = LogisticRegression(max_iter=300) #Might make sense to use balanced class weights here
    clf=clf.fit(X_train,Y_train)
    # Report the training-set metrics as a side effect.
    evaluate_model(clf,X_train,Y_train)
    return clf,scaler

In [16]:
# Load the training data at its first use so that this cell (and the model
# fit that follows) also works after Restart Kernel -> Run All.
X_train,Y_train,scaler = load_and_preprocess(path_train)

# Baseline that ignores the features; its accuracy is the number to beat.
baseline = DummyClassifier()
baseline = baseline.fit(X_train,Y_train)
acc_base = baseline.score(X_train,Y_train)
acc_base

0.8905468036686415

In [17]:
# Fit the logistic regression on the training features (at most 300 solver iterations).
# Might make sense to use balanced class weights here
clf = LogisticRegression(max_iter=300).fit(X_train,Y_train)

In [27]:
# Load and preprocess the training set; the returned scaler is reused for the validation set.
X_train,Y_train,scaler = load_and_preprocess(path_train)

In [28]:
# Report accuracy and the confusion matrix on the training set.
evaluate_model(clf,X_train,Y_train)
# Drop the training data names (presumably to free memory before loading the validation set).
del X_train, Y_train

Accuracy: 0.9387394851499196
Confusion: 
[[0.98692923 0.01307077]
 [0.45334794 0.54665206]]


In [31]:
# Preprocess the validation set with the scaler that was fitted on the training data.
X_val, Y_val,scaler = load_and_preprocess(path_val,scaler=scaler)

In [32]:
# Report accuracy and the confusion matrix on the validation set.
evaluate_model(clf,X_val,Y_val);

Accuracy: 0.9389125762369973
Confusion: 
[[0.98705856 0.01294144]
 [0.45305693 0.54694307]]


(0.9389125762369973,
 array([[0.98705856, 0.01294144],
        [0.45305693, 0.54694307]]))

Notes: By using gender=unknown, gender=male categorical features one can easily get about 94% training accuracy, and about 98% on subscribers. Using classes = balanced gives 79% accuracy overall, but about 82% on customers instead of 20% for unbalanced.

Scaling data with min-max scaler seems to have no effect
Using categorical tod encoding does not seem much different from using ordinal encoding or hours, at least if we don't use interactions.
Adding interaction terms between tod and business day does not help much.

Adding top_20_customer_start_station as a feature increases accuracy from about 89.5% to 90%.
Instead using customer counts of each start_station as a feature gives 90% training but only 89.5% validation accuracy

Frequency encoding both start and end station by both subscriber and customer also gives around 90% accuracy.

Adding gender to this again gives about 94% accuracy, and 54% on customers

Further ideas:
1. add interaction term between start and end station
2. different categories for tod and summer
3. Train with class weights=balanced