In [1]:
%config IPCompleter.use_jedi=False
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

from Utils import print_memory_usage, frequency_encode_stations, evaluate_model, train, validate, load_data 

# Parquet files for the training and validation splits; these paths are
# handed to train() and validate() in the experiment cells below.
path_train = "Data/Train.parquet"
path_val = "Data/Validation.parquet"

# Logistic Regression

The first model I want to try is a simple logistic regression model.
Let's start with a simple test model.

## Variable Encodings

One question here is how to choose a proper encoding for some of the variables.

1. Hour of the day: One approach is to convert it into a cyclic variable; another is to use fixed "binned" time intervals such as morning, midday, and evening. We could also treat each hour as its own category, but this does not make much sense.
2. Numerical features: should they be scaled to be comparable? Scaling is mainly relevant for gradient-based optimization, though we might also want to center the features to zero mean. It also affects regularization, which is on by default, so yes, the features should be standardized (this is also mentioned in *The Elements of Statistical Learning*).

In [2]:
def print_model_coefficients(model,X):
    """
    Print the intercept and coefficients of a fitted linear model.

    Parameters
    ----------
    model : object exposing ``intercept_`` and ``coef_``; only the first
        entry of ``intercept_`` and the first row of ``coef_`` are shown.
    X : DataFrame whose column names label the coefficients.
    """
    labels = ["intercept", *X.columns]
    values = [model.intercept_[0], *model.coef_[0]]
    print("Coefficients: ")
    print(pd.Series(values, index=labels))

Note that we cannot encode the station id as a one-hot vector because it takes too much memory.
Instead I will just encode whether the station is one of the top customer stations or not.

In [3]:
def top_stations(data,k=20):
    """
    Return the ids of the k start stations most used by customers.

    Parameters
    ----------
    data : DataFrame with a "usertype" column (containing the value
        "Customer") and a "start station id" column.
    k : maximum number of station ids to return.

    Returns
    -------
    list of station ids, ordered from most to least frequent among rows
    whose usertype is "Customer".
    """
    # Use the frame that was passed in; the previous version read the global
    # `data_train`, which is not defined anywhere in this notebook.
    grouped = data.groupby("usertype")["start station id"].value_counts()
    # value_counts() sorts each group by descending count, so the first k
    # positions are the top k stations. iloc keeps the slice positional even
    # though the index consists of (integer) station ids.
    return grouped["Customer"].iloc[:k].index.tolist()

TODO: Proper data preparation pipeline

TODO: Take log of some features that span multiple orders of magnitude?

In [4]:
#TODO: Optimize by not copying any data
def load_and_preprocess(path,scaler=None): #TODO: Use pipelines instead
    """
    Load a parquet file and build the feature matrix and binary target.

    Parameters
    ----------
    path : location of the parquet file to read.
    scaler : an already fitted scaler to reuse (e.g. the one returned for the
        training data). When None, a new MinMaxScaler is fitted on this data.

    Returns
    -------
    X : DataFrame with the base features, time-of-day dummies, their
        interactions with "business day", and whatever
        frequency_encode_stations adds.
    Y : boolean Series, True where usertype is "Customer".
    scaler : the scaler that was applied to the numerical columns.
    """
    numerical = ["haversine distance","tripduration","speed"] #features to be scaled in the end
    features = ["tripduration", "summer","business day", "haversine distance", "is_roundtrip", "speed"]
    label = "usertype"

    data = pd.read_parquet(path,engine="pyarrow")
    X = data.drop(columns=[c for c in data.columns if c not in features])
    Y = data[label] == "Customer"

    #TOD and interaction terms
    # categorize_tod takes (data, features, add_interactions) and returns the
    # extended frame plus the names of the columns it added; starting from an
    # empty feature list yields exactly the new tod and interaction columns.
    with_tod, tod_columns = categorize_tod(data, [], add_interactions=True)
    X = pd.concat([X, with_tod[tod_columns]], axis=1, copy=False)

    #Alternative tried: encoding start station as categorical
    #topstations = top_stations(data,k=20)
    #X["topstation"] = data["start station id"].isin(topstations)

    #Try encoding station by customer count
    # NOTE(review): frequency_encode_stations comes from Utils and is not
    # visible here; presumably it returns X with station features added.
    X = frequency_encode_stations(X,data)

    #Alternative tried: gender dummies
    #dum = pd.get_dummies(data,columns=["gender"],drop_first=True,prefix="gender")

    if scaler is None:
        # MinMaxScaler or StandardScaler does not seem to matter. MinMaxScaler
        # has the advantage of preserving speed = 0 values for roundtrips.
        # Only `MinMaxScaler` is imported (not the `sklearn` package), so it
        # must be referenced by its bare name.
        scaler = MinMaxScaler().fit(X[numerical])
    X[numerical] = scaler.transform(X[numerical])
    return X,Y,scaler

In [5]:
def hour_to_coordinate(data,features):
    """
    Encode the start hour as a point on the unit circle.

    Adds two columns to `data`: "xh" = sin(2*pi*hour/24) and
    "yh" = cos(2*pi*hour/24), where hour is taken from the "starttime"
    column. Note that this modifies `data` in place.

    Parameters
    ----------
    data : DataFrame with a datetime "starttime" column.
    features : list of feature names used so far.

    Returns
    -------
    (data, features) where features is a new list extended by "xh" and "yh".
    """
    # `h` was previously undefined; derive it from the start time the same
    # way the other preprocessing helpers in this notebook do.
    h = data["starttime"].dt.hour
    angle = 2*np.pi*h/24
    data["xh"] = np.sin(angle)
    data["yh"] = np.cos(angle)
    features = features + ["xh","yh"]
    return data,features

In [15]:
def categorize_tod(data,features,add_interactions=False):
    """
    Bin the start hour into four time-of-day categories and one-hot encode them.

    The hour of "starttime" is cut into the intervals (-1, 6], (6, 10],
    (10, 15] and (15, 24], labelled 0..3 (chosen from the plots in the
    Analysis notebook). The first category is dropped, so the columns
    "tod_1", "tod_2" and "tod_3" are added.

    Parameters
    ----------
    data : DataFrame with a datetime "starttime" column (and a
        "business day" column when add_interactions is True).
    features : list of feature names used so far.
    add_interactions : when True, also add "business x tod_*" columns holding
        each dummy multiplied by "business day".

    Returns
    -------
    (data, features): a new DataFrame with the extra columns joined on, and a
    new feature list extended by the names of those columns.
    """
    bin_edges = [-1, 6, 10, 15, 24]
    bin_labels = [0, 1, 2, 3]
    start_hour = data["starttime"].dt.hour
    tod_dummies = pd.get_dummies(
        pd.cut(start_hour, bin_edges, labels=bin_labels),
        prefix="tod",
        drop_first=True,
    )

    result = data.join(tod_dummies)
    added_columns = list(tod_dummies.columns)

    if add_interactions:
        business_tod = tod_dummies.mul(result["business day"], axis=0)
        business_tod.columns = ["business x " + name for name in tod_dummies.columns]
        result = result.join(business_tod)
        added_columns = added_columns + list(business_tod.columns)

    return result, features + added_columns

Now let's try logistic regression with different features and preprocessing steps.

In [7]:
def preprocess_1(data,features):
    """
    Add the raw start hour as a single numeric feature.

    Writes the hour of "starttime" into a new "hour" column.
    Note that this modifies data inplace.

    Returns (data, features) where features is a new list with "hour" appended.
    """
    with_hour = features + ["hour"]
    data["hour"] = data["starttime"].dt.hour
    return data, with_hour

In [8]:
# Experiment 1: start hour as a plain numeric feature (preprocess_1).
pre = preprocess_1
features = [
    "tripduration",
    "summer",
    "business day",
    "haversine distance",
    "is_roundtrip",
    "speed",
]
scaler = MinMaxScaler()
features_to_scale = ["tripduration", "haversine distance", "speed"]
clf = LogisticRegression(max_iter=100)

print("Training: ")
clf, feature_names = train(
    path_train,
    clf,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=True,
)

# The same scaler object is passed again, this time with fit_scaler=False.
print("Validation: ")
validate(
    clf,
    path_val,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=False,
);

Training: 
Accuracy: 0.8941802335692596
Confusion: 
[[0.98174913 0.01825087]
 [0.8183088  0.1816912 ]]
MCC: 0.27340166228644086
Validation: 
Accuracy: 0.8942307307252905
Confusion: 
[[0.98190434 0.01809566]
 [0.81954388 0.18045612]]
MCC: 0.27256028194374954


Categorizing tod instead slightly improves the model

In [8]:
def preprocess_2(data,features):
    """
    Add one-hot time-of-day columns via categorize_tod, without interactions.

    Returns the (data, features) pair produced by categorize_tod.
    """
    return categorize_tod(data, features, add_interactions=False)

In [9]:
# Experiment 2: time of day as one-hot bins (preprocess_2).
pre = preprocess_2
features = [
    "tripduration",
    "summer",
    "business day",
    "haversine distance",
    "is_roundtrip",
    "speed",
]
scaler = MinMaxScaler()
features_to_scale = ["tripduration", "haversine distance", "speed"]
clf = LogisticRegression(max_iter=100)

print("Training: ")
clf, feature_names = train(
    path_train,
    clf,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=True,
)

# The same scaler object is passed again, this time with fit_scaler=False.
print("Validation: ")
validate(
    clf,
    path_val,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=False,
);

Training: 
Accuracy: 0.895168003048048
Confusion: 
[[0.98156348 0.01843652]
 [0.8077737  0.1922263 ]]
MCC: 0.28574764216388404
Validation: 
Accuracy: 0.8951586467528946
Confusion: 
[[0.98161271 0.01838729]
 [0.80868733 0.19131267]]
MCC: 0.2848225615211647


TODO: Add stations, add tod-business day interaction

In [13]:
def preprocess_3(data,features):
    """
    Add one-hot time-of-day columns plus their "business day" interactions.

    Returns the (data, features) pair produced by categorize_tod with
    add_interactions=True.
    """
    return categorize_tod(data, features, add_interactions=True)

In [None]:
# Experiment 3: time-of-day bins plus business-day interactions (preprocess_3).
pre = preprocess_3
features = [
    "tripduration",
    "summer",
    "business day",
    "haversine distance",
    "is_roundtrip",
    "speed",
]
scaler = MinMaxScaler()
features_to_scale = ["tripduration", "haversine distance", "speed"]
clf = LogisticRegression(max_iter=100)

print("Training: ")
clf, feature_names = train(
    path_train,
    clf,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=True,
)

# The same scaler object is passed again, this time with fit_scaler=False.
print("Validation: ")
validate(
    clf,
    path_val,
    features,
    preprocess=pre,
    scaler=scaler,
    features_to_scale=features_to_scale,
    fit_scaler=False,
);

Training: 


In [None]:
# Show the feature names returned by the most recent train() call.
print(feature_names)

Notes: By using the categorical features gender=unknown and gender=male one can easily get about 94% training accuracy, and about 98% on subscribers. Using `class_weight="balanced"` gives 79% accuracy overall, but about 82% on customers instead of the 20% obtained without balancing.

Scaling data with min-max scaler seems to have no effect
Using a categorical tod (time-of-day) encoding does not seem much different from using an ordinal encoding or the raw hours, at least if we don't use interactions.
Adding interaction terms between tod and business day does not help much.

Adding top_20_customer_start_station as a feature increases accuracy from about 89.5% to 90%.
Instead using customer counts of each start_station as a feature gives 90% training but only 89.5% validation accuracy

Frequency encoding both start and end station by both subscriber and customer counts also gives around 90% accuracy.

Adding gender to this again gives about 94% accuracy, and 54% on customers

Further ideas:
1. add interaction term between start and end station
2. different categories for tod and summer
3. Train with class weights=balanced