In [1]:
%config IPCompleter.use_jedi=False
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.preprocessing
import matplotlib.pyplot as plt

from Utils import load_data, print_memory_usage

path_train = "Data/Train.csv"
path_val = "Data/Validation.csv"

# Logistic Regression

The first model I want to try is a simple logistic regression model.
Lets start with a simple test model.

## Variable Encodings

One question here is how to choose a proper encoding for some of the variables.

1. Hour of the day: One approach is convert it into a cyclic variable, and another is to use fixed "binned" time intervales like morning, midday, evening. We could also convert each hour into a category, but this does not make much sense.
2. Numerical Features should be scaled to be comparable? -> Only relevant for gradient optimization, but might want to scale to 0 mean? Also affects regularization, which is on by default -> SHOULD STANDARDIZE, YES (also mentioned in elements of statistical learning)

In [2]:
def hour_to_coordinate(h):
    xh = np.sin(2*np.pi*(h)/24)
    yh = np.cos(2*np.pi*(h)/24)
    return xh,yh

In [3]:
def categorize_tod(data,dummies=False):
    hours = data["starttime"].dt.hour 
    bins=[-1,6,10,15,24] 
    names=[0,1,2,3]
    tod = pd.cut(hours,bins,labels=names)
    tod = tod.astype("int64")
    if dummies:
        tod = pd.get_dummies(tod,prefix="tod",drop_first=True)
    return tod

In [4]:
#For now I omit information about station ids
def preprocess(data,scaler=None): #TODO: Use pipelines instead
    #Omit Gender?
    #TODO: Add standardization?
    #TODO: Add interaction between hour and business day
    numerical = ["haversine distance","tripduration","speed"]
    features=["tripduration", "summer","business day", "haversine distance", "roundtrip", "speed"]
    unused = [c for c in data.columns if c not in features]
    label="usertype"
    X = data.drop(columns=unused)
    Y = data[label].copy()
    
    coords=hour_to_coordinate(data["starttime"].dt.hour)
    #X["xhour"] = coords[0]
    #X["yhour"] = coords[1]
    #X["xhour x business day"] = X["xhour"]*X["business day"]
    #X["yhour x business day"] = X["xhour"]*X["business day"]
    tod=categorize_tod(data,dummies=True)
    X = pd.concat([X,tod],axis=1,copy=False)
    interaction = tod.mul(X["business day"],axis=0)
    interaction.columns = ["business x " + c for c in tod.columns]
    X = pd.concat([X,interaction],axis=1,copy=False)
    #X["birth year"] = data["birth year"]-data["birth year"].min() #scale to smaller integer range
    #data["gender_male"] = data["gender"] == 1
    #data["gender_unknown"] = data["gender"] == 0
    Y=pd.factorize(Y)[0]
    if not scaler:
        scaler = sklearn.preprocessing.MinMaxScaler() #Speeds are scaled badly because of strong outliers?
        scaler = scaler.fit(X[numerical])
    X[numerical] = scaler.transform(X[numerical])  
    return X,Y,scaler

In [5]:
data_train = load_data(path_train)

In [6]:
X,Y,scaler = preprocess(data_train)

In [7]:
X

Unnamed: 0,tripduration,roundtrip,haversine distance,business day,speed,summer,tod_1,tod_2,tod_3,business x tod_1,business x tod_2,business x tod_3
0,0.133649,False,0.466527,True,0.045572,True,0,1,0,0,1,0
1,0.021522,False,0.068912,True,0.037016,True,1,0,0,1,0,0
2,0.081907,False,0.239427,True,0.037574,False,0,0,1,0,0,1
3,0.014999,False,0.049990,True,0.036373,True,0,0,1,0,0,1
4,0.085866,False,0.194148,True,0.029117,True,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
13984169,0.111179,False,0.364880,True,0.042633,True,1,0,0,1,0,0
13984170,0.087371,False,0.085961,True,0.012678,True,0,1,0,0,1,0
13984171,0.031335,False,0.073236,True,0.028226,True,0,0,1,0,0,1
13984172,0.019459,False,0.021683,False,0.012698,True,0,0,1,0,0,0


In [8]:
baseline = DummyClassifier()
baseline = baseline.fit(X,Y)
acc_base = baseline.score(X,Y)
acc_base

0.8905606437677335

In [9]:
clf = LogisticRegression(max_iter=300) #Might make sense to use balanced class weights here
clf=clf.fit(X,Y)
acc_train = clf.score(X,Y)
acc_train

0.8954962945970208

In [10]:
Y_pred = clf.predict(X)
confusion_matrix(Y,Y_pred,normalize="true")

array([[0.98216546, 0.01783454],
       [0.80977236, 0.19022764]])

In [22]:
coefficients = pd.Series([clf.intercept_[0]]+list(clf.coef_[0]),index=["intercept"]+list(X.columns))
print(coefficients)

intercept              -0.233645
tripduration            2.952737
roundtrip              -1.027844
haversine distance      7.047617
business day           -0.689343
speed                -105.875782
summer                  0.634162
tod_1                   0.129662
tod_2                   0.461759
tod_3                   0.320546
business x tod_1       -0.489299
business x tod_2        0.162279
business x tod_3       -0.291085
dtype: float64


Notes: By using gender=unknown, gender=male categorical features one can easily get about 94% training accuracy, and about 98% on subscribers. Using classes = balanced gives 79% accuracy overall, but like 82% on customers instead of 20% for unbalanced.

Scaling data with min-max scaler seems to have no effect
Using categorical tod encoding does not seem much different from using ordinal encoding or hours, at least if we dont use interactions.
Adding interaction terms between tod and business day does not help much.

In [23]:
data_val = load_data(path_val)
X_val, Y_val,scaler = preprocess(data_val,scaler=scaler)

In [24]:
clf.score(X_val,Y_val)

0.8949743738776593

In [25]:
Y_pred = clf.predict(X_val)
confusion_matrix(Y_val,Y_pred,normalize="true")

array([[0.98200261, 0.01799739],
       [0.81002535, 0.18997465]])

In [26]:
baseline.score(X_val,Y_val)

0.8901197411243915