In [60]:
import pandas as pd

## Utility functions

In [61]:
def get_station_updates(city, station):
    df = pd.read_csv('challenge_data/{}/stations/{}.csv'.format(city, station))
    df.drop_duplicates(subset='moment', inplace=True)
    df['moment'] = pd.to_datetime(df['moment'])
    return df


def get_weather_updates(city):
    df = pd.read_csv('challenge_data/{}/weather.csv'.format(city))
    df.drop_duplicates(subset='moment', inplace=True)
    df['moment'] = pd.to_datetime(df['moment'])
    return df

## Pre-processing

In [97]:
def add_temporal_features(df):
    df['weekday'] = df['moment'].map(lambda m: m.weekday())
    df['hour'] = df['moment'].map(lambda m: m.hour)
    df['minute'] = df['moment'].map(lambda m: m.minute)
    return df


def add_last_update(df):
    df['last_moment'] = df['moment'].diff(1)
    last_bikes = df['bikes'][:-1]
    last_spaces = df['spaces'][:-1]
    # The first row doesn't have any previous update, dropping one row is fine
    df.drop(0, inplace=True)
    df['last_moment'] = df['last_moment'].apply(lambda moment: moment.seconds)
    df['last_bikes'] = last_bikes.tolist()
    return df
    

    
def add_weather_updates(city_df, city_name):
    w_df = get_weather_updates(city)
    w_times = pd.Series(wdf['moment'].values, wdf['moment'])
    w_times.sort_values(inplace=True)
    city_df['moment_weather'] = w_times.reindex(city_df['moment'], method='nearest').values
    joined_df = pd.merge(left=city_df, right=w_df, left_on='moment_weather', right_on='moment', suffixes=('', '_y'))
    joined_df.drop('moment_y', 1, inplace=True)
    return joined_df

In [102]:
city = 'toulouse'
station = '00229-iut-rangueil'

df = get_station_updates(city, station)
df = add_temporal_features(df)
df = add_last_update(df)
df = add_weather_updates(df, city)

In [103]:
features = [
    'weekday',
    'hour',
    'minute',
    'last_moment',
    'last_bikes',
    'clouds',
    'humidity',
    'pressure',
    'temperature',
    'wind'
]

target = 'bikes'

X = df[features]
y = df[target]

In [115]:
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection

clf = tree.DecisionTreeRegressor()


X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
metrics.mean_absolute_error(y_test, clf.predict(X_test))

0.33000208203206327

## Let's go HAM