In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import gmaps
import datetime
import lightgbm
from sklearn.model_selection import cross_val_score,TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
%matplotlib inline

In [2]:
train = pd.read_csv('yellow_tripdata_2015.csv')

In [3]:
train.shape

(69006887, 4)

In [4]:
#удалим строки без значений
train.dropna(inplace=True)

# Начнем с определения диапазонов координат с наибольшим количеством заказов

In [5]:
bins_long = [x for x in range(-180,180,10)]
bins_lat= [x for x in range(-90,90,10)]
label_long = [x for x in range(len(bins_long)-1)]
label_lat = [x for x in range(len(bins_lat)-1)] 

In [6]:
train['lon_range'] = pd.cut(train['pickup_longitude'],bins=bins_long)
train['lat_range'] = pd.cut(train['pickup_latitude'],bins=bins_lat)
train['cat_lon'] = pd.cut(train['pickup_longitude'],bins=bins_long,labels=label_long)
train['cat_lat'] = pd.cut(train['pickup_latitude'],bins=bins_lat,labels=label_lat)

In [7]:
train.head()

Unnamed: 0.1,Unnamed: 0,tpep_pickup_datetime,pickup_longitude,pickup_latitude,lon_range,lat_range,cat_lon,cat_lat
0,0,2015-07-01 00:00:00,-73.994156,40.751125,"(-80, -70]","(40, 50]",10,13
1,1,2015-07-01 00:00:00,-73.984657,40.768486,"(-80, -70]","(40, 50]",10,13
2,2,2015-07-01 00:00:00,-73.978889,40.762287,"(-80, -70]","(40, 50]",10,13
3,3,2015-07-01 00:00:00,-73.99279,40.74276,"(-80, -70]","(40, 50]",10,13
4,4,2015-07-01 00:00:00,-73.91243,40.76981,"(-80, -70]","(40, 50]",10,13


In [8]:
train.groupby(by=['lon_range','lat_range'])['pickup_latitude'].count().sort_values(ascending=False).head(5)

lon_range     lat_range
(-80, -70]    (40, 50]     68032539
(-10, 0]      (-10, 0]       973246
(-80, -70]    (30, 40]          347
(-130, -120]  (30, 40]          167
(-70, -60]    (40, 50]          113
Name: pickup_latitude, dtype: int64

# Наибольшее количество в longitude (-80, -70] latitude (40, 50]

# Разобъем найденный диапазон еще на несколько бинов

In [5]:
bins_long = [x for x in range(-80,-70,1)]
bins_lat= [x for x in range(40,50,1)]
label_long = [x for x in range(len(bins_long)-1)]
label_lat = [x for x in range(len(bins_lat)-1)] 

In [6]:
train['lon_range'] = pd.cut(train['pickup_longitude'],bins=bins_long)
train['lat_range'] = pd.cut(train['pickup_latitude'],bins=bins_lat)
train['cat_lon'] = pd.cut(train['pickup_longitude'],bins=bins_long,labels=label_long)
train['cat_lat'] = pd.cut(train['pickup_latitude'],bins=bins_lat,labels=label_lat)

In [11]:
train.groupby(by=['lon_range','lat_range'])['pickup_latitude'].count().sort_values(ascending=False).head(5)

lon_range   lat_range
(-74, -73]  (40, 41]     58870535
(-75, -74]  (40, 41]      9158453
(-74, -73]  (41, 42]         2626
(-75, -74]  (41, 42]          496
(-73, -72]  (41, 42]           53
Name: pickup_latitude, dtype: int64

# Наибольшее количество из координат longitude  (-74, -73] latitude (40, 41]
## Оставим только данные объекты

In [7]:
train.drop(train[train['cat_lon']!=6].index,inplace=True)

In [8]:
train.drop(train[train['cat_lat']!=0].index,inplace=True)

In [9]:
train.shape

(58870535, 8)

# Возьмем сэмпл из мейна и построим Heatmap

In [10]:
gmaps.configure(api_key="....")

locations = train.sample(50000)[['pickup_latitude','pickup_longitude']]

fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(locations))
fig

Figure(layout=FigureLayout(height='420px'))

# Заказы поступают из Нью-Йорка, остров Манхэттен и двух близлежайших аэропортов

# Отсечем по долготе объекты, относящиеся к двум аэропортам

In [11]:
train.shape

(58870535, 8)

In [12]:
train.drop(train[train['pickup_longitude']>-73.930157].index,inplace=True)

In [13]:
gmaps.configure(api_key="....")
locations = train.sample(5000)[['pickup_latitude','pickup_longitude']]

fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(locations))
fig

Figure(layout=FigureLayout(height='420px'))

In [14]:
train.drop(['lon_range','lat_range','cat_lon','cat_lat'],axis=1,inplace=True)

In [15]:
import gc

In [16]:
gc.collect()

47

In [17]:
train.head()

Unnamed: 0.1,Unnamed: 0,tpep_pickup_datetime,pickup_longitude,pickup_latitude
0,0,2015-07-01 00:00:00,-73.994156,40.751125
1,1,2015-07-01 00:00:00,-73.984657,40.768486
2,2,2015-07-01 00:00:00,-73.978889,40.762287
3,3,2015-07-01 00:00:00,-73.99279,40.74276
5,5,2015-07-01 00:00:00,-73.95916,40.77343


# Сгруппируем заказы по часам

In [21]:
train['tpep_pickup_datetime'] = pd.to_datetime(train['tpep_pickup_datetime'])

In [22]:
# Сформируем признак номера месяца
train['month'] = train['tpep_pickup_datetime'].apply(lambda x:x.month)

# Разобъем датафрейм на части по месяцам, сагрегируем и получим признаки с количество заказов в текущем часу.

In [23]:
df_arrgerated = pd.DataFrame(data={'Date':[], 'Mean_Latitude':[],'Mean_Longitude':[],'Count':[]})

In [24]:
monthes = [7,8,9,10,11,12]
df_arrgerated = pd.DataFrame()
for month in monthes:
    df = train[train['month']== month]
    grouped = df.groupby(df['tpep_pickup_datetime'].apply(lambda x:(x.month,x.day,x.hour)))
    dates = grouped.count().index.values
    Mean_Latitudes = grouped.mean()['pickup_longitude'].values
    Mean_Longitude = grouped.mean()['pickup_latitude'].values
    y = grouped.count()['pickup_longitude'].values
    df_to_add = pd.DataFrame(data = {'Date':dates, 'Mean_Latitude':Mean_Latitudes,'Mean_Longitude':Mean_Longitude,'Count':y})
    df_arrgerated = df_arrgerated.append(df_to_add,ignore_index=True)
    print(grouped.count().shape)
    del grouped,df

(744, 5)
(744, 5)
(720, 5)
(744, 5)
(720, 5)
(744, 5)


In [25]:
del train

In [27]:
import gc
gc.collect()

343

In [28]:
#Получившийся датафрейм
df_arrgerated.head()

Unnamed: 0,Count,Date,Mean_Latitude,Mean_Longitude
0,10246,"(7, 1, 0)",-73.979891,40.750607
1,5684,"(7, 1, 1)",-73.979766,40.748639
2,3512,"(7, 1, 2)",-73.979884,40.747874
3,2249,"(7, 1, 3)",-73.979295,40.747865
4,2428,"(7, 1, 4)",-73.977096,40.753004


# Сместим Label, так как по задаче нужно предсказать на час вперед

In [29]:
y = df_arrgerated['Count'].values[1:]
y= np.append(y,0)
df_arrgerated['Count']= y
df_arrgerated = df_arrgerated.iloc[:-1]

# Сформируем признаки

In [30]:
def costime(value, period=24):
    value *= 2 * np.pi / period
    return np.cos(value)
def sintime(value, period=24):
    value *= 2 * np.pi / period
    return np.sin(value)

In [33]:
def clear(x):
    x = str(x)
    x = x[1:-1]
    x = x.split(',')
    return x

In [34]:
df_arrgerated['Date'] = df_arrgerated['Date'].map(clear)

In [35]:
df_arrgerated['Month'] = df_arrgerated['Date'].apply(lambda x:x[0]).astype(int)
df_arrgerated['Day'] = df_arrgerated['Date'].apply(lambda x:x[1]).astype(int)
df_arrgerated['Hour'] = df_arrgerated['Date'].apply(lambda x:x[2]).astype(int)

In [36]:
df_arrgerated['Hour_cos'] = df_arrgerated['Hour'].apply(costime)
df_arrgerated['Hour_sin'] = df_arrgerated['Hour'].apply(sintime)

In [37]:
def is_weekend(x):
    a = datetime.datetime(2015, int(x[0]),int(x[1]))
    a = 0 if a.weekday()<5 else 1
    return a
def weekday(x):
    a = datetime.datetime(2015, int(x[0]),int(x[1]))
    return a.weekday()

In [38]:
df_arrgerated['is_weekend'] = df_arrgerated['Date'].apply(is_weekend)

In [39]:
df_arrgerated['weekday'] = df_arrgerated['Date'].apply(weekday)

In [40]:
y = y[:-1]

In [41]:
df_arrgerated.drop(['Count','Date'],axis=1,inplace=True)

In [42]:
df_arrgerated.head()

Unnamed: 0,Mean_Latitude,Mean_Longitude,Month,Day,Hour,Hour_cos,Hour_sin,is_weekend,weekday
0,-73.979891,40.750607,7,1,0,1.0,0.0,0,2
1,-73.979766,40.748639,7,1,1,0.965926,0.258819,0,2
2,-73.979884,40.747874,7,1,2,0.866025,0.5,0,2
3,-73.979295,40.747865,7,1,3,0.707107,0.707107,0,2
4,-73.977096,40.753004,7,1,4,0.5,0.866025,0,2


# Разделим на train/test 70/30

In [43]:
indx = int(df_arrgerated.shape[0] * 0.7)
X_train, y_train, X_test, y_test = df_arrgerated.iloc[:indx].values, y[:indx], df_arrgerated.iloc[indx:].values, y[indx:]
(X_train.shape[0], y_train.shape[0], X_test.shape[0], y_test.shape[0])

(3090, 3090, 1325, 1325)

In [44]:
model = lightgbm.LGBMRegressor()

In [45]:
tssplit = TimeSeriesSplit(n_splits=5)

In [46]:
results = cross_val_score(model,X_train,y_train,cv=tssplit,scoring='neg_mean_absolute_error')
print(np.mean(results),np.std(results))

-1107.1406775210867 202.23446303946923


In [47]:
model.fit(X_train,y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [48]:
y_pred = model.predict(X_test)

In [49]:
print("Средняя абсолютная ошибка от среднего таргета в тесте =  {:.0%}".format(mean_absolute_error(y_test,y_pred) / np.mean(y_test)))

Средняя абсолютная ошибка от среднего таргета в тесте =  12%
