## Жёлтое такси в Нью-Йорке. Финальный проект специализации Машинное обучение и анализ данных.

Задача проекта — научиться предсказывать количество поездок в ближайшие часы в каждом районе Нью-Йорка. Для обучения использовались данные с мая 2014г. по май 2016г. Прогнозы составлены на июнь 2016г.

### Часть 1. Чтение и обработка данных

#### 1. Произведем обработку и очистку данных, путем удаления поездок с :  
- нулевой длительностью  
- нулевым количеством пассажиров  
- нулевым расстоянием поездки по счётчику  
- координатами начала, не попадающими в прямоугольник Нью-Йорка

In [1]:
# Main system library importing
import os

# Main modelling libraries importing
import numpy as np
import pandas as pd
import datetime
import urllib3

# Main statistical library importing
from scipy import stats as sts

# Main visual library importing
# Standard visualize
import matplotlib.image as pltimg
from matplotlib import pylab as plt

# Dynamical visualize importing
from chart_studio import plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot

In [2]:
# Проверка количества строк в датафрейме из csv
def check_num_rows(filename, chunksize=1e6):
    num_rows = 0
    for chunk in pd.read_csv(filename, sep=',', header=0, chunksize=chunksize):
        num_rows += chunk.shape[0]
    return num_rows

In [3]:
# Очистка данных. Модифицированный датафрейм записывается в файл. Возвращается кол-во строк до и после очистки.
def clean_data(raw_filename, clean_filename, chunksize=1e6):
    
    # Удаляем очищенный файл, если он существует.
    if os.path.exists(clean_filename):
        os.remove(clean_filename)
    
    num_rows_before = 0
    num_rows_after = 0
    
    for chunk in pd.read_csv(raw_filename, sep=',', header=0, chunksize=chunksize, skipinitialspace=True,
                             parse_dates=[1, 2],
                             date_parser=lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')):
        
        num_rows_before += chunk.shape[0]
        
        # Так как tpep_pickup_datetime и tpep_dropoff_datetime в файле за 2014 не содержат "tpep"
        # Тогда их переименовываем
        try:
            pick_time_attr = 'tpep_pickup_datetime'
            drop_time_attr = 'tpep_dropoff_datetime'
            _ = chunk[pick_time_attr]
        except KeyError:
            pick_time_attr = 'pickup_datetime'
            drop_time_attr = 'dropoff_datetime'
        
        
        pickup_time = np.array([i.timestamp() for i in list(chunk[pick_time_attr])])
        dropoff_time = np.array([i.timestamp() for i in list(chunk[drop_time_attr])])
        chunk['long'] = dropoff_time - pickup_time
        
        chunk['tpep_pickup_datetime'] = chunk[pick_time_attr].apply(
            lambda x: x.replace(minute=0, second=0))
        chunk['tpep_dropoff_datetime'] = chunk[drop_time_attr].apply(
            lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
        
        chunk.drop(columns=['pickup_datetime', 'dropoff_datetime'], inplace=True, errors='ignore')
        
        # Проивзедем непосредственно очистку данных по заданным критериям: нулевая длительность,
        # Нулевое количество поездок, нулевая длина маршрута, начало маршрута вне "прямоугольника" Нью-Йорка
        chunk = chunk[(chunk['long'] > 0) &
                      (chunk.passenger_count > 0) & 
                      (chunk.trip_distance > 0) & 
                      (chunk['pickup_longitude'] <= -73.70001) & 
                      (chunk['pickup_longitude'] >= -74.25559) & 
                      (chunk['pickup_latitude'] <= 40.91553) & 
                      (chunk['pickup_latitude'] >= 40.49612)]
        
        num_rows_after += chunk.shape[0]
        
        # Запись. Если файл уже создан, записываем без заголовка.
        header = not(os.path.exists(clean_filename))
        chunk.to_csv(clean_filename, header=header, mode='a')
    
    return num_rows_before, num_rows_after

In [4]:
dir_with_raw_data = os.path.join(os.path.dirname(os.path.abspath('file')), 
                             'raw_data')
files_with_raw_data = os.listdir(dir_with_raw_data)
dir_with_clean_data = os.path.join(os.path.dirname(os.path.abspath('file')), 
                             'clean_data')

In [5]:
dir_with_raw_data

'/home/tater/ML_coursera/MIPT_YANDEX_FINAL_TAXI/raw_data'

In [6]:
files_with_raw_data.sort()
files_with_raw_data

['yellow_tripdata_2016-05.csv']

#### Проанализируем один из датафреймов с поездками на предмет содержащейся в нем информации

In [7]:
df_test = pd.read_csv(dir_with_raw_data + "/" +str(files_with_raw_data[0]))

In [8]:
df_test

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-05-01 00:00:00,2016-05-01 00:17:31,1,3.60,-73.985901,40.768040,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.50,0.0,0.3,17.80
1,2,2016-05-01 00:00:00,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.975700,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68
2,2,2016-05-01 00:00:00,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-05-01 00:00:00,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96
4,2,2016-05-01 00:00:00,2016-05-01 00:06:39,1,0.56,-74.005280,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11836848,2,2016-05-27 12:24:58,2016-05-27 12:39:38,1,0.00,0.000000,0.000000,1,N,0.000000,0.000000,2,9.5,0.5,0.5,0.00,0.0,0.3,10.80
11836849,2,2016-05-27 12:44:54,2016-05-27 12:48:17,1,0.00,0.000000,0.000000,1,N,0.000000,0.000000,2,4.0,0.5,0.5,0.00,0.0,0.3,5.30
11836850,2,2016-05-27 12:48:51,2016-05-27 13:40:56,1,0.00,0.000000,0.000000,1,N,0.000000,0.000000,2,28.5,0.5,0.5,0.00,0.0,0.3,29.80
11836851,2,2016-05-27 14:18:21,2016-05-27 14:38:13,1,0.00,0.000000,0.000000,1,N,0.000000,0.000000,2,12.0,0.5,0.5,0.00,0.0,0.3,13.30


In [10]:
%%time
# Инициализация очистки данных
CHUNKSIZE = 1e6

print('Сверка количества строк')

for file in files_with_raw_data:
    print(file + '...')
    raw_filename = os.path.join(dir_with_raw_data, file)
    clean_filename = os.path.join(dir_with_clean_data, 'cleaned_' + file)
    num_rows_before, num_row_after = clean_data(raw_filename, clean_filename, CHUNKSIZE)
    check = check_num_rows(clean_filename, CHUNKSIZE)
    
    is_cool = 'YES' if num_row_after == check else 'NO'
    num_removed = num_rows_before - num_row_after
    
    print('{0}: до-{1}, после-{2}, check-{3}, is_cool-{4}. Удалено {5} строк'.format(file,
                                                                                     num_rows_before, 
                                                                                     num_row_after, 
                                                                                     check, 
                                                                                     is_cool, 
                                                                                     num_removed))

Сверка количества строк
yellow_tripdata_2016-05.csv...
yellow_tripdata_2016-05.csv: до-11836853, после-11626521, check-11626521, is_cool-YES. Удалено 210332 строк
CPU times: user 8min 41s, sys: 11 s, total: 8min 52s
Wall time: 8min 56s


#### 2. Посчитаем количество поездок :  

In [12]:
# Получение списка количества поездок по координатам.
def get_trips(x, y):
    west = -74.25559 
    east = -73.70001
    south = 40.49612
    north = 40.91553
    counts = sts.binned_statistic_2d(x, y, None, statistic='count', bins=50, 
                        range=[[west, east], [south, north]])
    return counts.statistic.ravel()


In [13]:
# Получение списка часов в месяце.
def get_hours(date_string):
    now_date = datetime.datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
    now_date = now_date.replace(day=1, hour=0, minute=0, second=0)
    target_month = now_date.month
    now_month = now_date.month
    res_list = []
    while now_month == target_month:
        res_list.append(now_date.strftime('%Y-%m-%d %H:%M:%S'))
        now_date = now_date + datetime.timedelta(hours=1)
        now_month = now_date.month
    return res_list

In [14]:
# Считаем поездки. Модифицирует датафрейм.
def count_trips(count_data, clean_filename, chunksize=1e6):
    
    for chunk in pd.read_csv(clean_filename, sep=',', header=0, chunksize=chunksize):
        
        # Добавляем столбцы часов в датафрейм.
        if count_data.shape[1] < 10:
            hours = get_hours(chunk['tpep_pickup_datetime'][0])
            for hour in hours:
                values = np.zeros((count_data.shape[0],))
                count_data[hour] = values
        
        for hour in chunk['tpep_pickup_datetime'].unique():
            h_chunk = chunk[chunk['tpep_pickup_datetime'] == hour]
            x = np.array(h_chunk['pickup_longitude'])
            y = np.array(h_chunk['pickup_latitude'])
            count_data[hour] = np.array(count_data[hour]) + get_trips(x, y)

In [35]:
regions.shape[0]

2500

In [36]:
values_ = np.zeros((regions.shape[0],))
values_

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
df_test["tpep_pickup_datetime"][0]

'2016-05-01 00:00:00'

In [27]:
df_test["tpep_pickup_datetime"].unique()

array(['2016-05-01 00:00:00', '2016-05-01 00:00:01',
       '2016-05-01 00:00:02', ..., '2016-05-31 23:59:58',
       '2016-05-31 23:59:59', '2016-05-31 23:53:02'], dtype=object)

In [29]:
hour_1 = get_hours(df_test["tpep_pickup_datetime"][0])[0]
hour_1

'2016-05-01 00:00:00'

In [37]:
regions[hour_1] = values_
regions

Unnamed: 0,region,west,east,south,north,2016-05-01 00:00:00
0,1,-74.255590,-74.244478,40.496120,40.504508,0.0
1,2,-74.255590,-74.244478,40.504508,40.512896,0.0
2,3,-74.255590,-74.244478,40.512896,40.521285,0.0
3,4,-74.255590,-74.244478,40.521285,40.529673,0.0
4,5,-74.255590,-74.244478,40.529673,40.538061,0.0
...,...,...,...,...,...,...
2495,2496,-73.711122,-73.700010,40.873589,40.881977,0.0
2496,2497,-73.711122,-73.700010,40.881977,40.890365,0.0
2497,2498,-73.711122,-73.700010,40.890365,40.898754,0.0
2498,2499,-73.711122,-73.700010,40.898754,40.907142,0.0


In [33]:
regions_filename

'/home/tater/ML_coursera/MIPT_YANDEX_FINAL_TAXI/regions.csv'

In [32]:
regions_filename = os.path.join(os.path.dirname(os.path.abspath('file')), 'regions.csv')
regions = pd.read_csv(regions_filename, sep=';', header=0)
regions.head()

Unnamed: 0,region,west,east,south,north
0,1,-74.25559,-74.244478,40.49612,40.504508
1,2,-74.25559,-74.244478,40.504508,40.512896
2,3,-74.25559,-74.244478,40.512896,40.521285
3,4,-74.25559,-74.244478,40.521285,40.529673
4,5,-74.25559,-74.244478,40.529673,40.538061


In [14]:
clean_files = os.listdir(dir_with_clean_data)
dir_with_count_data = os.path.join(os.path.dirname(os.path.abspath('file')), 
                             'count_data')
clean_files.sort()

In [15]:
# Считаем, записываем.
CHUNKSIZE = 1e6

for file in clean_files:
    print(file+'...', end='')
    count_data = regions.copy(deep=True)
    clean_filename = os.path.join(dir_with_clean_data, file)
    count_filename = os.path.join(dir_with_count_data, 'count_' + file)
    
    #Если файл существует, удаляем его.
    if os.path.exists(count_filename):
        os.remove(count_filename)
    
    num_rows = count_trips(count_data, clean_filename, CHUNKSIZE)
    count_data.to_csv(count_filename, sep=',', header=True)
    print('DONE')

cleaned_yellow_tripdata_2016-05.csv...

KeyError: '2016-05-05'