In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from tqdm import tqdm
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os
import random
from meteostat import Point, Daily

In [None]:
from meteostat import Point, Daily
import pandas as pd

def get_weather_features(df, lat_col='latitude', lon_col='longitude', time_col='time'):
    """
    Добавляет погодные данные из Meteostat в DataFrame.
    
    Параметры:
    - df: исходный DataFrame
    - lat_col: колонка с широтой
    - lon_col: колонка с долготой
    - time_col: колонка с временем (timestamp)
    
    Возвращает:
    - DataFrame с добавленными погодными признаками.
    """
    def fetch_weather_data(row):
        location = Point(row[lat_col], row[lon_col])
        date = pd.to_datetime(row[time_col]).date()
        
        # Запрашиваем данные о погоде на день
        weather = Daily(location, start=date, end=date)
        weather_data = weather.fetch()
        
        if not weather_data.empty:
            return weather_data.iloc[0].to_dict()
        else:
            return {
                'tavg': None, 'tmin': None, 'tmax': None, 'prcp': None, 'snow': None,
                'wdir': None, 'wspd': None, 'wpgt': None, 'pres': None, 'tsun': None,
                'rhum': None
            }
    
    # Применяем функцию к каждой строке DataFrame
    weather_df = df.apply(fetch_weather_data, axis=1, result_type='expand')
    
    # Объединяем исходный DataFrame с погодными признаками
    return pd.concat([df, weather_df], axis=1)


In [None]:
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

def get_timeseries_features(df, time_col='time', value_col='value'):
    """
    Генерирует фичи для временных рядов с использованием tsfresh.
    
    Параметры:
    - df: DataFrame с временным рядом
    - time_col: колонка с временем (timestamp)
    - value_col: колонка с значениями временного ряда
    
    Возвращает:
    - DataFrame с извлечёнными фичами.
    """
    # Подготавливаем данные для tsfresh
    df = df.rename(columns={time_col: "time", value_col: "value"})
    df['id'] = 1  # Добавляем колонку id, необходимую для tsfresh
    
    # Извлекаем фичи
    features = extract_features(df, column_id="id", column_sort="time")
    
    # Импутируем отсутствующие значения
    features = impute(features)
    
    return features


In [None]:
import numpy as np

def get_geospatial_features(df, lat_col='latitude', lon_col='longitude', reference_points=None):
    """
    Генерирует географические фичи для данных с координатами.
    
    Параметры:
    - df: DataFrame с координатами
    - lat_col: колонка с широтой
    - lon_col: колонка с долготой
    - reference_points: словарь с точками для вычисления расстояний
    
    Возвращает:
    - DataFrame с добавленными географическими признаками.
    """
    def haversine(lat1, lon1, lat2, lon2):
        """Вычисляет расстояние между двумя точками на поверхности Земли в метрах."""
        R = 6371000
        phi1, phi2 = np.radians(lat1), np.radians(lat2)
        d_phi = phi2 - phi1
        d_lambda = np.radians(lon2 - lon1)
        a = np.sin(d_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lambda / 2) ** 2
        return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    # Расстояния до заданных точек
    if reference_points:
        for name, (ref_lat, ref_lon) in reference_points.items():
            df[f'dist_to_{name}'] = df.apply(
                lambda row: haversine(row[lat_col], row[lon_col], ref_lat, ref_lon), axis=1
            )
    
    # Признаки широты и долготы (синус и косинус)
    df['lat_cos'] = np.cos(np.radians(df[lat_col]))
    df['lat_sin'] = np.sin(np.radians(df[lat_col]))
    df['lon_cos'] = np.cos(np.radians(df[lon_col]))
    df['lon_sin'] = np.sin(np.radians(df[lon_col]))
    
    return df


In [None]:
import pandas as pd

def create_lag_features(df, value_col='value', time_col='time', max_lag=10, dop_col=None):
    """
    Генерирует лаги для временных рядов. Лаги создаются для всех сдвигов от 1 до max_lag.
    
    Параметры:
    - df: исходный DataFrame
    - value_col: колонка с значениями временного ряда
    - time_col: колонка с временными метками
    - max_lag: максимальный лаг для создания сдвигов
    - dop_col: дополнительная колонка для фич (например, категориальные или числовые)
    
    Возвращает:
    - DataFrame с добавленными лагами.
    """
    
    # Сортируем по времени
    df = df.sort_values(by=time_col)
    
    # Генерация лагов для значений (value_col)
    for lag in range(1, max_lag + 1):
        df[f'{value_col}_lag_{lag}'] = df[value_col].shift(lag)
    
    # Генерация лагов для дополнительной колонки, если она есть
    if dop_col:
        for lag in range(1, max_lag + 1):
            df[f'{dop_col}_lag_{lag}'] = df[dop_col].shift(lag)
    
    return df

In [27]:
def set_all_seeds(seed=22):
    # python's seeds
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    
set_all_seeds()
seed=22

In [28]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Вычисляет расстояние между двумя точками на поверхности Земли в метрах, используя формулу гаверсинуса.
    """
    R = 6371000  # радиус Земли в метрах
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    d_phi = phi2 - phi1
    d_lambda = np.radians(lon2 - lon1)

    a = np.sin(d_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lambda / 2) ** 2
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

def calculate_bearing(lat1, lon1, lat2, lon2):
    """
    Вычисляет азимут между двумя точками (в градусах от северного направления).
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    d_lambda = np.radians(lon2 - lon1)
    x = np.sin(d_lambda) * np.cos(phi2)
    y = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(d_lambda)
    return (np.degrees(np.arctan2(x, y)) + 360) % 360

def transform_geospatial_features(df, lat_col='latitude', lon_col='longitude',
                                  reference_points=None, competitor_coords=None, 
                                  grid_size=0.1, n_clusters=5, eps=0.01, min_samples=5, prefix='start', seed=42):
    """
    Преобразует географические данные с добавлением новых признаков.
    """
    
    # 1. Признаки расстояния
    if reference_points:
        for name, ref_point in reference_points.items():
            df[f'dist_to_{name}_' + prefix] = df.apply(
                lambda row: haversine(row[lat_col], row[lon_col], ref_point[0], ref_point[1]), axis=1
            )

    if competitor_coords:
        df['dist_to_nearest_competitor_' + prefix] = df.apply(
            lambda row: min(haversine(row[lat_col], row[lon_col], comp[0], comp[1]) for comp in competitor_coords), axis=1
        )
    print(1)
    # 2. Кластеризация
    #coords = df[[lat_col, lon_col]].to_numpy()
    #kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    #df['kmeans_cluster_' + prefix] = kmeans.fit_predict(coords)

    #dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    #df['dbscan_cluster_' + prefix] = dbscan.fit_predict(coords)
    #print(2)
    # 3. Признаки сетки
    #df['grid_cell_' + prefix] = df.apply(
    #    lambda row: (int(row[lat_col] // grid_size), int(row[lon_col] // grid_size)), axis=1
    #)

    # 4. Признаки направления
    if reference_points:
        first_ref_point = list(reference_points.values())[0]
        df['azimuth_to_first_reference_' + prefix] = df.apply(
            lambda row: calculate_bearing(row[lat_col], row[lon_col], first_ref_point[0], first_ref_point[1]), axis=1
        )
    print(3)
    # Векторные признаки
    df['lat_cos_' + prefix] = np.cos(np.radians(df[lat_col]))
    df['lat_sin_' + prefix] = np.sin(np.radians(df[lat_col]))
    df['lon_cos_' + prefix] = np.cos(np.radians(df[lon_col]))
    df['lon_sin_' + prefix] = np.sin(np.radians(df[lon_col]))

    # Полярные координаты
    df['polar_lat_' + prefix] = np.radians(df[lat_col])
    df['polar_lon_' + prefix] = np.radians(df[lon_col])
    print(4)
    return df


def calculate_point_to_point_features(df, start_lat='start_latitude', start_lon='start_longitude', 
                                      end_lat='end_latitude', end_lon='end_longitude'):
    """
    Рассчитывает пространственные метрики между начальной и конечной точками.
    """
    
    df['distance'] = df.apply(
        lambda row: haversine(row[start_lat], row[start_lon], row[end_lat], row[end_lon]), axis=1
    )

    df['bearing'] = df.apply(
        lambda row: calculate_bearing(row[start_lat], row[start_lon], row[end_lat], row[end_lon]), axis=1
    )

    # Разница широты и долготы
    df['delta_lat'] = df[end_lat] - df[start_lat]
    df['delta_lon'] = df[end_lon] - df[start_lon]
    
    #return df


In [29]:
def create_many_time_features(df, dop_col):
    df = df[['datetime', dop_col]].copy()
    df['datetime'] = pd.to_datetime(df['datetime'])

    # 1. Базовые признаки даты и времени
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['day_of_week'] = df['datetime'].dt.dayofweek  # 0 = Monday, 6 = Sunday
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df['second'] = df['datetime'].dt.second

    # 2. Бинарные признаки
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # 1, если суббота или воскресенье
    # Допустим, список праздников (потребуются данные по регионам для точности)
    holidays = ['2023-01-01', '2023-02-05']  # примерный список праздников
    df['is_holiday'] = df['datetime'].dt.date.astype(str).isin(holidays).astype(int)
    # Утренние и вечерние часы (например, с 7:00 до 10:00 и с 17:00 до 19:00) как час пик
    df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

    # 3. Сезонные признаки
    # Определим сезоны: зима = 12, 1, 2; весна = 3, 4, 5; лето = 6, 7, 8; осень = 9, 10, 11
    df['season'] = df['month'] % 12 // 3 + 1  # 1=зима, 2=весна, 3=лето, 4=осень
    df['quarter'] = df['datetime'].dt.quarter

    # 4. Циклические признаки
    # Циклическое представление часа, месяца и дня недели
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # 5. Признаки временного интервала
    # Время с начала года
    df['days_since_start_of_year'] = (
    df['datetime'] - pd.to_datetime(df['datetime'].dt.year.astype(str) + '-01-01').dt.tz_localize('UTC')).dt.days
    # Время с последнего события (например, с предыдущей строки)
    df['time_since_last_event'] = df['datetime'].diff().dt.total_seconds().fillna(0)

    # 6. Скользящие и агрегированные статистики
    # Предположим, что у нас есть значение target для каждого временного ряда
    df['rolling_mean_2'] = df[dop_col].rolling(window=2).mean()
    df['rolling_sum_2'] = df[dop_col].rolling(window=2).sum()
    df['rolling_max_2'] = df[dop_col].rolling(window=2).max()
    df['rolling_min_2'] = df[dop_col].rolling(window=2).min()

    # 7. Тренд и сезонность
    # Декомпозиция тренда и сезонности требует дополнительных библиотек, таких как statsmodels или pandas
    # Примерный код на основе STL доступен с statsmodels

    # 8. Показатель активности
    # Количество событий за последнюю неделю (например, из предыдущих данных)
    df['events_last_week'] = df['datetime'].rolling(7).count()
    # Среднее время между событиями
    df['mean_time_between_events'] = df['time_since_last_event'].rolling(window=2).mean()
    df = df.drop(columns=['datetime', dop_col])
    df = df.fillna(0)
    return df


In [30]:
reference_points = {
    'nyc': (40.724944, -74.001541),
    'jfk': (40.645494, -73.785937),
    'lga': (40.774071, -73.872067),
    'nla': (40.690764, -74.177721),
}

In [None]:
df = pd.DataFrame()
c = 1
for ch in tqdm(pd.read_csv(r'new-york-city-taxi-fare-prediction\train.csv', chunksize=10000000)):
    if not c:
        break
    c -= 1
    df = pd.concat([df, ch])
    

0it [00:00, ?it/s]

1it [00:00,  2.98it/s]


In [32]:
df = df.sort_values('pickup_datetime', ignore_index=True)

In [33]:
df = df.rename(columns={'pickup_datetime': 'datetime'})

In [34]:
df

Unnamed: 0,key,fare_amount,datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:41:00.0000003,6.20,2009-01-01 00:41:00 UTC,-73.994202,40.751008,-73.985817,40.750577,2
1,2009-01-01 01:21:54.0000002,4.90,2009-01-01 01:21:54 UTC,-73.951810,40.714897,-73.962162,40.709354,4
2,2009-01-01 01:31:49.0000003,8.60,2009-01-01 01:31:49 UTC,-73.994192,40.720077,-73.993356,40.742642,1
3,2009-01-01 02:05:37.0000004,11.00,2009-01-01 02:05:37 UTC,-73.978433,40.744781,-74.004713,40.734328,1
4,2009-01-01 02:07:49.0000001,17.80,2009-01-01 02:07:49 UTC,-73.984291,40.667851,-74.006015,40.735481,3
...,...,...,...,...,...,...,...,...
99995,2015-06-30 21:54:22.0000004,18.00,2015-06-30 21:54:22 UTC,-73.993111,40.768154,-73.943871,40.837780,1
99996,2015-06-30 22:14:07.00000014,57.54,2015-06-30 22:14:07 UTC,-73.776703,40.645302,-73.994560,40.745377,5
99997,2015-06-30 22:27:27.0000003,20.50,2015-06-30 22:27:27 UTC,-73.885849,40.773113,-73.956100,40.717365,1
99998,2015-06-30 22:42:39.00000014,9.00,2015-06-30 22:42:39 UTC,-73.997223,40.726227,-73.978622,40.734432,1


In [35]:
tm = create_many_time_features(df, dop_col='passenger_count')

In [36]:
tm

Unnamed: 0,year,month,day,day_of_week,hour,minute,second,is_weekend,is_holiday,is_rush_hour,...,month_sin,month_cos,days_since_start_of_year,time_since_last_event,rolling_mean_2,rolling_sum_2,rolling_max_2,rolling_min_2,events_last_week,mean_time_between_events
0,2009,1,1,3,0,41,0,0,0,0,...,5.000000e-01,0.866025,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2009,1,1,3,1,21,54,0,0,0,...,5.000000e-01,0.866025,0,2454.0,3.0,6.0,4.0,2.0,0.0,1227.0
2,2009,1,1,3,1,31,49,0,0,0,...,5.000000e-01,0.866025,0,595.0,2.5,5.0,4.0,1.0,0.0,1524.5
3,2009,1,1,3,2,5,37,0,0,0,...,5.000000e-01,0.866025,0,2028.0,1.0,2.0,1.0,1.0,0.0,1311.5
4,2009,1,1,3,2,7,49,0,0,0,...,5.000000e-01,0.866025,0,132.0,2.0,4.0,3.0,1.0,0.0,1080.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2015,6,30,1,21,54,22,0,0,0,...,1.224647e-16,-1.000000,180,894.0,1.0,2.0,1.0,1.0,7.0,1125.5
99996,2015,6,30,1,22,14,7,0,0,0,...,1.224647e-16,-1.000000,180,1185.0,3.0,6.0,5.0,1.0,7.0,1039.5
99997,2015,6,30,1,22,27,27,0,0,0,...,1.224647e-16,-1.000000,180,800.0,3.0,6.0,5.0,1.0,7.0,992.5
99998,2015,6,30,1,22,42,39,0,0,0,...,1.224647e-16,-1.000000,180,912.0,1.0,2.0,1.0,1.0,7.0,856.0


In [37]:
start_longitude = 'pickup_longitude'
start_latitude = 'pickup_latitude'
end_latitude = 'dropoff_latitude'
end_longitude = 'dropoff_longitude'
coords_cols = [start_latitude, start_longitude, end_longitude, end_latitude]

In [38]:
for x in coords_cols:
    df = df[(-90 <= df[x]) & (df[x] <= 90)]

In [39]:
df

Unnamed: 0,key,fare_amount,datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:41:00.0000003,6.20,2009-01-01 00:41:00 UTC,-73.994202,40.751008,-73.985817,40.750577,2
1,2009-01-01 01:21:54.0000002,4.90,2009-01-01 01:21:54 UTC,-73.951810,40.714897,-73.962162,40.709354,4
2,2009-01-01 01:31:49.0000003,8.60,2009-01-01 01:31:49 UTC,-73.994192,40.720077,-73.993356,40.742642,1
3,2009-01-01 02:05:37.0000004,11.00,2009-01-01 02:05:37 UTC,-73.978433,40.744781,-74.004713,40.734328,1
4,2009-01-01 02:07:49.0000001,17.80,2009-01-01 02:07:49 UTC,-73.984291,40.667851,-74.006015,40.735481,3
...,...,...,...,...,...,...,...,...
99995,2015-06-30 21:54:22.0000004,18.00,2015-06-30 21:54:22 UTC,-73.993111,40.768154,-73.943871,40.837780,1
99996,2015-06-30 22:14:07.00000014,57.54,2015-06-30 22:14:07 UTC,-73.776703,40.645302,-73.994560,40.745377,5
99997,2015-06-30 22:27:27.0000003,20.50,2015-06-30 22:27:27 UTC,-73.885849,40.773113,-73.956100,40.717365,1
99998,2015-06-30 22:42:39.00000014,9.00,2015-06-30 22:42:39 UTC,-73.997223,40.726227,-73.978622,40.734432,1


In [40]:
gdf_start = transform_geospatial_features(df, start_latitude, start_longitude, reference_points)

1
3
4


In [41]:
calculate_point_to_point_features(gdf_start, start_lat=start_latitude, start_lon=start_longitude, end_lat=end_latitude, end_lon=end_longitude)

In [42]:
gdf_fin = transform_geospatial_features(gdf_start, end_latitude, end_longitude, reference_points, prefix='end')

1
3
4


In [43]:
df_fin = gdf_fin.drop(columns=['key', 'datetime'])

In [44]:
final = df_fin.join(tm)

In [45]:
final = final.replace([np.inf, -np.inf], np.nan).dropna()

In [46]:
len(final)

99997

In [47]:
len(df)

99997

In [48]:
X = final.drop(columns=['fare_amount'])
y = final['fare_amount']

In [49]:
y.isnull().sum()

0

In [50]:
params = {
    "n_estimators": 5000,
    "learning_rate": 0.05,
    "depth": 3,
    "use_best_model": True,
    "border_count": 64,
    "l2_leaf_reg": 1,
    "bagging_temperature": 2,
    "rsm": 0.5,
    "random_state": 22,
}

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
train_dataset = catboost.Pool(X_train, y_train)
test_dataset = catboost.Pool(X_test, y_test)

In [52]:
model_class = catboost.CatBoostRegressor(use_best_model=True, random_state=seed)

In [53]:
model_class.fit(train_dataset, eval_set=test_dataset)

Learning rate set to 0.098534
0:	learn: 9.1695916	test: 8.9490984	best: 8.9490984 (0)	total: 156ms	remaining: 2m 35s
1:	learn: 8.6212387	test: 8.3931665	best: 8.3931665 (1)	total: 166ms	remaining: 1m 22s
2:	learn: 8.1296549	test: 7.8940612	best: 7.8940612 (2)	total: 177ms	remaining: 58.7s
3:	learn: 7.6998232	test: 7.4564428	best: 7.4564428 (3)	total: 184ms	remaining: 45.9s
4:	learn: 7.3327276	test: 7.0755205	best: 7.0755205 (4)	total: 199ms	remaining: 39.6s
5:	learn: 6.9987743	test: 6.7372295	best: 6.7372295 (5)	total: 210ms	remaining: 34.8s
6:	learn: 6.7107487	test: 6.4464851	best: 6.4464851 (6)	total: 221ms	remaining: 31.3s
7:	learn: 6.4378722	test: 6.1743171	best: 6.1743171 (7)	total: 229ms	remaining: 28.4s
8:	learn: 6.2067003	test: 5.9454284	best: 5.9454284 (8)	total: 240ms	remaining: 26.4s
9:	learn: 6.0174193	test: 5.7570447	best: 5.7570447 (9)	total: 248ms	remaining: 24.6s
10:	learn: 5.8479229	test: 5.5864148	best: 5.5864148 (10)	total: 258ms	remaining: 23.2s
11:	learn: 5.7082454

<catboost.core.CatBoostRegressor at 0x12db6161190>

In [54]:
root_mean_squared_error(y_test, model_class.predict(X_test))

4.037074077146748

In [55]:
params = {
    "n_estimators": 5000,
    "learning_rate": 0.05,
    "depth": 3,
    "border_count": 64,
    "l2_leaf_reg": 1,
    "bagging_temperature": 2,
    "rsm": 0.5,
    "random_state": 22,
}

In [56]:
model_class = catboost.CatBoostRegressor(random_state=seed)

In [57]:
model_class.fit(X, y)

Learning rate set to 0.084757
0:	learn: 9.1708633	total: 15.4ms	remaining: 15.4s
1:	learn: 8.6828840	total: 28.9ms	remaining: 14.4s
2:	learn: 8.2450691	total: 41.4ms	remaining: 13.8s
3:	learn: 7.8537704	total: 54.9ms	remaining: 13.7s
4:	learn: 7.4924450	total: 68.1ms	remaining: 13.6s
5:	learn: 7.1773240	total: 82.2ms	remaining: 13.6s
6:	learn: 6.8959696	total: 95.8ms	remaining: 13.6s
7:	learn: 6.6383855	total: 106ms	remaining: 13.2s
8:	learn: 6.4027875	total: 120ms	remaining: 13.2s
9:	learn: 6.1979141	total: 134ms	remaining: 13.3s
10:	learn: 6.0197570	total: 153ms	remaining: 13.8s
11:	learn: 5.8673794	total: 169ms	remaining: 13.9s
12:	learn: 5.7249334	total: 184ms	remaining: 14s
13:	learn: 5.5993723	total: 204ms	remaining: 14.4s
14:	learn: 5.4929988	total: 229ms	remaining: 15s
15:	learn: 5.3990630	total: 243ms	remaining: 14.9s
16:	learn: 5.3103801	total: 256ms	remaining: 14.8s
17:	learn: 5.2324602	total: 272ms	remaining: 14.8s
18:	learn: 5.1638917	total: 285ms	remaining: 14.7s
19:	lear

<catboost.core.CatBoostRegressor at 0x12ebd354d10>

In [58]:
test = pd.read_csv(r'new-york-city-taxi-fare-prediction\test.csv')

In [59]:
test = test.sort_values('pickup_datetime')
test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
663,2009-01-01 11:04:24.0000002,2009-01-01 11:04:24 UTC,-73.990863,40.733931,-73.981629,40.728795,1
7086,2009-01-01 11:04:24.0000001,2009-01-01 11:04:24 UTC,-73.995485,40.746596,-73.982549,40.735576,2
7085,2009-01-01 11:04:24.0000003,2009-01-01 11:04:24 UTC,-73.981414,40.771064,-73.997780,40.736843,2
7084,2009-01-02 17:45:40.0000003,2009-01-02 17:45:40 UTC,-73.992696,40.749669,-73.993585,40.758911,2
7083,2009-01-02 17:45:40.0000002,2009-01-02 17:45:40 UTC,-73.999578,40.733475,-73.987156,40.750676,2
...,...,...,...,...,...,...,...
5117,2015-06-30 20:03:50.0000004,2015-06-30 20:03:50 UTC,-73.987526,40.744125,-73.976875,40.752121,1
9904,2015-06-30 20:03:50.0000002,2015-06-30 20:03:50 UTC,-73.776848,40.645035,-73.955460,40.652458,6
5116,2015-06-30 20:03:50.0000003,2015-06-30 20:03:50 UTC,-73.954910,40.777264,-73.971001,40.800117,1
5115,2015-06-30 20:03:50.0000001,2015-06-30 20:03:50 UTC,-73.864067,40.769894,-74.176849,40.694592,1


In [60]:
test = transform_geospatial_features(test, start_latitude, start_longitude, reference_points)

1
3
4


In [61]:
calculate_point_to_point_features(test, start_lat=start_latitude, start_lon=start_longitude, end_lat=end_latitude, end_lon=end_longitude)

In [62]:
test = transform_geospatial_features(test, end_latitude, end_longitude, reference_points, prefix='end')

1
3
4


In [63]:
test = test.rename(columns={'pickup_datetime': 'datetime'})
tm_test = create_many_time_features(test, dop_col='passenger_count')

In [64]:
test = test.drop(columns=['key', 'datetime'])

In [65]:
test_fin = test.join(tm_test)

In [66]:
pred = model_class.predict(test_fin.sort_index())

In [67]:
pred

array([ 9.66041539, 11.82482621,  4.53879116, ..., 53.8272736 ,
       21.73230757,  7.10858313])

In [68]:
sab = pd.read_csv(r'new-york-city-taxi-fare-prediction\sample_submission.csv')

In [69]:
sab['fare_amount'] = pred

In [70]:
sab.to_csv(r'answer_geo.csv', index=False)

In [None]:
#c = 0
#for ch in tqdm(pd.read_csv(r'new-york-city-taxi-fare-prediction\train.csv', chunksize=100000)):
#    if c == 10:
#        break
#    print(c)
#    df = ch.copy()
#    df = all_prepr(df)
#    df.to_csv(rf'dfs/chuck{c}.csv', index=False)
#    c += 1