# RAMP on predicting cyclist traffic in Paris


## Introduction

The dataset was collected with cyclist counters installed by Paris city council in multiple locations. It contains hourly information about cyclist traffic, as well as the following features,
 - counter name
 - counter site name
 - date
 - counter installation date
 - latitude and longitude
 
Available features are quite scarce. However, **we can also use any external data that can help us to predict the target variable.** 

In [192]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import holidays
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# Useful functions

# Training set location tried first; also the default argument of
# get_train_data().
local_path = "data/train.parquet"
# Training set location tried when `local_path` does not exist.
kaggle_path = "/kaggle/input/mdsb-2023/train.parquet"
# Column that get_train_data() extracts as the regression target.
_target_column_name = "log_bike_count"
# Reference point compared against each counter's (latitude, longitude) pair
# in classify_distance().
paris_center = (48.8566, 2.3522)


def get_train_data(path=local_path):
    """Load the training parquet file and split it into features and target.

    Rows are ordered by ``date`` and then ``counter_name`` so that a
    time-based cross-validation split sees the data chronologically.

    Parameters
    ----------
    path : str or path-like
        Location of the training parquet file.

    Returns
    -------
    tuple
        ``(X_df, y_array)``: the frame without the target column and without
        ``bike_count``, and the target values as an array.
    """
    frame = pd.read_parquet(path).sort_values(["date", "counter_name"])
    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target


def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    return X.drop(columns=["date"])


def get_season(date):
    """Return a season code derived from the month of ``date``.

    Parameters
    ----------
    date : object with a ``month`` attribute
        The date to classify.

    Returns
    -------
    int
        1 for March-May, 2 for June-August, 3 for September-November and
        4 for every other month.
    """
    month = date.month
    if month in (3, 4, 5):
        return 1  # spring
    if month in (6, 7, 8):
        return 2  # summer
    if month in (9, 10, 11):
        return 3  # autumn
    return 4  # winter


def get_TimeOfDay(date):
    """Return a time-of-day code derived from the hour of ``date``.

    Parameters
    ----------
    date : object with an ``hour`` attribute
        The timestamp to classify.

    Returns
    -------
    int
        1 for hours 4-6, 2 for 7-10, 3 for 11-13, 4 for 14-17, 5 for 18-22
        and 6 for any other hour (23 and 0-3).
    """
    hour = date.hour
    if hour > 3:
        # (inclusive upper bound, code) of the consecutive hour bands
        for upper_bound, code in ((6, 1), (10, 2), (13, 3), (17, 4), (22, 5)):
            if hour <= upper_bound:
                return code
    return 6


def assign_temperature(row):
    """Pick the temperature reading matching the hour of ``row["date"]``.

    Parameters
    ----------
    row : mapping
        Must provide ``date`` and the ``TEMPERATURE_*_C`` entry for the
        slot the hour falls in.

    Returns
    -------
    object or None
        The morning value for hours 6-11, noon for 12-17, evening for 18-23,
        night for 0-5, and ``None`` when the hour matches no slot.
    """
    hour = row["date"].hour
    slots = (
        (6, 11, "TEMPERATURE_MORNING_C"),
        (12, 17, "TEMPERATURE_NOON_C"),
        (18, 23, "TEMPERATURE_EVENING_C"),
        (0, 5, "TEMPERATURE_NIGHT_C"),
    )
    for first_hour, last_hour, column in slots:
        if first_hour <= hour <= last_hour:
            return row[column]
    return None


def classify_distance(row):
    """Bucket a location by its geodesic distance to ``paris_center``.

    Parameters
    ----------
    row : mapping
        Must provide ``latitude`` and ``longitude``.

    Returns
    -------
    int
        1 when the distance is below 2 km, 2 when it is below 6 km,
        3 otherwise.
    """
    position = (row["latitude"], row["longitude"])
    km_from_center = geodesic(position, paris_center).km

    # Thresholds are tunable cut-offs, in kilometres.
    if km_from_center < 2:
        return 1  # center
    if km_from_center < 6:
        return 2  # intermediate
    return 3  # peripheral


def identity(x):
    """Return ``x`` unchanged.

    Named module-level pass-through, used as the ``func`` of the
    ``FunctionTransformer`` applied to the binary feature columns.
    """
    return x


def is_holiday(date, calendar=None):
    """Return 1 when ``date`` is a holiday and 0 otherwise.

    Parameters
    ----------
    date : date-like
        The day to look up.
    calendar : container, optional
        Any object supporting ``date in calendar``. When omitted, the
        module-level name ``holidays`` is used; it must have been bound to a
        holiday calendar (not the imported ``holidays`` module) before the
        call.

    Returns
    -------
    int
        1 for a holiday, 0 for any other day.
    """
    if calendar is None:
        # Looked up at call time, so the script can bind the calendar after
        # this function has been defined.
        calendar = holidays
    # 1 marks the holiday: the flag now matches the function's name.
    return 1 if date in calendar else 0


# Getting the train and test data

# Load the train/test sets and the external files from the local checkout
# when it exists, otherwise from the Kaggle input directories.
if Path(local_path).exists():
    X_train, y_train = get_train_data()
    X_test = pd.read_parquet("data/final_test.parquet")
    # NOTE: `data_ext` is loaded but not used by the code below.
    data_ext = pd.read_csv(Path("data") / "external_data.csv")
    weather_2021 = pd.read_csv("data/export-paris2021.csv", sep=",")
    weather_2020 = pd.read_csv("data/export-paris2020.csv", sep=",")

elif Path(kaggle_path).exists():
    X_train, y_train = get_train_data(path=kaggle_path)
    X_test = pd.read_parquet("/kaggle/input/mdsb-2023/final_test.parquet")
    data_ext = pd.read_csv("/kaggle/input/mdsb-2023/external_data.csv")
    weather_2021 = pd.read_csv("/kaggle/input/mto-2/export-paris2021.csv", sep=",")
    weather_2020 = pd.read_csv("/kaggle/input/mto-2/export-paris2020.csv", sep=",")

else:
    # Fail here with an explicit message: continuing would only crash later
    # with a NameError on the first use of X_test.
    raise FileNotFoundError(
        f"error in path: training data not found at {local_path!r} "
        f"or {kaggle_path!r}"
    )

# Columns removed from both sets before any feature engineering.
_dropped_input_columns = [
    "site_id",
    "counter_id",
    "coordinates",
    "counter_technical_id",
    "counter_installation_date",
]
X_test = X_test.drop(columns=_dropped_input_columns)
X_train = X_train.drop(columns=_dropped_input_columns)

# External data preprocessing

# Weather columns that are discarded right after loading.
_dropped_weather_columns = [
    "MAX_TEMPERATURE_C",
    "MIN_TEMPERATURE_C",
    "WINDSPEED_MAX_KMH",
    "HUMIDITY_MAX_PERCENT",
    "VISIBILITY_AVG_KM",
    "PRESSURE_MAX_MB",
    "CLOUDCOVER_AVG_PERCENT",
    "HEATINDEX_MAX_C",
    "DEWPOINT_MAX_C",
    "WINDTEMP_MAX_C",
    "WEATHER_CODE_MORNING",
    "WEATHER_CODE_NOON",
    "WEATHER_CODE_EVENING",
    "TOTAL_SNOW_MM",
    "UV_INDEX",
    "SUNHOUR",
    "OPINION",
    "SUNSET",
    "SUNRISE",
]

# Stack both yearly exports and rename DATE to `date_jour`, the key used
# later to join the weather onto the train and test sets.
weather = pd.concat([weather_2020, weather_2021], ignore_index=True).rename(
    columns={"DATE": "date_jour"}
)
weather["date_jour"] = pd.to_datetime(weather["date_jour"])
weather = weather.drop(columns=_dropped_weather_columns)

# Merging external data and engineering features (train and test sets)

# Rebinds the name of the imported `holidays` module to the French holiday
# calendar; is_holiday() reads this module-level name when it is called.
holidays = holidays.CountryHoliday("France")

# (start, end) bounds, both exclusive, of the periods flagged as confinement.
_CONFINEMENT_PERIODS = (
    ("2020-03-17", "2020-05-11"),
    ("2020-10-30", "2020-12-15"),
    ("2021-04-03", "2021-05-03"),
)

# Raw weather / location columns removed once the derived features exist.
_RAW_COLUMNS_TO_DROP = [
    "TEMPERATURE_NIGHT_C",
    "TEMPERATURE_MORNING_C",
    "TEMPERATURE_NOON_C",
    "TEMPERATURE_EVENING_C",
    "PRECIP_TOTAL_DAY_MM",
    "latitude",
    "longitude",
    "date_jour",
]


def _distance_categories(X):
    """Return the classify_distance() code of every row of ``X``.

    The geodesic distance is computed once per distinct
    (latitude, longitude) pair instead of once per row.
    """
    pairs = list(zip(X["latitude"], X["longitude"]))
    category_of = {
        pair: classify_distance({"latitude": pair[0], "longitude": pair[1]})
        for pair in set(pairs)
    }
    return [category_of[pair] for pair in pairs]


def _add_features(X):
    """Join the weather data onto ``X`` and derive the model features.

    The same function is applied to the train and the test set so both go
    through exactly the same transformations.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with at least ``date``, ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        A new frame, in the row order of ``X``, with the weather columns
        merged in, the derived features appended and the raw columns listed
        in ``_RAW_COLUMNS_TO_DROP`` removed.
    """
    X = X.copy()
    # Calendar day of each record, as a timestamp, used as the join key.
    X["date_jour"] = pd.to_datetime(X["date"].dt.date)
    # A left join keeps every record, in its original order.
    X = X.merge(right=weather, on="date_jour", how="left")

    X["is_weekend"] = (X["date"].dt.weekday >= 5).astype(int)
    X["is_holiday"] = X["date"].apply(is_holiday)
    X["season"] = X["date"].apply(get_season)
    X["timeOfDay"] = X["date"].apply(get_TimeOfDay)

    in_confinement = pd.Series(False, index=X.index)
    for start, end in _CONFINEMENT_PERIODS:
        in_confinement = in_confinement | ((X["date"] > start) & (X["date"] < end))
    X["is_confinement"] = in_confinement

    X["distance_category"] = _distance_categories(X)
    # A missing precipitation value compares False and therefore maps to 0.
    X["is_raining"] = (X["PRECIP_TOTAL_DAY_MM"] > 5).astype(int)

    return X.drop(columns=_RAW_COLUMNS_TO_DROP)


X_train = _add_features(X_train)
X_test = _add_features(X_test)
# Encoding the dataset

# Expands `date` into year / month / day / weekday / hour columns.
date_encoder = FunctionTransformer(_encode_dates)
# Names of the columns produced by the date expansion.
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name", "season", "timeOfDay"]

# Flag-like columns passed through unchanged.
binary_cols = [
    "is_weekend",
    "is_holiday",
    "is_raining",
    "is_confinement",
    "distance_category",
]
binary_encoder = FunctionTransformer(func=identity, validate=False)

# NOTE: not referenced by the preprocessor below.
numerical_encoder = StandardScaler()

# Dense one-hot encoding of the date parts. The keyword selecting dense
# output was renamed from `sparse` to `sparse_output` in scikit-learn, so
# try the new name first and fall back to the old one on older releases.
try:
    date_onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    date_onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Columns not listed in any transformer are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    [
        ("date", date_onehot_encoder, date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("bin", binary_encoder, binary_cols),
    ]
)

# Model selection

# `n_estimators` is the parameter that sets the number of boosted trees; the
# misspelled `n_estimations` is not a recognised parameter, so the intended
# 300 trees were not applied.
regressor = XGBRegressor(learning_rate=0.1, max_depth=11, n_estimators=300)

# Pipeline creation and fitting

# Chain date expansion, column encoding and the regressor, then fit.
pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

# Making the prediction

# Negative predictions are raised to 0 before writing the submission.
y_pred = np.clip(pipe.predict(X_test), 0, None)
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission.csv", index=False)

