# RAMP on predicting cyclist traffic in Paris


## Introduction

The dataset was collected with cyclist counters installed by the Paris city council in multiple locations. It contains hourly information about cyclist traffic, as well as the following features:
 - counter name
 - counter site name
 - date
 - counter installation date
 - latitude and longitude
 
Available features are quite scarce. However, **we can also use any external data that can help us to predict the target variable.** 

In [4]:
from sklearn.compose import ColumnTransformer

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from pathlib import Path

# Candidate locations of the training parquet file. The script checks them in
# this order and loads the first one that exists on disk.
local_path = "data/train.parquet"
kaggle_path = "/kaggle/input/mdsb-2023/train.parquet"


def get_train_data(path=local_path):
    """Load the training parquet file and split it into features and target.

    Parameters
    ----------
    path : str
        Location of the parquet file to read. Defaults to ``local_path``.

    Returns
    -------
    X_df : pandas.DataFrame
        The rows sorted by ``date`` then ``counter_name``, without the target
        column and without the ``bike_count`` column.
    y_array : numpy.ndarray
        Values of the target column, in the same row order as ``X_df``.
    """
    # Chronological ordering is required for time-based cross-validation
    # to produce correct results.
    frame = pd.read_parquet(path).sort_values(["date", "counter_name"])
    target = frame[_target_column_name].values
    features = frame.drop(columns=[_target_column_name, "bike_count"])
    return features, target


def _encode_dates(X):
    """Replace the ``date`` column with separate calendar components.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a datetime ``date`` column. It is not modified; the
        work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``year``, ``month``, ``day``, ``weekday`` and
        ``hour`` columns added and the original ``date`` column removed.
    """
    encoded = X.copy()
    date_parts = encoded["date"].dt
    # Each name is both the ``.dt`` attribute read and the column written.
    for part in ("year", "month", "day", "weekday", "hour"):
        encoded[part] = getattr(date_parts, part)
    return encoded.drop(columns=["date"])


# Name of the column that get_train_data() returns as the target array.
_target_column_name = "log_bike_count"

# Load from whichever location is present: the local path first, then Kaggle.
if Path(local_path).exists():
    X_train, y_train = get_train_data()
    # NOTE(review): this reads the training file again as the test set, so the
    # submission is computed on training rows -- confirm whether a separate
    # local test file was intended here.
    X_test = pd.read_parquet(local_path)

elif Path(kaggle_path).exists():
    X_train, y_train = get_train_data(path=kaggle_path)
    X_test = pd.read_parquet("/kaggle/input/mdsb-2023/final_test.parquet")

else:
    # Stop here with an explicit error: only printing a message would let the
    # script continue and fail later with a NameError on X_train.
    raise FileNotFoundError(
        f"Training data not found at {local_path!r} or {kaggle_path!r}"
    )


# Column groups fed to the preprocessor. The names of the date-derived columns
# are obtained by running the date encoder once on the training dates.
date_cols = list(_encode_dates(X_train[["date"]]).columns)
categorical_cols = ["counter_name", "site_name"]

# Step 1: expand the ``date`` column into its calendar components.
date_encoder = FunctionTransformer(_encode_dates)

# Step 2: one-hot encode both the date components and the categorical columns.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ],
)

# Step 3: ridge regression on the encoded features.
regressor = Ridge()

pipe = make_pipeline(date_encoder, preprocessor, regressor)
# Columns removed from both the training and the test frame before they are
# handed to the pipeline. Defined once so fit and predict cannot drift apart.
_unused_columns = [
    "site_id",
    "counter_id",
    "coordinates",
    "counter_technical_id",
    "counter_installation_date",
    "latitude",
    "longitude",
]

pipe.fit(X_train.drop(columns=_unused_columns), y_train)

y_pred = pipe.predict(X_test.drop(columns=_unused_columns))
# Negative predictions are replaced by 0 (presumably because the target is a
# log-count that cannot be negative -- confirm against the data definition).
y_pred[y_pred < 0] = 0

# One row per prediction; ``Id`` is the row position in the prediction array.
results = pd.DataFrame(
    {
        "Id": np.arange(y_pred.shape[0]),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission.csv", index=False)