In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import holidays
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# Useful variables

local_path = "data/train.parquet"
kaggle_path = "/kaggle/input/mdsb-2023/train.parquet"
_target_column_name = "log_bike_count"
paris_center = (48.8566, 2.3522)
holidays = holidays.CountryHoliday("France")

# Useful fonctions


def get_train_data(path=local_path):
    data = pd.read_parquet(path)
    # Sort by date first, so that time based cross-validation would produce correct results
    data = data.sort_values(["date", "counter_name"])
    y_array = data[_target_column_name].values
    X_df = data.drop([_target_column_name, "bike_count"], axis=1)
    return X_df, y_array


def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    return X.drop(columns=["date"])


def get_season(date):
    mois = date.month
    if 3 <= mois <= 5:
        return 1  # Printemps
    elif 6 <= mois <= 8:
        return 2  # Été
    elif 9 <= mois <= 11:
        return 3  # Automne
    else:
        return 4  # Hiver


def get_TimeOfDay(date):
    heure = date.hour
    if heure > 3 and heure <= 6:
        return 1
    if heure > 6 and heure <= 10:
        return 2
    elif heure > 10 and heure <= 13:
        return 3
    elif heure > 13 and heure <= 17:
        return 4
    elif heure > 17 and heure <= 22:
        return 5
    else:
        return 6


def assign_temperature(row):
    hour = row["date"].hour
    if 6 <= hour <= 11:
        return row["TEMPERATURE_MORNING_C"]
    elif 12 <= hour <= 17:
        return row["TEMPERATURE_NOON_C"]
    elif 18 <= hour <= 23:
        return row["TEMPERATURE_EVENING_C"]
    elif 0 <= hour <= 5:
        return row["TEMPERATURE_NIGHT_C"]
    else:
        return None


def classify_distance(row):
    distance = geodesic((row["latitude"], row["longitude"]), paris_center).km

    if distance < 2:  # Adjust this threshold based on your criteria
        return 1  # Center
    elif distance < 6:
        return 2  # Intermediate
    else:
        return 3  # Peripheral


def identity(x):
    return x


def is_holiday(date):
    return 0 if date in holidays else 1


# Getting the train and test data as well as external data

if Path(local_path).exists():
    X_train, y_train = get_train_data()
    X_test = pd.read_parquet("data/final_test.parquet")
    data_ext = pd.read_csv(Path("data") / "external_data.csv")
    weather_2021 = pd.read_csv("data/export-paris2021.csv", sep=",")
    weather_2020 = pd.read_csv("data/export-paris2020.csv", sep=",")

elif Path(kaggle_path).exists():
    X_train, y_train = get_train_data(path=kaggle_path)
    X_test = pd.read_parquet("/kaggle/input/mdsb-2023/final_test.parquet")
    data_ext = pd.read_csv("/kaggle/input/mdsb-2023/external_data.csv")
    weather_2021 = pd.read_csv("/kaggle/input/mto-2/export-paris2021.csv", sep=",")
    weather_2020 = pd.read_csv("/kaggle/input/mto-2/export-paris2020.csv", sep=",")

else:
    print("error in path")

# Droppping useless columns from original dataset

X_test = X_test.drop(
    columns=[
        "site_id",
        "counter_id",
        "coordinates",
        "counter_technical_id",
        "counter_installation_date",
    ]
)
X_train = X_train.drop(
    columns=[
        "site_id",
        "counter_id",
        "coordinates",
        "counter_technical_id",
        "counter_installation_date",
    ]
)

# External data preprocessing

data_ext["date"] = pd.to_datetime(data_ext["date"])
data_ext = data_ext.drop_duplicates()
# Rain feature
rain = data_ext[["date", "rr3"]]
rain["rr3"] = rain["rr3"] * (1 / 3)

# Merging external data with train set

X_train = pd.merge(X_train, rain, on="date", how="left")
X_train["rain"] = X_train["rr3"].interpolate(method="linear")
X_train["log_rain"] = np.log(1 + X_train["rain"])
X_train = X_train.drop(columns="rain")

# Feature engineering on train set

X_train["is_weekend"] = X_train["date"].apply(lambda x: 1 if x.weekday() >= 5 else 0)
X_train["is_holiday"] = X_train["date"].apply(is_holiday)
X_train["season"] = X_train["date"].apply(get_season)
X_train["timeOfDay"] = X_train["date"].apply(get_TimeOfDay)
# X_train['is_couvre_feu'] = X_train.apply(encode_couvre_feu, axis=1)
# X_train['temperature'] = X_train.apply(assign_temperature, axis=1)
X_train["distance_category"] = X_train.apply(classify_distance, axis=1)
X_train["is_confinement"] = (
    (X_train["date"] > "2020-03-17") & (X_train["date"] < "2020-05-11")
    | (X_train["date"] > "2020-10-30") & (X_train["date"] < "2020-12-15")
    | (X_train["date"] > "2021-04-03") & (X_train["date"] < "2021-05-03")
)
# X_train["is_raining"] = X_train["PRECIP_TOTAL_DAY_MM"].apply(lambda x: 1 if x > 5 else 0)
X_train = X_train.drop(columns=["longitude", "latitude", "rr3"])

# Merging external data with test set

X_test = pd.merge(X_test, rain, on="date", how="left")
X_test["rain"] = X_test["rr3"].interpolate(method="linear")
X_test["log_rain"] = np.log(1 + X_test["rain"])
X_test = X_test.drop(columns="rain")

# Feature engineering on test set

X_test["is_weekend"] = X_test["date"].apply(lambda x: 1 if x.weekday() >= 5 else 0)
X_test["is_holiday"] = X_test["date"].apply(is_holiday)
X_test["season"] = X_test["date"].apply(get_season)
X_test["timeOfDay"] = X_test["date"].apply(get_TimeOfDay)
# X_test['is_couvre_feu'] = X_test.apply(encode_couvre_feu, axis=1)
# X_test['temperature'] = X_test.apply(assign_temperature, axis=1)
X_test["distance_category"] = X_test.apply(classify_distance, axis=1)
X_test["is_confinement"] = (
    (X_test["date"] > "2020-03-17") & (X_test["date"] < "2020-05-11")
    | (X_test["date"] > "2020-10-30") & (X_test["date"] < "2020-12-15")
    | (X_test["date"] > "2021-04-03") & (X_test["date"] < "2021-05-03")
)
# X_test["is_raining"] = X_test["PRECIP_TOTAL_DAY_MM"].apply(lambda x: 1 if x > 5 else 0)
X_test = X_test.drop(columns=["longitude", "latitude", "rr3"])

# Encoding the dataset

date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
categorical_cols = ["counter_name", "site_name", "season", "timeOfDay"]

continuous_var = ["log_rain"]

binary_cols = [
    "is_weekend",
    "is_holiday",
    # "is_raining",
    "is_confinement",
    "distance_category",
]
binary_encoder = FunctionTransformer(func=identity, validate=False)

numerical_encoder = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore", sparse=False), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("bin", binary_encoder, binary_cols),
        ("con", numerical_encoder, continuous_var),
    ]
)

# Model selection

regressor = XGBRegressor(
    learning_rate=0.08289884001486468,
    max_depth=8,
    n_estimators=408,
    gamma=0.5295247412325502,
    min_child_weight=8,
    colsample_bytree=0.6857803669345204,
    subsample=0.7362787656257918,
)

# Pipeline creation and fitting

pipe = make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

# Making the prediction

y_pred = pipe.predict(X_test)
y_pred[y_pred < 0] = 0
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

results.to_csv("submission.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rain['rr3']= rain['rr3']*(1/3)


In [34]:
import optuna


def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid


X_train_opt, y_train_opt, X_test_opt, y_test_opt = train_test_split_temporal(
    X_train, y_train
)

date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train_opt[["date"]]).columns.tolist()

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore", sparse=False), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("bin", binary_encoder, binary_cols),
    ]
)


def objective(trial):
    learning_rate_X = trial.suggest_float("learning_rate_X", 0.1, 0.2, log=True)
    max_depth_X = trial.suggest_int("max_depth_X", 6, 15, log=True)
    n_estimators_X = trial.suggest_int("n_estimations_X", 200, 450, log=True)
    min_child_weight_X = trial.suggest_int("min_child_weight_X", 1, 10, log=True)
    gamma_x = trial.suggest_float("gamma_x", 0.1, 0.5, log=True)
    colsample_bytree_X = trial.suggest_float("colsample_bytree_X", 0.1, 1, log=True)

    pipe = make_pipeline(
        date_encoder,
        preprocessor,
        XGBRegressor(
            objective="reg:squarederror",
            learning_rate=learning_rate_X,
            max_depth=max_depth_X,
            n_estimators=n_estimators_X,
            min_child_weight=min_child_weight_X,
            gamma=gamma_x,
            colsample_bytree=colsample_bytree_X,
        ),
    )
    pipe.fit(X_train_opt, y_train_opt)

    return pipe.score(X_test_opt, y_test_opt)


# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2023-12-10 23:18:43,714] A new study created in memory with name: no-name-c2f7525e-6ff5-421e-bda7-855496799bf9
[I 2023-12-10 23:19:08,536] Trial 0 finished with value: 0.892458874887906 and parameters: {'learning_rate_X': 0.14670358552811935, 'max_depth_X': 9, 'n_estimations_X': 310, 'min_child_weight_X': 5, 'gamma_x': 0.3563090052359455, 'colsample_bytree_X': 0.49743079734210055}. Best is trial 0 with value: 0.892458874887906.
[I 2023-12-10 23:19:40,688] Trial 1 finished with value: 0.8823595481536289 and parameters: {'learning_rate_X': 0.13101678043284068, 'max_depth_X': 15, 'n_estimations_X': 254, 'min_child_weight_X': 8, 'gamma_x': 0.27602803459808567, 'colsample_bytree_X': 0.1502446606939294}. Best is trial 0 with value: 0.892458874887906.
[I 2023-12-10 23:20:10,339] Trial 2 finished with value: 0.8903030828958347 and parameters: {'learning_rate_X': 0.12885552904141526, 'max_depth_X': 7, 'n_estimations_X': 266, 'min_child_weight_X': 1, 'gamma_x': 0.11197489758012182, 'colsample

In [36]:
study.best_params

{'learning_rate_X': 0.11721009896365608,
 'max_depth_X': 13,
 'n_estimations_X': 271,
 'min_child_weight_X': 8,
 'gamma_x': 0.1580748718696943,
 'colsample_bytree_X': 0.755659167280853}