In [49]:
import pandas as pd
import numpy as np
import time
from pathlib import Path
import holidays
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def create_Pipeline(num_columns, VAR_num_scaler, le_columns, oh_columns, VAR_regressor):
    le_transformer = FunctionTransformer(
        lambda x: np.array(pd.factorize(x.squeeze())[0]).reshape(-1, 1)
    )
    preprocessor = ColumnTransformer(transformers=[
        ('scaled_num', VAR_num_scaler, num_columns),        # Scaling for numerical columns
        ('le_encoded', le_transformer, le_columns),         # Label encoding for specific columns
        ('oh_encoded', OneHotEncoder(), oh_columns)         # One-hot encoding for categorical columns
    ])
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', VAR_regressor)
    ])
    return pipeline


def fine_tune(pipeline, data, target):
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [3, 5, 7],
        #'regressor__min_samples_split': [2, 5],
        #'regressor__min_samples_leaf': [1, 2],
        #'regressor__max_features': ['sqrt', 'log2', None],
        #'regressor__bootstrap': [True, False]
    }
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search_result = grid_search.fit(data, target, error_score='raise')
    return grid_search_result


def final_output(grid_search_result, test_data):
    y_pred = grid_search_result.predict(test_data)
    results = pd.DataFrame(
        dict(
            Id = np.arange(y_pred.shape[0]),
            log_bike_count=y_pred,
        )
    )
    results.to_csv(f'submission_{grid_search_result}.csv', index=False)


def _encode_dates(df, date_column):
    df['year'] = df[date_column].dt.year
    df['month'] = df[date_column].dt.month
    df['day'] = df[date_column].dt.day
    df['hour'] = df[date_column].dt.hour
    df['weekday'] = df[date_column].dt.weekday
    df['isWeekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)

    vacances = holidays.CountryHoliday('France', years=[i for i in range(2009, 2025)])
    vacances_dates = pd.to_datetime(list(vacances.keys())).date
    df['isHoliday'] = df[date_column].isin(vacances_dates).astype(int)

    return df


def clean(df, df_name):
    # missing_values_count = df.isnull().any(axis=1).sum()
    # print(f'({df_name}) : Number of lines with missing values: {missing_values_count}')
    df = df.dropna()
    return df


def compile_data(raw_bike_counter_data, raw_data_meteo):

    bike_counter_data = raw_bike_counter_data.copy()
    data_meteo = raw_data_meteo.copy()

    bike_counter_data = clean(bike_counter_data, 'bike_counter_data')
    bike_counter_data = _encode_dates(bike_counter_data, 'date')

    data_meteo = data_meteo.drop(columns = ['TEMPERATURE_NIGHT_C', 'SUNRISE', 'SUNSET'] )
    data_meteo['DATE'] = pd.to_datetime(data_meteo['DATE'])
    data_meteo = clean(data_meteo, 'data_meteo')
    data_meteo = _encode_dates(data_meteo, 'DATE')
    data_meteo = data_meteo.drop(columns = ['isWeekend', 'isHoliday', 'hour', 'weekday'] )

    merged_data = pd.merge(bike_counter_data, data_meteo, on=['year', 'month', 'day'])

    columns_to_drop = ['date', 'day', 'counter_name', 'site_name', 'DATE', 'counter_installation_date', 'coordinates', 'counter_technical_id', 'latitude', 'longitude']
    if 'bike_count' in merged_data.columns:
        merged_data = merged_data.drop(columns = 'bike_count')
    merged_data = merged_data.drop(columns = columns_to_drop)
    
    return merged_data


In [50]:
"""raw_bike_counter_data = pd.read_parquet(Path('data') / 'train.parquet')
raw_meteo_data = pd.read_csv('external_data/export-paris0.csv')
raw_bike_counter_test_data = pd.read_parquet(Path('data') / 'final_test.parquet')

train_data = compile_data(raw_bike_counter_data, raw_meteo_data)
#public_test_data = compile_data(raw_bike_counter_test_data, raw_meteo_data)
train_data.info()
features = train_data.drop(columns = ['log_bike_count'])
target = train_data['log_bike_count']

fraction = 0.001
features_sample = features.sample(frac=fraction, random_state=42)  # x % des données
target_sample = target.loc[features_sample.index]

le_columns = [
    'WEATHER_CODE_MORNING',
    'WEATHER_CODE_NOON',
    'WEATHER_CODE_EVENING',
    'OPINION',
    'counter_id',
    'site_id',
    'isHoliday',
    'isWeekend',
    ]
oh_columns = [
    'isHoliday',
    'isWeekend',
    ]
num_columns = [
    'MAX_TEMPERATURE_C',
    'MIN_TEMPERATURE_C',
    'WINDSPEED_MAX_KMH',
    'TEMPERATURE_MORNING_C',
    'MAX_TEMPERATURE_C',
    'TEMPERATURE_EVENING_C',
    'PRECIP_TOTAL_DAY_MM',
    'HUMIDITY_MAX_PERCENT',
    'VISIBILITY_AVG_KM',
    'PRESSURE_MAX_MB',
    'CLOUDCOVER_AVG_PERCENT',
    'HEATINDEX_MAX_C',
    'DEWPOINT_MAX_C',
    'WINDTEMP_MAX_C',
    'TOTAL_SNOW_MM',
    'UV_INDEX',
    'SUNHOUR',
    'month',
    'year'
    ]

num_scaler_choices = [StandardScaler(), MinMaxScaler()]
regressor_choices = [ExtraTreesRegressor()]

def fine_tune_xgb(pipeline, data, target):

    param_grid = {
    'n_estimators': [50, 100, 200],          # Number of boosting rounds
    'max_depth': [3, 5, 7],                 # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage
    'subsample': [0.6, 0.8, 1.0],           # Subsample ratio of training instances
    'colsample_bytree': [0.6, 0.8, 1.0],    # Subsample ratio of columns
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search_result = grid_search.fit(data, target, error_score='raise')
    return grid_search_result



xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
pipeline = create_Pipeline(num_columns, MinMaxScaler(), le_columns, oh_columns, xgb_model)
print(f"Tuning pipeline with MinMaxScaler and XGBRegressor...")
grid_search_result = fine_tune_xgb(pipeline, data, target)
print(f"Best score: {grid_search_result.best_score_}")
print(f"Best params: {grid_search_result.best_params_}")"""


'raw_bike_counter_data = pd.read_parquet(Path(\'data\') / \'train.parquet\')\nraw_meteo_data = pd.read_csv(\'external_data/export-paris0.csv\')\nraw_bike_counter_test_data = pd.read_parquet(Path(\'data\') / \'final_test.parquet\')\n\ntrain_data = compile_data(raw_bike_counter_data, raw_meteo_data)\n#public_test_data = compile_data(raw_bike_counter_test_data, raw_meteo_data)\ntrain_data.info()\nfeatures = train_data.drop(columns = [\'log_bike_count\'])\ntarget = train_data[\'log_bike_count\']\n\nfraction = 0.001\nfeatures_sample = features.sample(frac=fraction, random_state=42)  # x % des données\ntarget_sample = target.loc[features_sample.index]\n\nle_columns = [\n    \'WEATHER_CODE_MORNING\',\n    \'WEATHER_CODE_NOON\',\n    \'WEATHER_CODE_EVENING\',\n    \'OPINION\',\n    \'counter_id\',\n    \'site_id\',\n    \'isHoliday\',\n    \'isWeekend\',\n    ]\noh_columns = [\n    \'isHoliday\',\n    \'isWeekend\',\n    ]\nnum_columns = [\n    \'MAX_TEMPERATURE_C\',\n    \'MIN_TEMPERATURE_C\'

In [55]:

raw_bike_counter_data = pd.read_parquet(Path('data') / 'train.parquet')
raw_meteo_data = pd.read_csv('external_data/export-paris0.csv')
raw_bike_counter_test_data = pd.read_parquet(Path('data') / 'final_test.parquet')


train_data = compile_data(raw_bike_counter_data, raw_meteo_data)
public_test_data = compile_data(raw_bike_counter_test_data, raw_meteo_data)

encode_columns = [
    'WEATHER_CODE_MORNING',
    'WEATHER_CODE_NOON',
    'WEATHER_CODE_EVENING',
    'OPINION',
    'counter_id',
    'site_id',
    'isHoliday',
    'isWeekend',
    ]

train_data = pd.get_dummies(train_data, columns=encode_columns, drop_first=False)
public_test_data = pd.get_dummies(public_test_data, columns=encode_columns, drop_first=False)
public_test_data = public_test_data.reindex(columns=train_data.columns, fill_value=0)
public_test_data = public_test_data.drop(columns = ['log_bike_count'])


features = train_data.drop(columns = ['log_bike_count'])
target = train_data['log_bike_count']
fraction = 0.1
features_sample = features.sample(frac=fraction, random_state=42)  # x % des données
target_sample = target.loc[features_sample.index]


In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb



X_train, X_test, y_train, y_test = train_test_split(features_sample, target_sample, test_size=0.2, random_state=42)

# Define a function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

# Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_mse, rf_r2 = evaluate_model(rf_model, X_test, y_test)
print(f"Random Forest - MSE: {rf_mse:.4f}, R^2: {rf_r2:.4f}")

# XGBoost Regressor
xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_mse, xgb_r2 = evaluate_model(xgb_model, X_test, y_test)
print(f"XGBoost - MSE: {xgb_mse:.4f}, R^2: {xgb_r2:.4f}")

# LightGBM Regressor
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)
lgb_mse, lgb_r2 = evaluate_model(lgb_model, X_test, y_test)
print(f"LightGBM - MSE: {lgb_mse:.4f}, R^2: {lgb_r2:.4f}")



Random Forest - MSE: 0.2818, R^2: 0.8988
XGBoost - MSE: 0.2277, R^2: 0.9183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 39746, number of used features: 175
[LightGBM] [Info] Start training from score 3.086063
LightGBM - MSE: 0.2691, R^2: 0.9034


In [53]:
train_data.head()

print('isHoliday_0' in train_data.columns)


True


In [61]:
def final_output_2(model, test_data, name):
    print(test_data.info())
    y_pred = model.predict(test_data)
    results = pd.DataFrame(
        dict(
            Id = np.arange(y_pred.shape[0]),
            log_bike_count=y_pred,
        )
    )
    results.to_csv(f'submission_{name}.csv', index=False)



models = [
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
    ("LightGBM", lgb_model)
]

# Iterating over the models
for model_name, model in models:
    print(f"Model: {model_name}")
    final_output_2(model, public_test_data, model_name)

Model: Random Forest
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51440 entries, 0 to 51439
Columns: 175 entries, year to isWeekend_1
dtypes: float64(5), int64(55), uint8(115)
memory usage: 29.6 MB
None
Model: XGBoost
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51440 entries, 0 to 51439
Columns: 175 entries, year to isWeekend_1
dtypes: float64(5), int64(55), uint8(115)
memory usage: 29.6 MB
None
Model: LightGBM
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51440 entries, 0 to 51439
Columns: 175 entries, year to isWeekend_1
dtypes: float64(5), int64(55), uint8(115)
memory usage: 29.6 MB
None
