In [106]:
# Import basic packages
import pandas as pd
import numpy as np
from pathlib import Path

# Import skleanr packages
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import _name_estimators, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Import LightGBM
import lightgbm as lgb

import utils.get_data as get_data

In [107]:
def _encode_dates(X):
    
    '''
    Splits the 'date' columns of the input DataFrame into several columns (year, month, day, weekday, hour)
    
    Parameters:
        X (pd.DataFrame): the dataframe to modify
    
    Returns:
        X (pd.DataFrame): the modified dataframe
    '''
    
    # Duplicate X to work on it
    X = X.copy()
    
    # Create new columns with date parts from X.date
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    
    # Clean the new dataframe and return it
    X.drop(columns=["date"], inplace=True)
    return X

In [132]:
def merge_external_data(X): 
    
    '''
    Enriches the input dataframe with weather data (merges on 'date')
    
    Parameters:
        X (pd.DataFrame): the dataframe to enrich
    
    Returns:
        X (pd.DataFrame): the enriched dataframe
    '''

    # Duplicate X to work on it
    X = X.copy()
    X.loc[:, "date"] = pd.to_datetime(X["date"])
    
    # Create X_weather, the dataframe with the wanted weather data
    file_path = "data/external_data.csv"
    df_weather = pd.read_csv(file_path, parse_dates=["date"])
    X_weather = df_weather[['date', 't', 'rr3', 'u']]
    X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')
    
    # Reset index
    X["orig_index"] = np.arange(X.shape[0])
    
    # Merge X_weather to X, on the closest value of column X.date
    X_merged = pd.merge_asof(X.sort_values("date"), X_weather.sort_values("date"), on='date')
    
    # Clean the merged dataframe
    X_merged['t'] = X_merged['t'].fillna(0)
    X_merged['rr3'] = X_merged['rr3'].fillna(0)
    X_merged['u'] = X_merged['u'].fillna(0)
    X_merged = X_merged.sort_values("orig_index")
    X_merged.drop(columns=["orig_index", 'counter_id', 'site_name', 'counter_installation_date', 'counter_technical_id', 'latitude', 'longitude'], inplace=True)
    
    # Render the new dataframe X
    X = X_merged
    return X

In [133]:
def get_estimator():
    
    '''
    Creates a pipe which:
        (1) performs all the transformations done by the functions merge_external_data() and _encode_dates(),
        (2) encodes the categorical and numerical data, 
        (3) performs a LightGBM regression with tuned parameters.

    Parameters:
        None

    Returns:
        pipe (sklearn Pipeline objet): the given pipeline
    '''

    # Call the merge_external_data function
    merge_external = FunctionTransformer(merge_external_data, validate=False)

    # Call the _encode_dates function to split the date column to several columns
    date_encoder = FunctionTransformer(_encode_dates)
    
    # Encode the final columns
    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", 'site_id']
    numeric_encoder = StandardScaler()
    numeric_cols = ['t', 'u','rr3', 'year', 'month', 'day', 'weekday', 'hour']
    
    # Create a ColumnTransformer object to perform all encodings
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_encoder, categorical_cols),
            ("numeric", numeric_encoder, numeric_cols) 
        ]
    )
    
    params = {
        'colsample_bytree': 0.7, 
        'learning_rate': 0.01,
        'max_depth': 11,
        'min_child_samples': 198,
        'min_child_weight': 0.1,
        'n_estimators': 2000,
        'num_leaves': 99,
        'reg_alpha': 1, 
        'reg_lambda': 0.1,
        'subsample': 0.5
    }
    
    # Create the regressor object 
    regressor = lgb.LGBMRegressor(**params)

    # Create pipeline
    pipe = make_pipeline(
        merge_external,
        date_encoder, 
        preprocessor, 
        regressor
    )
    
    return pipe

In [134]:
def get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test):
    from sklearn.metrics import mean_squared_error
    # Fit the pipeline on the training data
    pipe.fit(X_train, y_train)

    # Predict on training and test data
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)

    # Calculate RMSE for training and test data
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    print(f"Train set RMSE: {rmse_train:.2f}")
    print(f"Test set RMSE: {rmse_test:.2f}")


# LOAD DATA

In [135]:
# Read data
X_train, y_train = get_data.get_train_data()
X_test, y_test = get_data.get_test_data()
X_final_test = get_data.get_final_test_data()

# TRAINING

In [136]:
X_train.dtypes

counter_id                         category
counter_name                       category
site_id                               int64
site_name                          category
date                         datetime64[us]
counter_installation_date    datetime64[us]
counter_technical_id               category
latitude                            float64
longitude                           float64
dtype: object

In [137]:
pipe = get_estimator()

In [138]:
# Predict data and get RMSE
get_RMSE_local_pipe(pipe, X_train, y_train, X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 645
[LightGBM] [Info] Number of data points in the train set: 455163, number of used features: 94
[LightGBM] [Info] Start training from score 3.048589


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')


Train set RMSE: 0.40
Test set RMSE: 0.42


# FINAL SUBMIT

In [101]:
concatenated_X_train_test = pd.concat([X_train, X_test], ignore_index=False)
concatenated_y_train_test = np.concatenate([y_train, y_test], axis=0)

pipe.fit(concatenated_X_train_test, concatenated_y_train_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002677 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 645
[LightGBM] [Info] Number of data points in the train set: 496771, number of used features: 94
[LightGBM] [Info] Start training from score 3.080113


## Create csv

In [102]:
def submission_kaggle(model, X_final_test):
    y_pred = model.predict(X_final_test)
    print(y_pred)
    results = pd.DataFrame(
        dict(
            Id=X_final_test.index,
            log_bike_count=y_pred,
        )
    )
    results.to_csv("submission.csv", index=False)

    return results

In [103]:
a = submission_kaggle(pipe, X_final_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_weather['date'] = pd.to_datetime(X_weather['date']).astype('datetime64[us]')


[1.34708362 1.29250497 1.09140572 ... 2.53740358 1.75007733 1.84171205]


In [104]:
a

Unnamed: 0,Id,log_bike_count
0,17081,1.347084
1,18655,1.292505
2,3124,1.091406
3,4147,1.334263
4,48210,2.102159
...,...,...
51435,42131,2.719680
51436,43042,1.862709
51437,43929,2.537404
51438,5707,1.750077
