In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import joblib

In [2]:
# Load the raw dataset; the file is decoded as Windows-1252 instead of the pandas default (UTF-8).
org_data = pd.read_csv('SeoulBikeData.csv', encoding='Windows-1252')
# NOTE: plain assignment, not a copy -- at this point `data` and `org_data` name the same DataFrame object.
data = org_data

In [3]:
def preprocess_data(data):
    """Return a new DataFrame with unused columns removed and the date expanded.

    'Functioning Day' and 'Dew point temperature(°C)' are removed when present.
    The 'Date' column (parsed as day/month/year) is replaced by three numeric
    columns appended at the end: 'Day', 'Month' and 'Day of Week'
    (pandas ``dayofweek``, where Monday is 0).

    Raises KeyError if ``data`` has no 'Date' column.
    """
    # errors='ignore' makes drop() skip whichever of these columns is absent
    frame = data.drop(
        columns=['Functioning Day', 'Dew point temperature(°C)'],
        errors='ignore',
    )

    # Parse once into a temporary column, then read the calendar parts off it
    frame['Datetime'] = pd.to_datetime(frame['Date'], format='%d/%m/%Y')
    for column, part in (('Day', 'day'), ('Month', 'month'), ('Day of Week', 'dayofweek')):
        frame[column] = getattr(frame['Datetime'].dt, part)

    # The raw date string and the temporary parsed column are not model features
    return frame.drop(columns=['Datetime', 'Date'])

In [4]:
# Always preprocess from the untouched raw frame so this cell can be re-run safely:
# preprocess_data removes 'Date', so feeding its own output back in raises KeyError: 'Date'.
data = preprocess_data(org_data)

In [5]:
# Column roles used by the rest of the notebook

# Prediction target (kept as a one-element list)
target_feature = ['Rented Bike Count']

# Continuous weather measurements
numerical_features = [
    'Temperature(°C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
]

# Season/holiday flags and calendar parts, handled as categories
categorical_features = [
    'Seasons',
    'Holiday',
    'Hour',
    'Day',
    'Month',
    'Day of Week',
]

In [6]:
# Target and predictors; y is a one-column DataFrame because target_feature is a list
y = data[target_feature]
X = data.drop(columns=target_feature)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# A single ColumnTransformer performs both the scaling and the encoding
numeric_step = ('num', StandardScaler(), numerical_features)  # standardise numeric columns
categorical_step = (
    'cat',
    # dense one-hot output, first level of each category dropped, unseen levels ignored
    OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
# Preview of the matrix the model will receive: fit the preprocessor on the training rows
X_train_transformed = preprocessor.fit_transform(X_train)

# Column labels: numeric columns keep their names, one-hot columns are named by the fitted encoder
num_columns = numerical_features
fitted_encoder = preprocessor.named_transformers_['cat']
cat_columns = fitted_encoder.get_feature_names_out(categorical_features)
all_columns = np.concatenate([num_columns, cat_columns])

# Wrap the transformed array in a labelled DataFrame for inspection
X_train_transformed_df = pd.DataFrame(data=X_train_transformed, columns=all_columns)

# Show the first rows
print("Transformed Data after Encoding and Scaling:")
X_train_transformed_df.head()


Transformed Data after Encoding and Scaling:


Unnamed: 0,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons_Spring,Seasons_Summer,Seasons_Winter,...,Month_9,Month_10,Month_11,Month_12,Day of Week_1,Day of Week_2,Day of Week_3,Day of Week_4,Day of Week_5,Day of Week_6
0,-0.388746,0.18996,-1.081683,-1.882041,0.738812,-0.127135,-0.172694,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.298504,-0.056083,0.647847,0.924155,0.304619,-0.127135,-0.172694,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.423795,-0.056083,-0.024748,0.911019,0.350323,-0.03994,-0.172694,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.162534,-1.433924,-1.081683,0.51201,1.858574,-0.127135,-0.172694,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.054637,-0.548169,-0.601258,0.924155,-0.655177,-0.127135,-0.172694,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Gradient-boosted tree regressor with hand-picked hyperparameters
model = xgb.XGBRegressor(
    colsample_bytree=0.8,  # each tree sees a random 80% of the columns
    learning_rate=0.2,
    max_depth=5,
    n_estimators=300,
    subsample=0.8,  # each boosting round sees a random 80% of the rows
    # Row/column subsampling is random: pin the seed explicitly (same value as the
    # train/test split) so the fit does not depend on the library's default seeding.
    random_state=0,
)

In [10]:
# Chain preprocessing and the regressor so both are fitted and applied as one estimator
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),  # scaling + one-hot encoding
        ('model', model),  # XGBoost regressor
    ]
)

In [11]:
# Train on log1p(target); this pipeline's raw predictions are therefore on the log1p scale
y_train_log = np.log1p(y_train)
pipeline.fit(X_train, y_train_log)

In [12]:
# Predict on the held-out rows, then undo the log1p target transform with expm1
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)

In [13]:
# Calculate prediction metrics on the test set, in the original (un-logged) target units
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'R-squared on test set: {r2}')
print(f'RMSE on test set: {rmse}')

R-squared on test set: 0.9040235280990601
RMSE on test set: 200.42034109572478


In [14]:
# Export the fitted pipeline (preprocessing + model) to disk.
# NOTE: the pipeline was fitted on log1p(target), so code that loads this file must apply
# np.expm1 to its predictions to get values in the original target units.
joblib.dump(pipeline, 'Model_pipeline_Mari.pkl')

['Model_pipeline_Mari.pkl']