## Kaggle Bike sharing demand forecasting competition
### Approach using AutoGluon modelling
https://www.kaggle.com/competitions/bike-sharing-demand

## 1. Load Competition data

In [None]:
import pandas as pd

In [None]:
# read the competition training set and print its column summary
TRAIN_CSV = "train.csv"
df_train = pd.read_csv(TRAIN_CSV)
df_train.info()

In [None]:
# preview the first 10 training rows
df_train.head(n=10)

In [None]:
# Parse the timestamp column to datetime, then report the period the training data covers
df_train["datetime"] = pd.to_datetime(df_train["datetime"])

timestamps = df_train["datetime"]
start, end = timestamps.min(), timestamps.max()

print("Start:", start)
print("End:", end)

In [None]:
# read the competition test set and print its column summary
TEST_CSV = "test.csv"
df_test = pd.read_csv(TEST_CSV)
df_test.info()

In [None]:
# 'casual' and 'registered' are missing from the test dataset, so remove them from training
train_only_cols = ["casual", "registered"]
df_train.drop(columns=train_only_cols, inplace=True)

## 2. Submission A: baseline model on raw data


In [None]:
from autogluon.tabular import TabularPredictor

In [None]:
# Baseline: fit AutoGluon on the raw training frame, with artifacts stored under agmodelA
predictor = TabularPredictor(
    label="count",
    path="AutogluonModels/agmodelA",
)
predictor = predictor.fit(
    train_data=df_train,
    time_limit=60,
    presets="best_quality",
)

### Review AutoGluon's training run, including the ranking of the best-performing models.

In [None]:
# show AutoGluon's summary of the most recent fit held in `predictor`
predictor.fit_summary()

In [None]:
# score the test dataset with the fitted predictor and preview the first predictions
predictions = predictor.predict(df_test)
predictions.head(n=5)

In [None]:
#### NOTE: Kaggle will reject the submission if it contains negative numbers - every prediction must be >= 0

# flag the negative predictions and report how many there are
negative_mask = predictions < 0
print(f"number of negative predictions: {len(predictions[negative_mask])}")

# overwrite the flagged predictions with zero
predictions[negative_mask] = 0

### Set predictions to submission dataframe, save, and submit

In [None]:
import os

# assemble the submission frame: the test timestamps plus the predicted counts
submission = pd.DataFrame({"datetime": df_test["datetime"]})
submission["count"] = predictions

# to_csv does not create missing parent directories, so make sure the folder exists first
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/submission_a.csv", index=False)

## 3. Run EDA

### Process
- a Transformation
- b Missing Data
- c Anomalous data
- d Correlation analysis
- e Feature Engineering

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Transformation: parse datetime into day and hour
# the same derivation is applied to both frames: calendar date ("day") and hour of day ("hour")
for frame in (df_train, df_test):
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    frame["day"] = frame["datetime"].dt.date
    frame["hour"] = frame["datetime"].dt.hour

# keep the test timestamps for the submission file before the column is dropped
submission = pd.DataFrame({'datetime': df_test["datetime"]})

# remove the raw datetime column from both frames now that day/hour have been derived
for frame in (df_train, df_test):
    frame.drop(columns=["datetime"], inplace=True)


In [None]:
## Transformation: set categorical variables to pandas type category
# column groups reused by the EDA cells below
num_cols = ["temp", "atemp", "humidity", "windspeed"]
cat_cols = ["season", "holiday", "workingday", "weather"]
# NOTE(review): the name looks like a typo of "date_cols"; kept as-is so existing references still resolve
datete_cols = ["day", "hour"]
target_col = ["count"]

# cast the categorical columns in both frames
for frame in (df_train, df_test):
    frame[cat_cols] = frame[cat_cols].astype("category")

In [None]:
## Missing data - no missing data found
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df_train.info()
df_test.info()


In [None]:
## explore continuous variables
# summary statistics for train and test, separated by a divider line
divider = "\n------------------------------\n"
train_stats = df_train.describe()
test_stats = df_test.describe()
print(train_stats)
print(divider)
print(test_stats)

In [None]:
# outliers: no apparent outliers seen
# one 30-bin histogram per numeric feature and for the target (training set)
for feature in num_cols + target_col:
    values = df_train[feature].dropna()
    fig, ax = plt.subplots()
    ax.hist(values, bins=30)
    ax.set_title(f'{feature} (n={len(values)})')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# outliers: no apparent outliers seen
# one 30-bin histogram per numeric feature (test set)
for feature in num_cols:
    values = df_test[feature].dropna()
    fig, ax = plt.subplots()
    ax.hist(values, bins=30)
    ax.set_title(f'{feature} (n={len(values)})')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# outliers: no apparent outliers seen
# bar chart of level frequencies for each categorical feature (training set)
for feature in cat_cols:
    levels = df_train[feature].astype('category')
    counts = levels.value_counts(dropna=False).sort_index()
    fig, ax = plt.subplots()
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{feature} (counts)')
    ax.set_ylabel('Count')
    ax.set_xlabel(feature)
    plt.show()

In [None]:
# outliers: no apparent outliers seen
# bar chart of level frequencies for each categorical feature (test set)
for feature in cat_cols:
    levels = df_test[feature].astype('category')
    counts = levels.value_counts(dropna=False).sort_index()
    fig, ax = plt.subplots()
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{feature} (counts)')
    ax.set_ylabel('Count')
    ax.set_xlabel(feature)
    plt.show()

In [None]:
# outliers: no apparent outliers seen
# box plots: training features plus target first, then the test features
plot_plan = (
    (df_train, num_cols + target_col),
    (df_test, num_cols),
)
for frame, features in plot_plan:
    for feature in features:
        fig, ax = plt.subplots()
        ax.boxplot(frame[feature].dropna())
        ax.set_title(feature)
        ax.set_ylabel(feature)
        fig.tight_layout()
        plt.show()

In [None]:
# correlation analysis
# pairwise scatter plots; categorical columns are swapped for their integer codes so they can be drawn
df_codes = df_train.copy()
for name in cat_cols:
    as_category = df_codes[name].astype('category')
    df_codes[name] = as_category.cat.codes
numeric_view = df_codes.select_dtypes('number')
sns.pairplot(numeric_view, height=3)

In [None]:
# correlation analysis
# annotated heatmap of the correlation matrix over every column except "day"
without_day = df_codes.drop(columns="day")
corr = without_day.corr()
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, annot=True, cmap="vlag", ax=heat_ax)
plt.show()

In [None]:
# Build polynomial features: 1/x, x, x^2, x^3
# Each numeric column is expanded into these four transforms so their
# correlation with the target can be compared side by side.
pieces = []
for c in num_cols:
    s = df_train[c]
    # Mask zeros as NaN before inverting so 1/x never yields +/-inf.
    # `s.replace(0)` without a replacement value does not insert NaN
    # (older pandas pads from the previous row; newer pandas deprecates/rejects the call).
    nonzero = s.where(s != 0)
    df_c = pd.DataFrame({
        f'{c}_inv': 1.0 / nonzero,
        f'{c}':     s,
        f'{c}_2':   s**2,
        f'{c}_3':   s**3,
    }, index=df_train.index)
    pieces.append(df_c)

poly_df = pd.concat(pieces, axis=1)
poly_df["count"] = df_train["count"]

# correlation analysis with expanded features
corr_poly = poly_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_poly, vmin=-1, annot=True, vmax=1, center=0, cmap="vlag")
plt.show()

In [None]:
# FE: temp is cross-correlated with atemp, so drop temp and keep only atemp
redundant = "temp"
for frame in (df_train, df_test):
    frame.drop(columns=[redundant], inplace=True)
# keep the numeric column list in step with the frames
num_cols.remove(redundant)

## 4. Refit AutoGluon model

In [None]:
# Refit AutoGluon on the transformed training frame, with artifacts stored under agmodelB
predictor = TabularPredictor(
    label="count",
    path="AutogluonModels/agmodelB",
)
predictor = predictor.fit(
    train_data=df_train,
    time_limit=60,
    presets="best_quality",
)

In [None]:
# show AutoGluon's summary of the most recent fit held in `predictor`
predictor.fit_summary()

In [None]:
# score the test dataset with the refitted predictor and preview the first predictions
predictions = predictor.predict(df_test)
predictions.head(n=5)

In [None]:
#### NOTE: Kaggle will reject the submission if it contains negative numbers - every prediction must be >= 0

# flag the negative predictions and report how many there are
negative_mask = predictions < 0
print(f"number of negative predictions: {len(predictions[negative_mask])}")

# overwrite the flagged predictions with zero
predictions[negative_mask] = 0

In [None]:
import os

# save submission to csv
submission["count"] = predictions

# to_csv does not create missing parent directories, so make sure the folder exists first
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/submission_b.csv", index=False)

## 5. Refit AutoGluon model with hyperparameter tuning

In [None]:
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

In [None]:
import os

# Fit one tuned predictor per stacking depth and write a Kaggle submission for each.
nsl = [3, 2, 1, 0]

# to_csv does not create missing parent directories, so make sure the folder exists first
os.makedirs("submissions", exist_ok=True)

for n in nsl:
    # Give every stacking depth its own model directory; with a shared path
    # each fit would write into the artifacts left by the previous run.
    predictor = TabularPredictor(
        label="count",
        path=f"AutogluonModels/agmodelC_nsl{n}",
    ).fit(
        train_data=df_train,
        num_stack_levels=n,
        time_limit=60,
        presets="best_quality",
        # NOTE(review): the "default" config appears to hold fixed values rather than
        # search spaces, so the tuner may have nothing to search over - confirm.
        hyperparameters=get_hyperparameter_config("default"),
        hyperparameter_tune_kwargs={"num_trials": 30, "searcher": "random"},
    )

    # make predictions on test dataset
    predictions = predictor.predict(df_test)

    # Kaggle rejects negative counts, so replace them with zeros
    predictions[predictions < 0] = 0

    # save submission to csv, one file per stacking depth
    submission["count"] = predictions
    submission.to_csv(f"submissions/submission_tl_{n}.csv", index=False)


In [None]:
# `predictor` holds the last fit of the loop above (num_stack_levels=0), so only that run is summarised
predictor.fit_summary()

## 6. Write a Report
### Creating plots and table for report

In [None]:
# Line plot of the top model score from each training run (values are hard-coded), saved as a PNG for the report
train_scores = pd.DataFrame(
    {
        "model": ["initial", "add_features", "hpo"],
        "score": [-133.287222, -35.713615, -33.899264],
    }
)
score_axes = train_scores.plot(x="model", y="score", figsize=(8, 6))
fig = score_axes.get_figure()
fig.savefig('model_train_score.png')

In [None]:
# Take the 3 kaggle scores and creating a line plot to show improvement
fig = pd.DataFrame(
    {
        "test_eval": ["initial", "add_features", "hpo"],
        "score": [1.41644, 0.59952, 0.50653]
    }
).plot(x="test_eval", y="score", figsize=(8, 6)).get_figure()
fig.savefig('model_test_score.png')