# Task 1: Describe the Dataset

## Data Understanding
Goal: Understand the structure and basic properties of the dataset using Python (NumPy, pandas, Matplotlib/Seaborn).

1. Read the CSV files with `pandas.read_csv` and parse the datetime
column.

In [None]:
import pandas as pd

# Load the daily and the hourly table.
day = pd.read_csv("day.csv")
hour = pd.read_csv("hour.csv")

# Parse the calendar-date column of both tables so it can serve as the merge key.
for table in (day, hour):
    table["dteday"] = pd.to_datetime(table["dteday"])

# Full timestamp of each hourly row = calendar date + hour-of-day offset.
hour["datetime"] = hour["dteday"] + pd.to_timedelta(hour["hr"], unit="h")

# Attach the daily record to every hourly row; columns that exist in both
# tables are disambiguated with the "_hour" / "_day" suffixes.
df_pre = hour.merge(
    day,
    on="dteday",
    how="left",
    suffixes=("_hour", "_day"),
)

# Working column name -> source column in the merged table.
column_sources = {
    "datetime": "datetime",
    "target": "cnt_hour",
    "weather": "weathersit_hour",
    "temp": "temp_hour",
    "humidity": "hum_hour",
    "windspeed": "windspeed_hour",
    "season": "season_day",
    "day_of_week": "weekday_day",
    "is_holiday": "holiday_day",
    "is_workingday": "workingday_day",
}

df = pd.DataFrame(
    {name: df_pre[source] for name, source in column_sources.items()}
)

2. Report:
- Number of rows and columns.
- Time range covered by the data.
- Target variable and list of feature variables (names and data types).

In [None]:

def dataframe_report(df: pd.DataFrame, target_col: str, time_col: str) -> None:
    """Print a structural summary of ``df``.

    The report lists the number of rows and columns, the earliest and latest
    value of the time column, the target column with its dtype, and every
    remaining column (the features) with its dtype.

    Args:
        df: Frame to describe. It is only read, never modified.
        target_col: Name of the target column.
        time_col: Name of the column holding the timestamps. Values that
            cannot be parsed as datetimes are treated as missing.
    """
    print("=== DATAFRAME REPORT ===\n")

    # Rows & columns
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}\n")

    # Time range: parse into a local Series instead of assigning back to
    # df[time_col], so that producing a report never alters the caller's data.
    timestamps = pd.to_datetime(df[time_col], errors="coerce")
    start_date = timestamps.min()
    end_date = timestamps.max()
    print(f"Time range covered: {start_date} → {end_date}\n")

    # Target variable
    print("Target variable:")
    print(f"  - {target_col} ({df[target_col].dtype})\n")

    # Feature variables: everything that is neither the target nor the time column
    print("Feature variables:")
    for col in df.columns:
        if col in (target_col, time_col):
            continue
        print(f"  - {col}: {df[col].dtype}")

# Example call: report on the working frame, with "target" as the target
# column and "datetime" as the time column.
dataframe_report(
    df=df,
    target_col="target",
    time_col="datetime"
)

3. Create a variable description table (see above for reference).

In [None]:
def create_variable_description_table(
    df: pd.DataFrame,
    target_col: str,
    time_col: str
) -> pd.DataFrame:
    """Build a one-row-per-column overview of ``df``.

    Args:
        df: Frame whose columns are described.
        target_col: Column to label with the role ``"target"``.
        time_col: Column to label with the role ``"time"``.

    Returns:
        A DataFrame with the columns ``variable_name``, ``role`` (``"time"``,
        ``"target"`` or ``"feature"``) and ``data_type`` (the dtype name),
        in the column order of ``df``.
    """

    def _role_of(name) -> str:
        # The time column is checked first, so it wins if both names coincide.
        if name == time_col:
            return "time"
        if name == target_col:
            return "target"
        return "feature"

    records = [
        {
            "variable_name": name,
            "role": _role_of(name),
            "data_type": df[name].dtype.name,
        }
        for name in df.columns
    ]

    return pd.DataFrame(records)

# Example usage: describe every column of the working frame.
var_table = create_variable_description_table(
    df=df,
    target_col="target",
    time_col="datetime"
)

# Bare expression as the last statement of the cell so the notebook renders the table.
var_table

4. Check for:
- Missing values per column.
- Duplicated rows (if any).

In [None]:
# Duplicated rows: rows that exactly repeat an earlier row.
print(f'Number of duplicate rows: {df.duplicated().sum()}')

# Missing values: the task asks for the count per column, so print the
# per-column breakdown first and the overall total afterwards.
missing_per_column = df.isna().sum()
print('Missing values per column:')
print(missing_per_column)
print(f'Number of missing values: {missing_per_column.sum()}')

## Descriptive statistics:

For numeric variables: calculate mean, standard deviation, minimum, maximum, and quartiles.

In [None]:
# Summary statistics for the target and the three continuous feature columns.
df[["target","temp","humidity","windspeed"]].describe()

For categorical variables: show frequency tables or bar charts

In [None]:
categorical_cols = ["weather", "season", "is_holiday", "is_workingday"]

# One frequency table per categorical column, each under an upper-cased header.
for column_name in categorical_cols:
    frequencies = df[column_name].value_counts()
    header = f"\n=== {column_name.upper()} ==="
    print(header)
    print(frequencies)


## Visualisation

Plot the time series of total bike demand (count) over the full
period.

In [None]:
import matplotlib.pyplot as plt

# Line plot of the target (one value per hourly row) over the whole period.
plt.figure(figsize=(12, 5))
# The label gives plt.legend() an entry to show; with no labelled artist the
# legend call only emits a warning and draws nothing.
plt.plot(df["datetime"], df["target"], label="Bike demand (count)")
plt.title("total bike demand (count) over the full period")
plt.xlabel("Time")
plt.ylabel("Count")
plt.legend()
plt.show()

Plot distributions of key numeric variables

In [None]:
# Histograms of the three continuous features, stacked vertically (3 rows x 1 column).
numeric_cols = ["temp", "humidity", "windspeed"]
df[numeric_cols].hist(bins=40, layout=(3, 1), figsize=(10, 12))
plt.show()

Plot aggregated demand by season, day of week or hour of day

In [None]:

def _plot_mean_demand(mean_demand, color, xlabel, title, ticks, tick_labels=None):
    """Draw and show one bar chart of mean demand per group.

    ``mean_demand`` is a Series indexed by group; ``ticks`` are the x positions
    to label, optionally with ``tick_labels`` as their text.
    """
    plt.figure(figsize=(8, 4))
    plt.bar(mean_demand.index, mean_demand.values, color=color)
    plt.xlabel(xlabel)
    plt.ylabel("Average Bike Demand")
    plt.title(title)
    if tick_labels is None:
        plt.xticks(ticks)
    else:
        plt.xticks(ticks, tick_labels)
    plt.tight_layout()
    plt.show()


# Derived calendar columns go on a copy so df itself keeps its original columns.
df_cpy = df.copy()
df_cpy["weekday"] = df["datetime"].dt.dayofweek  # 0 = Monday, 6 = Sunday
df_cpy["hour"] = df["datetime"].dt.hour

# Mean demand per group.
season_demand = df.groupby("season")["target"].mean()
weekday_demand = df_cpy.groupby("weekday")["target"].mean()
hourly_demand = df_cpy.groupby("hour")["target"].mean()

# Per season
_plot_mean_demand(
    season_demand,
    color="skyblue",
    xlabel="Season",
    title="Average Bike Demand by Season",
    ticks=season_demand.index,
    tick_labels=["Spring", "Summer", "Fall", "Winter"],
)

# Per day of week
_plot_mean_demand(
    weekday_demand,
    color="orange",
    xlabel="Day of Week",
    title="Average Bike Demand by Day of Week",
    ticks=weekday_demand.index,
    tick_labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
)

# Per hour of day
_plot_mean_demand(
    hourly_demand,
    color="green",
    xlabel="Hour of Day",
    title="Average Bike Demand by Hour of Day",
    ticks=range(0, 24),
)

### Description
Demand shows clear seasonal and daily patterns.
Fall has the highest demand, while spring has the lowest demand.
Demand is very similar across all days of the week, with no strong differences.
Demand is lowest at night, rises to a first peak around 08:00, decreases slightly around midday, and reaches a second peak around 17:00.
After this second peak, demand drops sharply until late in the evening.

# Task 2: Predict Demand from Conditions

## Supervised Regression Setup
Goal: Build a supervised regression model that predicts bike demand
count from given conditions (features such as weather, temperature,
time of day).

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the feature columns and the target column
features = ["season", "is_holiday", "is_workingday", "weather", "temp", "humidity", "windspeed"]
target = "target"

X = df[features]
y = df[target]

# Train / validation / test split (70% / 10% / 20%): first hold out 20% of all
# rows for testing, then take 12.5% of the remaining 80% (= 10% of all rows)
# for validation. Both splits use random_state=42.
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.125, random_state=42)  # 0.125*0.8 ≈ 0.1

## Train Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones.
categorical_features = ["season", "is_holiday", "is_workingday", "weather"]
numeric_features = ["temp", "humidity", "windspeed"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features) # drop for avoiding dummy variable trap
    ]
)

# Fit the preprocessing on the training split only; the test and validation
# splits are transformed with the statistics/categories learned there.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)
X_val_proc = preprocessor.transform(X_val)

# Building model: two hidden Dense layers (64 units, ReLU) followed by a
# single-unit output layer without activation.

shape_input = X_train_proc.shape[1]
model = Sequential([
    layers.Input(shape=(shape_input,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(1)
])

# Optimize mean squared error with Adam; mean absolute error is tracked as an
# additional metric.
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="mse",
    metrics=["mae"]
)
EPOCHS = 200
BATCH_SIZE = 64
# Train for at most EPOCHS epochs; early stopping ends training once val_loss
# has not improved for 10 epochs and restores the best weights seen.
history = model.fit(
    X_train_proc,
    y_train,
    validation_data = (X_val_proc, y_val), # TODO: NORM!!! (author's open note) NOTE(review): X_val_proc already comes from the fitted preprocessor - confirm what still needs normalizing
    callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

# Flatten the model output to 1-D before comparing it with y_test.
y_test_pred = model.predict(X_test_proc).flatten()

# Test-set metrics: RMSE (square root of the MSE), MAE and R^2.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
print("Test RMSE:", rmse_test)
print("Test MAE:", mae_test)
print("Test R2:", r2_test)

# Predicted vs. actual scatter plot; the dashed red line from (min, min) to
# (max, max) of y_test marks where prediction equals the actual value.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    "r--"
)
plt.xlabel("Actual Count")
plt.ylabel("Predicted Count")
plt.title("Predicted vs Actual Bike Demand")
plt.tight_layout()
plt.show()

## User Prediction

In [None]:
def predict_bike_demand(
    season,
    is_holiday,
    is_workingday,
    weather,
    temp,
    humidity,
    windspeed
):
    """Return the model's demand prediction for one scenario as a float.

    The seven arguments are packed into a single-row DataFrame whose column
    names are the training feature names, passed through the module-level
    ``preprocessor`` and then through the module-level ``model``.

    NOTE(review): the values are handed to the preprocessor unchanged, so they
    presumably have to be on the same scale as the training columns - confirm.
    """
    # Single-row frame; the keys must equal the training feature names.
    scenario = {
        "season": [season],
        "is_holiday": [is_holiday],
        "is_workingday": [is_workingday],
        "weather": [weather],
        "temp": [temp],
        "humidity": [humidity],
        "windspeed": [windspeed],
    }
    features_row = pd.DataFrame(scenario)

    # Same transformation that was fitted on the training split.
    encoded_row = preprocessor.transform(features_row)

    # The model output has one row and one column for a single scenario.
    raw_output = model.predict(encoded_row, verbose=0)
    return float(raw_output[0][0])

In [None]:
# NOTE(review): the model is trained on the temp / hum / windspeed columns of
# hour.csv exactly as stored. In the UCI Bike Sharing data these columns are
# normalised (temp = (t - t_min) / (t_max - t_min) with t_min = -8, t_max = 39
# degrees C; hum = humidity / 100; windspeed = speed / 67). Passing physical
# values such as 18 degrees C or 55 % directly puts the inputs far outside the
# training range, so the readings are converted first. Confirm the constants
# against the Readme shipped with the CSV files.
TEMP_MIN_C = -8.0
TEMP_MAX_C = 39.0
WINDSPEED_MAX = 67.0


def _to_model_units(temp_c, humidity_pct, windspeed):
    """Convert physical readings to the scale assumed for the training columns."""
    return {
        "temp": (temp_c - TEMP_MIN_C) / (TEMP_MAX_C - TEMP_MIN_C),
        "humidity": humidity_pct / 100.0,
        "windspeed": windspeed / WINDSPEED_MAX,
    }


pred1 = predict_bike_demand(
    season=3,        # fall
    is_holiday=0,
    is_workingday=1,
    weather=1,       # clear
    **_to_model_units(temp_c=18, humidity_pct=55, windspeed=10)
)
print("Prediction 1:", pred1)

pred2 = predict_bike_demand(
    season=4,        # winter
    is_holiday=0,
    is_workingday=0,
    weather=3,       # rain/snow
    **_to_model_units(temp_c=2, humidity_pct=85, windspeed=20)
)
print("Prediction 2:", pred2)

pred3 = predict_bike_demand(
    season=2,        # summer
    is_holiday=0,
    is_workingday=1,
    weather=1,       # clear
    **_to_model_units(temp_c=26, humidity_pct=40, windspeed=7)
)
print("Prediction 3:", pred3)

In [None]:
# ===============================
# Feature Engineering
# ===============================
# Calendar features derived from the timestamp of every row, one column each.
timestamp_parts = df["datetime"].dt
X_feat = np.column_stack([
    np.arange(len(df)),                                   # time trend (row position)
    timestamp_parts.dayofweek.values,                     # day of week (0-6)
    timestamp_parts.month.values,                         # month (1-12)
    timestamp_parts.dayofyear.values,                     # day of year (1-based)
    (timestamp_parts.dayofweek >= 5).astype(int).values,  # weekend flag
])

In [None]:
# ===============================
# Sliding Windows
# ===============================

# Number of consecutive time steps that make up one input window.
WINDOW = 30
def make_windows(X, y, w):
    """Cut a sequence into overlapping windows with next-step targets.

    Window ``i`` consists of rows ``i .. i + w - 1`` of ``X``; its target is
    the element of ``y`` at position ``i + w``.

    Args:
        X: Array-like of shape ``(n_steps, ...)`` holding the per-step features.
        y: Array-like of per-step targets. It is indexed by position, so a
            pandas Series works regardless of its index labels.
        w: Window length in steps; must be at least 1.

    Returns:
        A tuple ``(windows, targets)`` of NumPy arrays with shapes
        ``(max(len(X) - w, 0), w, ...)`` and ``(max(len(y) - w, 0),)``.

    Raises:
        ValueError: If ``w`` is smaller than 1.
    """
    if w < 1:
        raise ValueError(f"window length must be >= 1, got {w}")

    # Convert once so that indexing below is positional even for pandas input.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    n_windows = max(len(X_arr) - w, 0)
    # Row i of `positions` is [i, i + 1, ..., i + w - 1]; indexing with it
    # gathers all windows at once and yields an empty (0, w, ...) array when
    # the input is shorter than one window plus its target.
    positions = np.arange(n_windows)[:, None] + np.arange(w)[None, :]
    windows = X_arr[positions]

    # The target of window i sits w steps after the window start; copy so the
    # result does not share memory with the caller's data.
    targets = y_arr[w:].copy()
    return windows, targets
# Build the windowed inputs and their targets from the engineered features and the target series.
X_window, y_window = make_windows(X=X_feat, y=y, w=WINDOW)

In [None]:
# ===============================
# Train / Validation / Test Split
# ===============================

n = len(X_window)
train_percent = 0.7
val_percent = 0.15

# Cut points for a split in the existing order (no shuffling): the first 70%
# of the windows for training, the next 15% for validation, the rest for testing.
train_end = int(train_percent * n)
val_end = int((train_percent + val_percent) * n)

X_train, X_val, X_test = (
    X_window[:train_end],
    X_window[train_end:val_end],
    X_window[val_end:],
)
y_train, y_val, y_test = (
    y_window[:train_end],
    y_window[train_end:val_end],
    y_window[val_end:],
)

In [None]:
# ===============================
# Normalization
# ===============================

# Normalize inputs using training data only
# Per-feature statistics over all windows and time steps of the training
# split; keepdims keeps them broadcastable against (windows, steps, features).
# The small constant guards against division by zero.
X_mean = X_train.mean(axis=(0, 1), keepdims=True)
X_std = X_train.std(axis=(0, 1), keepdims=True) + 1e-8

# Apply the training statistics to all three splits.
X_train, X_val, X_test = [
    (split - X_mean) / X_std for split in (X_train, X_val, X_test)
]

# Target statistics, likewise from the training split only.
y_mean = y_train.mean()
y_std = y_train.std() + 1e-8

y_train, y_val, y_test = [
    (split - y_mean) / y_std for split in (y_train, y_val, y_test)
]

In [None]:
from tensorflow import keras
# Import the callback from the same Keras distribution that builds the model
# (tensorflow.keras); a standalone `keras` package may be a different installation.
from tensorflow.keras.callbacks import EarlyStopping

# ===============================
# Model
# ===============================

# One LSTM layer with 64 units reading windows of shape (WINDOW, n_features),
# followed by a single-unit output layer.
model = keras.Sequential([
    keras.layers.Input((WINDOW, X_train.shape[2])),
    keras.layers.LSTM(64),
    keras.layers.Dense(1)
])

# Optimize mean squared error; mean absolute error is tracked as a metric.
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"]
)

# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2
)


In [None]:
# ===============================
# Evaluation & Visualization
# ===============================

# Predict in normalized space, then undo the target standardization for both
# the predictions and the true values so metrics are in original units.
y_pred_norm = model.predict(X_test)
y_pred, y_true = [values * y_std + y_mean for values in (y_pred_norm, y_test)]

rmse_test = np.sqrt(mean_squared_error(y_true, y_pred))
mae_test = mean_absolute_error(y_true, y_pred)
r2_test = r2_score(y_true, y_pred)
for metric_label, metric_value in (
    ("Test RMSE:", rmse_test),
    ("Test MAE:", mae_test),
    ("Test R2:", r2_test),
):
    print(metric_label, metric_value)

# Naive baseline: the forecast for step t is the true value at step t - 1,
# hence it is drawn shifted one position to the right.
y_naive = y_true[:-1]
naive_positions = range(1, len(y_true))

plt.figure(figsize=(15,5))
plt.plot(y_true, label="True", color="black", linestyle="-")
plt.plot(y_pred, label="LSTM Prediction", color="blue", linestyle="-", alpha=0.7)
plt.plot(naive_positions, y_naive, label="Naive Forecast", color="red", linestyle="--", alpha=0.4)
plt.title("Demand Forecasting: True vs Predicted vs Baseline")
plt.xlabel("Time")
plt.ylabel("Demand")
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

dates = df['datetime']

forecast_horizon = 30
window = WINDOW  # same window length as during training

# Timestamps of the forecast steps: one per hour after the last observation.
# A Timedelta is used as the frequency instead of the deprecated 'H' alias.
forecast_dates = pd.date_range(
    start=dates.iloc[-1] + pd.Timedelta(hours=1),
    periods=forecast_horizon,
    freq=pd.Timedelta(hours=1)
)

# Per-feature training statistics. X_mean / X_std were computed with
# keepdims=True over axes (0, 1), so [0, 0] selects the feature vector.
feat_mean = X_mean[0, 0]
feat_std = X_std[0, 0]

# Last known inputs (the final WINDOW time steps), kept in RAW feature units
# so the time-trend column can simply be incremented.
last_X = X_feat[-window:].copy()  # shape: (WINDOW, n_features)

multi_step_preds = []

for new_datetime in forecast_dates:
    # The model was trained on normalized inputs, so the raw window is
    # normalized with the training statistics right before prediction.
    # The LSTM expects shape (1, WINDOW, n_features).
    input_X = ((last_X - feat_mean) / feat_std).reshape(1, window, last_X.shape[1])

    # Prediction in normalized target space
    pred_norm = model.predict(input_X, verbose=0)

    # Rescale to the original target units
    pred = pred_norm[0, 0] * y_std + y_mean
    multi_step_preds.append(pred)

    # Raw features of the step that was just predicted, in the column order
    # of X_feat: time trend, day of week, month, day of year, weekend flag.
    new_features = np.array([
        last_X[-1, 0] + 1,
        new_datetime.dayofweek,
        new_datetime.month,
        new_datetime.dayofyear,
        int(new_datetime.dayofweek >= 5)
    ])

    # Slide the window forward by one step.
    last_X = np.vstack([last_X[1:], new_features])


# Full history plus forecast
plt.figure(figsize=(15,5))
plt.plot(df['datetime'], df['target'], label="True Demand", color="black")
plt.plot(forecast_dates, multi_step_preds, label=f"{forecast_horizon}-step Forecast", color="blue", linestyle="--")

plt.xlabel("Time")
plt.ylabel("Demand")
# The steps are hourly, so the horizon is in hours rather than days.
plt.title(f"Multi-step {forecast_horizon}-Hour Forecast")
plt.legend()
plt.show()



KeyError: 'demand'

<Figure size 1500x500 with 0 Axes>