## Setting Up

In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
import joblib

## Collecting the (training-validation) file names

In [98]:
# Base path where the CSV files are stored in the Kaggle environment
base_path = '../input/train-splits/'

# List all CSV files in the folder dynamically. os.listdir() returns entries in
# arbitrary order, so sort them to make the processing order reproducible.
file_names = sorted(f for f in os.listdir(base_path) if f.endswith('.csv'))

## Needed functions for the pipeline

In [99]:
# Helper function to preprocess the dataset
def preprocess_data(df):
    """Turn a raw ``timestamp``/``value`` frame into a scaled feature table.

    The caller's frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Parse ``timestamp`` and use it as the index.
      2. Forward-fill, back-fill, then drop rows that still contain NaN.
      3. Add ``lag_1``..``lag_3`` from ``value`` (computed *before* the
         anomaly replacement in step 4).
      4. If an ``anomaly`` column exists, blank out ``value`` where it is
         flagged, forward-fill the gaps, and drop the column.
      5. Add first/second differences, 5-row rolling mean/std/min/max and a
         span-5 exponential moving average of ``value``.
      6. Add a z-score of ``value`` over the whole frame and an
         ``is_anomaly`` flag for ``|z| > 3``.
      7. Add hour / weekday / day-of-month / month plus sin/cos encodings of
         hour and weekday.
      8. Standardize ``value`` and every engineered column.

    Args:
        df: DataFrame with at least ``timestamp`` and ``value`` columns and,
            optionally, an ``anomaly`` column.

    Returns:
        A new DataFrame indexed by the parsed timestamp, containing the
        standardized ``value`` and feature columns.

    Raises:
        KeyError: If ``timestamp`` or ``value`` is missing.
        ValueError: If no row survives preprocessing.
    """
    # Ensure 'timestamp' and 'value' columns exist
    for required in ('timestamp', 'value'):
        if required not in df.columns:
            raise KeyError(f"'{required}' column is missing from the input data")

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Convert 'timestamp' to datetime and set as index
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')

    # Forward fill, then backfill leading gaps, then drop whatever is still NaN
    df = df.ffill().bfill().dropna()

    # Feature Engineering:
    # 1. Add lag features (backfilled so the first rows are not NaN)
    lag_columns = ['lag_1', 'lag_2', 'lag_3']
    for step, column in enumerate(lag_columns, start=1):
        df[column] = df['value'].shift(step)
    df[lag_columns] = df[lag_columns].bfill()

    if 'anomaly' in df.columns:
        # Replace values flagged as anomalous with the last unflagged value.
        # Series.mask upcasts to float when needed, unlike assigning pd.NA
        # through .loc.
        flagged = df['anomaly'].eq(True)
        df['value'] = df['value'].mask(flagged).ffill()

        # Drop the 'anomaly' column after making the changes
        df = df.drop(columns='anomaly')

    # 2. Add Rate of Change (first and second differences)
    diff_columns = ['rate_of_change', 'rate_of_change_2']
    df['rate_of_change'] = df['value'].diff()
    df['rate_of_change_2'] = df['rate_of_change'].diff()
    df[diff_columns] = df[diff_columns].bfill()

    # 3. Add Rolling Window Statistics (Moving Average, Std Dev, Min, Max)
    rolling_columns = ['rolling_mean_5', 'rolling_std_5',
                       'rolling_min_5', 'rolling_max_5']
    window = df['value'].rolling(window=5)
    df['rolling_mean_5'] = window.mean()
    df['rolling_std_5'] = window.std()
    df['rolling_min_5'] = window.min()
    df['rolling_max_5'] = window.max()
    df[rolling_columns] = df[rolling_columns].bfill()

    # 4. Add Exponential Moving Average (EMA)
    df['ema_5'] = df['value'].ewm(span=5, adjust=False).mean().bfill()

    # Forward fill anything still missing
    df = df.ffill()

    # 5. Add Outlier Detection (Z-score)
    value_mean = df['value'].mean()
    value_std = df['value'].std()
    if value_std > 0:
        df['z_score'] = (df['value'] - value_mean) / value_std
    else:
        # Constant (or single-row) series: the std is 0 or NaN, and dividing by
        # it would turn every z-score into NaN and drop every row below.
        df['z_score'] = 0.0
    df['is_anomaly'] = (df['z_score'].abs() > 3).astype(int)

    # 6. Add Time-Based Features (Hour, Day, Month)
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['day_of_month'] = df.index.day
    df['month'] = df.index.month

    # 7. Add Cyclic Feature Encoding (Sin/Cos Transforms for Time)
    hour_angle = 2 * np.pi * df['hour'] / 24
    weekday_angle = 2 * np.pi * df['day_of_week'] / 7
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)
    df['day_of_week_sin'] = np.sin(weekday_angle)
    df['day_of_week_cos'] = np.cos(weekday_angle)

    # If NaNs persist, drop them
    df = df.dropna()
    if df.empty:
        # The 5-row rolling features stay NaN for inputs shorter than the
        # window, so very short inputs end up here.
        raise ValueError(
            "No rows left after preprocessing; the input is too short or "
            "contains only missing values"
        )

    # Scaling the features
    feature_columns = ['value', 'lag_1', 'lag_2', 'lag_3',
                       'rate_of_change', 'rate_of_change_2',
                       'rolling_mean_5', 'rolling_std_5', 'rolling_min_5', 'rolling_max_5',
                       'ema_5', 'z_score', 'is_anomaly',
                       'hour', 'day_of_week', 'day_of_month', 'month',
                       'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos']

    # NOTE(review): the scaler is fitted on the whole frame (including `value`)
    # and then discarded, so the scaling cannot be inverted or re-applied to
    # new data, and any later train/test split shares these statistics.
    # Confirm this is intended.
    df[feature_columns] = StandardScaler().fit_transform(df[feature_columns])

    return df

### Preprocessing Summary:

1. **Validating Columns:**  
   Ensures the dataset has essential columns (`timestamp` for time ordering and `value` for the data to be analyzed).

2. **Datetime Conversion and Indexing:**  
   Converts `timestamp` to a datetime format and sets it as the index for efficient time series operations.

3. **Handling Missing Data:**  
   Uses forward and backward filling to ensure no gaps remain in the data, preserving continuity in the time series.

4. **Lag Features:**  
   Adds previous time step values (lag features) to capture temporal dependencies, providing the model with context about past behavior.

5. **Anomaly Handling (Optional):**  
   Replaces anomaly-affected values by forward filling to maintain data integrity and removes the `anomaly` column.

6. **Rate of Change Features:**  
   Calculates the rate of change (first and second derivatives) to identify trends, shifts, and acceleration in the data.

7. **Rolling Statistics:**  
   Adds rolling mean, standard deviation, min, and max to smooth the data and capture local patterns over time.

8. **Exponential Moving Average (EMA):**  
   Adds EMA to highlight recent trends by giving more weight to recent observations.

9. **Outlier Detection (Z-Score):**  
   Detects and flags extreme values (outliers) that could distort analysis by using a Z-score threshold.

10. **Time-Based Features:**  
   Adds hour, day, and month features to capture seasonal and cyclical patterns in the data.

11. **Cyclic Feature Encoding:**  
   Uses sin/cos transformations for cyclical features (like hours and days) to help models recognize periodic patterns.

12. **Feature Scaling:**  
   Standardizes features to ensure equal contribution during model training, preventing large-scale features from dominating.

In [100]:
# Function to calculate baseline MSE (predicting the mean value)
def calculate_baseline_mse(y_train, y_test):
    """Return the MSE of a naive forecast that always predicts the training mean.

    Args:
        y_train: Training targets; only their mean is used.
        y_test: Held-out targets the constant forecast is scored against.

    Returns:
        Mean squared error between ``y_test`` and the constant prediction.
    """
    horizon = len(y_test)
    train_mean = y_train.mean()
    # One copy of the training mean per test observation.
    naive_forecast = [train_mean for _ in range(horizon)]
    return mean_squared_error(y_test, naive_forecast)

### Comments
- Model MSE measures how well the trained model predicts the target.
- Baseline MSE measures how a naive model (always predicting the training mean) performs.
- Comparing the two shows whether the model's complexity is justified and whether it actually improves prediction accuracy.

In [101]:
# Function to save a trained model
def save_model(model, file_name):
    """Persist ``model`` with joblib as ``saved_models/<file_name>.pkl``.

    Args:
        model: The fitted estimator to serialize.
        file_name: Base name (without extension) of the output file.
    """
    destination = f"saved_models/{file_name}.pkl"

    # Make sure the target directory exists before writing into it.
    parent_dir = os.path.dirname(destination)
    os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, destination)
    print(f"Model saved to {destination}")

In [102]:
# Function to train and validate a model on a dataset and compare with baseline
def train_and_validate(df, file_name):
    """Cross-validate a random forest on one dataset, then save a final model.

    The frame is run through ``preprocess_data``, evaluated with a 5-split
    ``TimeSeriesSplit`` against a predict-the-training-mean baseline, and a
    final model refitted on all rows is written via ``save_model``.

    Args:
        df: Raw dataset, as expected by ``preprocess_data``.
        file_name: Name used for the saved model file.

    Returns:
        Tuple ``(avg_mse, avg_baseline_mse)``: the fold-averaged MSE of the
        model and of the baseline.
    """
    # Preprocess the data
    df = preprocess_data(df)

    # Split into features (X) and target (y)
    # NOTE(review): several of these features (e.g. `z_score`, `ema_5`, the
    # rolling statistics) are computed by preprocess_data from the same row's
    # `value`, which is also the target here. The reported MSE may therefore
    # be optimistic - confirm whether this is intended.
    feature_columns = ['lag_1', 'lag_2', 'lag_3', 'rate_of_change', 'rate_of_change_2',
                       'rolling_mean_5', 'rolling_std_5', 'rolling_min_5', 'rolling_max_5',
                       'ema_5', 'z_score', 'is_anomaly',
                       'hour', 'day_of_week', 'day_of_month', 'month',
                       'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos']
    X = df[feature_columns]
    y = df['value']

    # Time-based cross-validation: every fold trains on the past and tests on
    # the segment that follows it.
    tscv = TimeSeriesSplit(n_splits=5)

    # Initialize model
    model = RandomForestRegressor(n_estimators=100, random_state=42)

    mse_scores = []
    baseline_mse_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train on this fold and score the held-out segment
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_scores.append(mean_squared_error(y_test, y_pred))

        # Score the naive baseline on the same segment
        baseline_mse_scores.append(calculate_baseline_mse(y_train, y_test))

    # Average the per-fold scores
    avg_mse = sum(mse_scores) / len(mse_scores)
    avg_baseline_mse = sum(baseline_mse_scores) / len(baseline_mse_scores)

    # Refit on every row before saving. Without this, the saved model is the
    # one left over from the last fold, which never saw the final test segment.
    model.fit(X, y)
    save_model(model, file_name)

    return avg_mse, avg_baseline_mse

In [103]:
# Main function to apply pipeline to all datasets
def run_pipeline():
    """Train and evaluate one model per CSV listed in ``file_names``.

    Each file is read from ``base_path``, passed to ``train_and_validate``,
    and its scores are printed as they become available.

    Returns:
        Dict mapping each CSV file name to
        ``{'Model MSE': ..., 'Baseline MSE': ...}``.
    """
    results = {}

    # Loop over each file in the folder
    for file_name in file_names:
        # Load the dataset
        df = pd.read_csv(os.path.join(base_path, file_name))

        # Strip only the trailing extension; str.replace would also remove any
        # '.csv' that appears earlier in the name.
        dataset_name, _ = os.path.splitext(file_name)

        # Train and validate on the dataset, and get MSE values
        model_mse, baseline_mse = train_and_validate(df, dataset_name)

        # Store results for both model MSE and baseline MSE
        results[file_name] = {'Model MSE': model_mse, 'Baseline MSE': baseline_mse}
        print(f"Processed {file_name}, Model MSE: {model_mse}, Baseline MSE: {baseline_mse}")

    return results

## Running the pipeline to get and evaluate the models

In [104]:
# Run the pipeline over every CSV listed in `file_names`; `pipeline_results`
# maps each file name to its 'Model MSE' and 'Baseline MSE'.
pipeline_results = run_pipeline()

Model saved to saved_models/train_506.pkl
Processed train_506.csv, Model MSE: 0.006408909825334078, Baseline MSE: 1.0051548484945483
Model saved to saved_models/train_446.pkl
Processed train_446.csv, Model MSE: 0.041065716469621025, Baseline MSE: 1.0765738218045644
Model saved to saved_models/train_352.pkl
Processed train_352.csv, Model MSE: 0.002542640537321867, Baseline MSE: 1.1003476859849317
Model saved to saved_models/train_29.pkl
Processed train_29.csv, Model MSE: 0.0003044780617876211, Baseline MSE: 1.2679947637193216
Model saved to saved_models/train_428.pkl
Processed train_428.csv, Model MSE: 0.0001258365159061472, Baseline MSE: 1.035571471063816
Model saved to saved_models/train_200.pkl
Processed train_200.csv, Model MSE: 0.007147846308769093, Baseline MSE: 1.0100968779411634
Model saved to saved_models/train_165.pkl
Processed train_165.csv, Model MSE: 0.07125170910445752, Baseline MSE: 1.365052065802292
Model saved to saved_models/train_212.pkl
Processed train_212.csv, Model

### Comments

1. **Low MSE Values Indicating Good Model Performance**: 
   - Several models have very low MSE values compared to their baselines, which suggests that they have learned well and can predict the target variable accurately. For instance:
     - `train_453.csv`: Model MSE = 0.000007, Baseline MSE = 1.050442
     - `train_391.csv`: Model MSE = 0.000045, Baseline MSE = 0.932005
     - `train_495.csv`: Model MSE = 0.000035, Baseline MSE = 1.150589
   
   These models outperform the baseline significantly, meaning they are capturing relationships in the data effectively.

2. **High MSE Values Indicating Poorer Model Performance**:
   - Some models, while still performing better than the baseline, have higher MSEs, which suggests they may not be fitting the data as well or there may be room for improvement in the model or feature engineering:
     - `train_218.csv`: Model MSE = 0.510917, Baseline MSE = 1.359575
     - `train_211.csv`: Model MSE = 0.127210, Baseline MSE = 1.464105

   These models show some predictive ability, but their MSEs indicate they are less accurate compared to others.

3. **Anomalous or Poor Results**:
   - Some models have MSE values that are still relatively high or are worse than other models:
     - `train_491.csv`: Model MSE = 0.065203, Baseline MSE = 1.483529
     - `train_204.csv`: Model MSE = 0.128564, Baseline MSE = 1.374284

   These models are not capturing the data relationships as effectively as others and might benefit from further tuning, feature selection, or a change in model architecture.

4. **Models with Very Small MSEs**: 
   - Some models have extremely low MSE values (e.g., `train_505.csv`: MSE = 0.007080), indicating they are very close to perfect predictions. While these are generally positive results, extremely low MSEs can also sometimes indicate potential overfitting, especially if the baseline MSE is still relatively large.

5. **Overall Performance**: 
   - Generally, the models outperform their baselines, which is a positive indication of successful training. However, there is considerable variation in the performance across different datasets (MSE ranging from very small values to higher ones like 0.510917 or 0.128564), suggesting that some datasets may be easier to predict or may benefit from more fine-tuning than others.

### Suggestions for Improvement:
- **Hyperparameter Tuning**: For models with higher MSE values, I should try tuning the random forest's parameters (e.g., `n_estimators`, `max_depth`, `min_samples_leaf`) to improve performance.
- **Feature Engineering**: I should consider exploring feature engineering techniques like normalization, scaling, or creating new features for better results on datasets with higher MSEs.
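- **Cross-Validation**: The pipeline already uses a 5-split `TimeSeriesSplit`; I should extend the evaluation (e.g., different numbers of splits or a held-out final period) to better evaluate model generalization across different parts of the dataset and avoid overfitting on smaller datasets.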