# 1. Intro & imports

This section introduces the purpose of this utility file, which is to provide reusable functions for loading, cleaning, analyzing, and preparing datasets for machine learning and data analysis. The necessary imports are included here to support all subsequent functions.

In [None]:
# Styling variables: ANSI escape sequences and plain-text prefixes
# (presumably used when printing formatted output -- confirm against callers).
line_start = '\033[4m'  # ANSI SGR code 4: start underlined text
bold_start = '\033[1m'  # ANSI SGR code 1: start bold text
style_end   = '\033[0m'  # ANSI SGR code 0: reset all text attributes
bullet_start = '\u2022 '  # Unicode bullet character followed by a space
trait_start = '- '  # dash-and-space prefix

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler



# for visual aesthetics for seaborn
sns.set(style="whitegrid")

In [None]:
# Install Prophet
!pip install prophet

# Install Keras and TensorFlow (for LSTM, Transformer, and TCN)
!pip install keras tensorflow

# Install TCN package
!pip install keras-tcn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy (from keras)
  Downloading numpy-2.0.2-cp311-cp311-win_amd64.whl (15.9 MB)
                                              0.0/15.9 MB ? eta -:--:--
     -                                        0.6/15.9 MB 12.4 MB/s eta 0:00:02
     -----                                    2.1/15.9 MB 22.5 MB/s eta 0:00:01
     -----------                              4.5/15.9 MB 32.3 MB/s eta 0:00:01
     ------------------                       7.4/15.9 MB 39.5 MB/s eta 0:00:01
     ------------------------------          12.4/15.9 MB 72.6 MB/s eta 0:00:01
     --------------------------------------  15.9/15.9 MB 93.9 MB/s eta 0:00:01
     --------------------------------------- 15.9/15.9 MB 65.6 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-2.0.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
tables 3.8.0 requires cython>=0.29.21, which is not installed.
transformers 2.1.1 requires sentencepiece, which is not installed.
numba 0.57.0 requires numpy<1.25,>=1.21, but you have numpy 2.0.2 which is incompatible.
scipy 1.10.1 requires numpy<1.27.0,>=1.19.5, but you have numpy 2.0.2 which is incompatible.


Defaulting to user installation because normal site-packages is not writeable


In [None]:
# Model Implementation
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from tcn import TCN
from keras.models import Sequential
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from keras.models import Sequential
from keras.layers import Dense, Input, MultiHeadAttention, Dropout, LayerNormalization

# 2.  Data Loading and Basic Information

This section loads data efficiently and conducts a preliminary inspection, providing insight into the dataset's structure and size.

In [None]:
def load_data(file_path, max_rows=500_000, encoding='ISO-8859-1', dtype=None):
    """
    Loads a subset of a large CSV file, limited by the specified number of rows.

    Parameters:
    - file_path (str): Path to the CSV file.
    - max_rows (int): Maximum number of rows to load. Default is 500,000.
    - encoding (str): File encoding to use. Default is 'ISO-8859-1'.
    - dtype (dict, optional): Column data types for memory efficiency.
      Default is None, which lets pandas infer the types.

    Returns:
    - DataFrame: A DataFrame containing the subset of data.
    """
    # low_memory=False is always passed (it is not a parameter of this
    # function): pandas then parses the file in one pass instead of in
    # chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(file_path, nrows=max_rows, encoding=encoding, dtype=dtype, low_memory=False)

In [None]:
def check(data, glance_size):
    """
    Returns the first rows of the data for a quick visual check.

    Parameters:
    - data (DataFrame): The DataFrame to preview.
    - glance_size (int): Number of leading rows to return.

    Returns:
    - DataFrame: The result of data.head(glance_size).
    """
    return data.head(glance_size)

In [None]:
def size(data):
    """
    Returns the dimensions of the data.

    Parameters:
    - data (DataFrame): Any object exposing a `shape` attribute.

    Returns:
    - tuple: The value of data.shape (rows, columns for a DataFrame).
    """
    return data.shape

# 3. Exploratory Data Analysis

In [None]:
def initial_eda(data, datetime_column=None):
    """
    Performs essential exploratory data analysis for project-focused insights.

    The function only prints; the input DataFrame is left unmodified (the
    datetime column is parsed into a temporary Series for the date-range
    report rather than being overwritten).

    Parameters:
    - data (DataFrame): The DataFrame to analyze.
    - datetime_column (str, optional): The name of the datetime column, if available, for date analysis.

    Returns:
    - None (prints results for each EDA step).
    """

    # 1. Basic Data Overview
    print("----- Data Overview -----")
    print("Dataset Shape:", data.shape)
    print("Column Names:", list(data.columns))
    print("Data Types:\n", data.dtypes)
    print("\nPreview of Data:\n", data.head(), "\n")

    # Unique Values Analysis
    print("----- Unique Values Analysis -----")
    unique_values = data.nunique()
    print(unique_values.sort_values(ascending=True), "\n")

    # 2. Missing Values Summary
    print("----- Missing Values Summary -----")
    missing_data = data.isnull().mean() * 100  # Percentage of missing values
    print(missing_data[missing_data > 0].sort_values(ascending=False), "\n")

    # 3. Basic Statistics for Numerical Columns
    print("----- Basic Statistics (Numerical Columns) -----")
    summary = data.describe().T
    # describe() falls back to count/unique/top/freq when the frame has no
    # numeric columns, so only select the statistics that actually exist.
    stat_columns = [name for name in ('mean', 'std', 'min', 'max') if name in summary.columns]
    if stat_columns:
        print(summary[stat_columns], "\n")  # Focused stats
    else:
        print("No numerical columns to summarize.", "\n")

    # 4. Date-Time Analysis (if datetime column is provided)
    if datetime_column and datetime_column in data.columns:
        print("----- Date-Time Analysis -----")
        # Parse into a local Series: unparseable values become NaT here
        # without touching the caller's column.
        parsed_dates = pd.to_datetime(data[datetime_column], errors='coerce')
        print(f"Date Range for '{datetime_column}':")
        print("Min Date:", parsed_dates.min())
        print("Max Date:", parsed_dates.max(), "\n")

In [None]:
def missing_summ(data):
    """
    Summarizes missing values per column, keeping only columns that have any.

    Parameters:
    - data (DataFrame): The DataFrame to inspect.

    Returns:
    - Series: Count of missing values, indexed by column name, restricted to
      columns with at least one missing value. The same Series is also printed.
    """
    null_counts = data.isnull().sum()
    columns_with_gaps = null_counts[null_counts > 0]
    print(columns_with_gaps)
    return columns_with_gaps

In [None]:
import matplotlib.pyplot as plt

def plot_numeric_histograms(data):
    """
    Plots histograms for all numerical columns in the dataset.

    All numeric dtypes are included (e.g. float32/int32 as well as
    float64/int64). If the DataFrame has no numeric columns, a message is
    printed and nothing is plotted.

    Parameters:
    - data (DataFrame): The DataFrame containing the data to plot.

    Returns:
    - None (displays a plot).
    """
    # 'number' matches every numeric width; timedelta is excluded explicitly
    # because pandas treats it as a numpy number subtype in select_dtypes.
    numeric_columns = data.select_dtypes(include='number', exclude=['timedelta']).columns

    num_cols = len(numeric_columns)
    if num_cols == 0:
        # A zero-row subplot grid would give a zero-height figure.
        print("No numerical columns to plot.")
        return

    # Define the plot layout: 3 plots per row, rounding the row count up
    plots_per_row = 3
    num_rows = -(-num_cols // plots_per_row)

    # Set up the figure size based on the number of subplots needed
    plt.figure(figsize=(15, num_rows * 4))

    for position, column in enumerate(numeric_columns, 1):
        plt.subplot(num_rows, plots_per_row, position)
        plt.hist(data[column].dropna(), bins=30, edgecolor='black')
        plt.title(column)
        plt.xlabel('Value')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_entry_counts(data, date_column, period='M', label_step=6):
    """
    Plots the number of entries per specified period (e.g., month) to visualize data density over time.

    The input DataFrame is not modified: the date column is parsed into a
    temporary Series, and values that cannot be parsed are ignored in the counts.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - date_column (str): The name of the date column.
    - period (str): The period to group by (e.g., 'M' for month, 'Y' for year).
    - label_step (int): Show every `label_step`-th x-axis label. Default is 6.

    Returns:
    - None (displays a plot).
    """
    # Parse locally so the caller's column keeps its original values
    parsed_dates = pd.to_datetime(data[date_column], errors='coerce')

    # Group by the specified period and count entries (NaT is dropped by value_counts)
    entry_counts = parsed_dates.dt.to_period(period).value_counts().sort_index()
    if entry_counts.empty:
        print(f"No valid dates found in '{date_column}'; nothing to plot.")
        return

    # Guard against a zero or negative step, which would break range()/slicing
    label_step = max(1, int(label_step))

    # Plot the counts
    plt.figure(figsize=(12, 6))
    entry_counts.plot(kind='bar')
    plt.title(f"Entry Counts per {period} Period")
    plt.xlabel(f"{period} Period")
    plt.ylabel("Number of Entries")
    tick_positions = range(0, len(entry_counts), label_step)
    plt.xticks(ticks=tick_positions, labels=entry_counts.index[::label_step], rotation=45)
    plt.show()

# 4. Data Cleaning and Missing Value Handling

In [None]:
def fill_missing_values(data, columns, method='interpolate', fill_remaining='forward'):
    """
    Fills missing values in specified numeric columns using the chosen method, with an optional secondary method
    for remaining missing values after interpolation.

    The columns are updated on the passed DataFrame itself, which is also returned.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - columns (list): List of columns to fill missing values in.
    - method (str): Method to fill missing values. Options: 'mean', 'interpolate'.
      Any other value skips the primary fill.
    - fill_remaining (str): Secondary method to fill remaining missing values. Options: 'forward', 'backward', 'mean'.
      Any other value skips the secondary fill.

    Returns:
    - DataFrame: Updated DataFrame with filled missing values in specified columns.
    """
    for col in columns:
        series = data[col]

        if method == 'mean':
            series = series.fillna(series.mean())
        elif method == 'interpolate':
            # Linear interpolation; limit_direction='both' also fills leading/trailing gaps
            series = series.interpolate(method='linear', limit_direction='both')

        # Handle any remaining NaNs after the primary fill
        if fill_remaining == 'forward':
            series = series.ffill()
        elif fill_remaining == 'backward':
            series = series.bfill()
        elif fill_remaining == 'mean':
            series = series.fillna(series.mean())

        # Assign back explicitly: chained `data[col].fillna(..., inplace=True)`
        # may operate on a temporary copy and leave `data` unchanged.
        data[col] = series

    return data

In [None]:
def drop_missing_threshold(data, threshold):
    """
    Drops columns with missing values above a specified threshold percentage.

    The columns are removed from the passed DataFrame in place; the same
    DataFrame is also returned so the result can be assigned or chained.

    Parameters:
    - data (DataFrame): The DataFrame to clean.
    - threshold (float): Percentage (0-100). Columns whose share of missing
      values is strictly greater than this are dropped.

    Returns:
    - DataFrame: The same DataFrame object, after the columns were dropped.
    """
    # Calculate missing percentage for each column
    missing_percentage = data.isnull().mean() * 100
    # Identify columns to drop
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    # Drop in place; the return value of an in-place drop is None, so it is
    # deliberately not assigned to anything.
    data.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns with >{threshold}% missing values: {list(columns_to_drop)}")
    return data

In [None]:
def drop_columns(data, columns_to_drop):
    """
    Removes the given columns from the DataFrame in place and reports them.

    Parameters:
    - data (DataFrame): The DataFrame to modify.
    - columns_to_drop (list): Column labels to remove.

    Returns:
    - None (the DataFrame is modified in place).
    """
    data.drop(labels=columns_to_drop, axis='columns', inplace=True)
    print(f"Dropped columns: {columns_to_drop}")

In [None]:
def categorical_filling(data, column):
    """
    Fills missing values in a categorical column with its most frequent value.

    The column is updated on the passed DataFrame itself, which is also
    returned. If the column has no non-missing values there is no mode to
    fill with, so the column is left as-is and a message is printed.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - column (str): Name of the column to fill.

    Returns:
    - DataFrame: The same DataFrame with the column filled.
    """
    # mode() returns every tied value; it is empty when the column is all-missing
    modes = data[column].mode()
    if modes.empty:
        print(f"Column '{column}' has no non-missing values; nothing to fill with.")
    else:
        # Use the first mode if there are multiple. Assign back explicitly:
        # a chained in-place fillna may act on a temporary copy.
        data[column] = data[column].fillna(modes.iloc[0])

    # Check remaining missing values in the whole DataFrame
    missing_summary = data.isnull().sum()
    print("Remaining missing values:\n", missing_summary[missing_summary > 0])

    return data


# 5. Feature Engineering

In [None]:
# Pairs of collinear columns: the FIRST entry of each pair is the one that
# gets dropped, the second is kept.
collinear_pairs = [
    ('height', 'station_pressure_hourly_mB'),
    ('wind_speed_hourly_m_s', 'wind_gust_max_m_s'),
]


def drop_collinear_features(data, collinear_pairs):
    """
    Drops the first feature of every collinear pair.

    Parameters:
    - data (DataFrame): The DataFrame containing the features.
    - collinear_pairs (list): Sequence of pairs; element 0 of each pair is dropped.

    Returns:
    - DataFrame: A new DataFrame without the dropped features.
    """
    features_to_drop = []
    for pair in collinear_pairs:
        features_to_drop.append(pair[0])

    print(f"Dropping collinear features: {features_to_drop}")
    reduced = data.drop(columns=features_to_drop)
    return reduced

In [None]:
def drop_weakly_correlated_features(data, target_variable, threshold=0.2):
    """
    Drops numeric features whose absolute correlation with the target is below a threshold.

    Only numeric columns take part in the correlation; non-numeric columns are
    never dropped. Features whose correlation is NaN (e.g. constant columns)
    are kept, because NaN does not compare as below the threshold.

    Parameters:
    - data (DataFrame): The DataFrame containing the features.
    - target_variable (str): The name of the target variable.
    - threshold (float): The minimum absolute correlation required to keep a feature.

    Returns:
    - DataFrame: The DataFrame with weakly correlated features dropped.

    Raises:
    - KeyError: If target_variable is not a column of data.
    - ValueError: If target_variable is not a numeric column.
    """
    if target_variable not in data.columns:
        raise KeyError(f"Target variable '{target_variable}' not found in data.")

    # numeric_only=True: without it, pandas 2.x raises on text columns
    correlation_matrix = data.corr(numeric_only=True)
    if target_variable not in correlation_matrix.columns:
        raise ValueError(f"Target variable '{target_variable}' must be numeric to compute correlations.")

    target_correlations = correlation_matrix[target_variable]
    weak_corr_features = [
        col for col, value in target_correlations.items()
        if col != target_variable and abs(value) < threshold
    ]
    print(f"Dropping weakly correlated features: {weak_corr_features}")
    return data.drop(columns=weak_corr_features)


In [None]:
def _season_from_month(month):
    """Maps a month number (1-12) to a season code: 1 winter, 2 spring, 3 summer, 4 autumn."""
    if month in (12, 1, 2):
        return 1  # Winter
    if month in (3, 4, 5):
        return 2  # Spring
    if month in (6, 7, 8):
        return 3  # Summer
    return 4  # Autumn


def _time_of_day_from_hour(hour):
    """Maps an hour (0-23) to a time-of-day code: 1 early morning, 2 morning, 3 afternoon, 4 evening, 5 night."""
    if 5 <= hour < 9:
        return 1  # Early Morning
    if 9 <= hour < 12:
        return 2  # Morning
    if 12 <= hour < 16:
        return 3  # Afternoon
    if 16 <= hour < 20:
        return 4  # Evening
    return 5  # Night


def standardize_date_index_MonthHour(data, date_column):
    """
    Parses the date column, derives calendar features, and indexes the data by datetime.

    Rows whose date cannot be parsed are dropped. Three numeric columns are
    added: 'month' (1-12), 'season' (1 winter .. 4 autumn) and 'time_of_day'
    (1 early morning .. 5 night). The input DataFrame is not modified.

    Parameters:
    - data (DataFrame): The DataFrame containing the data. The date may be a
      regular column or already be the (named) index.
    - date_column (str): The name of the date column / index.

    Returns:
    - DataFrame: A new DataFrame indexed by the parsed datetime.
    """
    # If the datetime column is already the index, turn it back into a column first
    source = data.reset_index() if data.index.name == date_column else data

    # Convert to datetime; unparseable values become NaT
    parsed_dates = pd.to_datetime(source[date_column], errors='coerce')
    valid_rows = parsed_dates.notna()

    # Work on an explicit copy of the valid rows so the caller's frame is
    # untouched and the column assignments below do not hit a view.
    result = source.loc[valid_rows].copy()
    result[date_column] = parsed_dates[valid_rows]

    # Extract the month as a numerical value and derive the numeric season
    result['month'] = result[date_column].dt.month
    result['season'] = result['month'].apply(_season_from_month)

    # Classify the hour into a numeric time-of-day category
    result['time_of_day'] = result[date_column].dt.hour.apply(_time_of_day_from_hour)

    # Set the exact datetime as the index
    result.set_index(date_column, inplace=True)
    return result

# 6. Outlier Detection and Handling


In [None]:
def plot_boxplots(data, numerical_columns):
    """
    Draws one boxplot figure per listed column to make outliers visible.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - numerical_columns (list): Names of the columns to plot.

    Returns:
    - None (displays one plot per column).
    """
    for column_name in numerical_columns:
        plt.figure(figsize=(8, 4))  # a fresh figure for every column
        sns.boxplot(x=data[column_name])
        plt.title(f'Boxplot of {column_name}')
        plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler
def handle_outliers(data, columns, method='cap'):
    """
    Handles outliers in the given columns, either by IQR capping or by robust scaling.

    With 'cap', values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to
    those bounds, column by column. With 'robust_scale', the columns are
    transformed with a RobustScaler. The input DataFrame is not modified.

    Parameters:
    - data (DataFrame): The DataFrame containing the data.
    - columns (list): List of columns to process.
    - method (str): 'cap' or 'robust_scale'.

    Returns:
    - DataFrame: A copy of the data with outliers handled.

    Raises:
    - ValueError: If method is neither 'cap' nor 'robust_scale'.
    """
    result = data.copy()

    if method == 'robust_scale':
        # Apply RobustScaler to the selected columns
        result[columns] = RobustScaler().fit_transform(result[columns])
        return result

    if method != 'cap':
        raise ValueError("Method not recognized. Use 'cap' or 'robust_scale'.")

    for name in columns:
        first_quartile = result[name].quantile(0.25)
        third_quartile = result[name].quantile(0.75)
        spread = third_quartile - first_quartile
        # Clip everything beyond the 1.5 * IQR fences
        result[name] = result[name].clip(
            lower=first_quartile - 1.5 * spread,
            upper=third_quartile + 1.5 * spread,
        )

    return result

## Data Splitting


In [None]:
from sklearn.model_selection import train_test_split

def split_data(dataset, target_column, test_size=0.2, random_state=42, shuffle=False):
    """
    Separates features from the target and splits both into train and test parts.

    Parameters:
    - dataset (DataFrame): The full dataset to split.
    - target_column (str): The name of the target column.
    - test_size (float): The proportion of the dataset to include in the test split.
    - random_state (int): Seed passed to train_test_split.
    - shuffle (bool): Whether to shuffle the data before splitting. Default is False for time-series.

    Returns:
    - X_train, X_test, y_train, y_test: Split features and target sets.
    """
    features = dataset.drop(columns=[target_column])
    target = dataset[target_column]

    split_parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    X_train, X_test, y_train, y_test = split_parts

    print(f"Data split completed. Training shape: {X_train.shape}, Testing shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test


# Model Implementation

### SVR

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def implement_svr(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale'):
    """
    Builds an (unfitted) pipeline that standardizes features and then applies SVR.

    Parameters:
    - kernel (str): Kernel type (e.g., 'rbf', 'linear'). Default is 'rbf'.
    - C (float): Regularization parameter. Default is 1.0.
    - epsilon (float): Epsilon in the epsilon-SVR model. Default is 0.1.
    - gamma (str): Kernel coefficient for 'rbf'. Default is 'scale'.

    Returns:
    - model: Pipeline of StandardScaler followed by the configured SVR.
    """
    feature_scaler = StandardScaler()
    regressor = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma)
    return make_pipeline(feature_scaler, regressor)


### Transformer Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input, MultiHeadAttention, Dropout, LayerNormalization

def implement_transformer(input_dim, num_heads=4, ff_dim=128, dropout_rate=0.1, epochs=20, batch_size=32):
    """
    Builds and compiles a Transformer-style model for time-series forecasting.

    The layer stack is: self-attention -> dropout -> layer normalization ->
    Dense(ff_dim, relu) -> Dense(1). The model is compiled but NOT trained;
    call model.fit(...) on the result.

    Parameters:
    - input_dim (int): Dimension of the input features.
    - num_heads (int): Number of attention heads. Default is 4.
    - ff_dim (int): Dimensionality of the feed-forward layer. Default is 128.
    - dropout_rate (float): Dropout rate. Default is 0.1.
    - epochs (int): Unused here; kept so existing calls keep working. Default is 20.
    - batch_size (int): Unused here; kept so existing calls keep working. Default is 32.

    Returns:
    - model: Compiled (untrained) Keras model taking inputs of shape
      (batch, timesteps, input_dim).
    """
    # MultiHeadAttention takes separate query and value tensors, so it cannot
    # be stacked in a Sequential model (one input tensor per layer); the
    # functional API is used instead.
    from keras.models import Model

    inputs = Input(shape=(None, input_dim))
    # Self-attention: the sequence attends to itself (query == value)
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=input_dim)(inputs, inputs)
    hidden = Dropout(dropout_rate)(attention)
    hidden = LayerNormalization()(hidden)
    hidden = Dense(ff_dim, activation='relu')(hidden)
    outputs = Dense(1)(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')
    return model


### TCN

In [None]:
from tcn import TCN
from keras.models import Sequential

def implement_tcn(data, lookback=1, filters=64, kernel_size=3, epochs=20, batch_size=32, optimizer='adam', loss='mse'):
    """
    Implements a Temporal Convolutional Network (TCN) model for time-series forecasting.

    The series is min-max scaled, turned into sliding windows of `lookback`
    past values predicting the next value, and the model is trained on them.
    The scaler is not returned, so the model's predictions are in the scaled
    space rather than in the original units.

    Parameters:
    - data (array-like): Input time-series data (list, NumPy array or pandas Series).
    - lookback (int): Number of previous timesteps to consider. Default is 1.
    - filters (int): Number of convolutional filters. Default is 64.
    - kernel_size (int): Kernel size for convolution. Default is 3.
    - epochs (int): Number of training epochs. Default is 20.
    - batch_size (int): Batch size for training. Default is 32.
    - optimizer (str): Optimizer to use (e.g., 'adam'). Default is 'adam'.
    - loss (str): Loss function to use (e.g., 'mse'). Default is 'mse'.

    Returns:
    - model: Trained TCN model.

    Raises:
    - ValueError: If lookback is less than 1 or the series is not longer than lookback.
    """
    # np.asarray lets lists and pandas Series through, not only ndarrays
    values = np.asarray(data, dtype=float).reshape(-1, 1)
    if lookback < 1:
        raise ValueError("lookback must be at least 1.")
    if len(values) <= lookback:
        raise ValueError("data must contain more than `lookback` observations.")

    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(values)

    # Window i holds the `lookback` values preceding position i; its target is value i
    X = np.array([data_scaled[i - lookback:i, 0] for i in range(lookback, len(data_scaled))])
    y = np.array(data_scaled[lookback:, 0])
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))  # (samples, timesteps, 1 feature)

    model = Sequential()
    model.add(TCN(input_shape=(X.shape[1], 1), filters=filters, kernel_size=kernel_size))
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss=loss)
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1)
    return model


### LSTM Model

In [None]:


def implement_lstm(data, lookback=1, units=50, epochs=20, batch_size=32, optimizer='adam', loss='mse'):
    """
    Implements an LSTM model for time-series forecasting.

    The series is min-max scaled, turned into sliding windows of `lookback`
    past values predicting the next value, and the model is trained on them.
    The scaler is not returned, so the model's predictions are in the scaled
    space rather than in the original units.

    Parameters:
    - data (array-like): Input time-series data (list, NumPy array or pandas Series).
    - lookback (int): Number of previous timesteps to consider. Default is 1.
    - units (int): Number of LSTM units. Default is 50.
    - epochs (int): Number of training epochs. Default is 20.
    - batch_size (int): Batch size for training. Default is 32.
    - optimizer (str): Optimizer to use (e.g., 'adam'). Default is 'adam'.
    - loss (str): Loss function to use (e.g., 'mse'). Default is 'mse'.

    Returns:
    - model: Trained LSTM model.

    Raises:
    - ValueError: If lookback is less than 1 or the series is not longer than lookback.
    """
    # np.asarray lets lists and pandas Series through, not only ndarrays
    values = np.asarray(data, dtype=float).reshape(-1, 1)
    if lookback < 1:
        raise ValueError("lookback must be at least 1.")
    if len(values) <= lookback:
        raise ValueError("data must contain more than `lookback` observations.")

    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(values)

    # Window i holds the `lookback` values preceding position i; its target is value i
    X = np.array([data_scaled[i - lookback:i, 0] for i in range(lookback, len(data_scaled))])
    y = np.array(data_scaled[lookback:, 0])
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))  # (samples, timesteps, 1 feature)

    model = Sequential()
    model.add(LSTM(units=units, input_shape=(X.shape[1], 1)))
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss=loss)
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1)
    return model


### Prophet

In [None]:
# Reshape data for Prophet
def reshape_for_prophet(data, date_col, target_col):
    """
    Selects the date and target columns and renames them to 'ds' and 'y'.

    Parameters:
    - data (DataFrame): Input dataset.
    - date_col (str): Column name for the date.
    - target_col (str): Column name for the target variable.

    Returns:
    - DataFrame: New two-column dataset with columns 'ds' and 'y'.
    """
    column_mapping = {date_col: 'ds', target_col: 'y'}
    selected = data[[date_col, target_col]]
    return selected.rename(columns=column_mapping)


In [None]:
def implement_prophet(train_data, growth='linear', seasonality_mode='additive', changepoint_prior_scale=0.05):
    """
    Implements the Prophet model on training data.

    A custom 'monthly' seasonality (period 30.5 days, Fourier order 5) is
    added before fitting.

    Parameters:
    - train_data (DataFrame): Training dataset with 'ds' (date) and 'y' (target).
    - growth (str): Growth trend ('linear' or 'logistic').
    - seasonality_mode (str): Seasonality mode ('additive' or 'multiplicative').
    - changepoint_prior_scale (float): Regularization strength for changepoints.

    Returns:
    - model (Prophet object): Fitted Prophet model.
    """
    # Imported here because this file has no module-level import of Prophet;
    # without it the call below raises NameError.
    from prophet import Prophet

    model = Prophet(growth=growth, seasonality_mode=seasonality_mode, changepoint_prior_scale=changepoint_prior_scale)
    model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(train_data)
    return model


