In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [2]:
# Read the raw CSV, parse the 'date' column as timestamps and use it as the row index
df = pd.read_csv('ett.csv', parse_dates=['date']).set_index('date')

In [3]:
# Count the null entries in each column before any imputation is applied
missing_per_column = df.isnull().sum()
print("Missing values before imputation:")
print(missing_per_column)

Missing values before imputation:
HUFL    0
HULL    0
MUFL    0
MULL    0
LUFL    0
LULL    0
OT      0
dtype: int64


In [4]:
# Impute missing values using KNN imputer.
# The imputer is fitted on the leading 80% of rows only (the same fraction the
# train/test split uses further down) and then applied to every row, so that
# neighbours are never looked up among rows that end up in the test set.
imputer = KNNImputer(n_neighbors=5)
n_fit_rows = int(len(df) * 0.8)  # rows are taken in file order, no shuffling
imputer.fit(df.iloc[:n_fit_rows])
df_imputed = pd.DataFrame(imputer.transform(df), columns=df.columns, index=df.index)

print("\nMissing values after imputation:")
print(df_imputed.isnull().sum())


Missing values after imputation:
HUFL    0
HULL    0
MUFL    0
MULL    0
LUFL    0
LULL    0
OT      0
dtype: int64


In [5]:
# Normalize the data using MinMaxScaler.
# The per-column min/max are learned from the leading 80% of rows only (the
# same fraction the train/test split uses further down) to avoid leaking
# test-period statistics into the training features. The fitted scaler is
# then applied to all rows, so values in the held-out tail may fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
n_fit_rows = int(len(df_imputed) * 0.8)  # rows are taken in file order, no shuffling
scaler.fit(df_imputed.iloc[:n_fit_rows])
df_normalized = pd.DataFrame(
    scaler.transform(df_imputed),
    columns=df_imputed.columns,
    index=df_imputed.index,
)

In [6]:
# Create lag features
def create_lag_features(data: pd.DataFrame, lag=1) -> pd.DataFrame:
    """Build lagged copies of every column in ``data``.

    Shifting is positional (row-based), so the rows of ``data`` are assumed to
    already be in chronological order.

    Parameters
    ----------
    data : pd.DataFrame
        Source frame; it is not modified.
    lag : int or iterable of int, default 1
        Number of rows to shift by. Several lags may be given at once, in
        which case the output holds one group of columns per lag, in the
        order the lags were supplied.

    Returns
    -------
    pd.DataFrame
        Frame with the same index as ``data`` and columns named
        ``"<column>_lag_<k>"``. The first ``k`` rows of each lag-``k`` column
        are NaN.

    Raises
    ------
    ValueError
        If no lag is given, or if any lag is not a positive integer. A zero or
        negative shift would copy current or future values into the "lag"
        columns.
    """
    lags = [lag] if isinstance(lag, (int, np.integer)) else list(lag)
    if not lags:
        raise ValueError("at least one lag must be given")
    for step in lags:
        if not isinstance(step, (int, np.integer)) or step < 1:
            raise ValueError(f"lag must be a positive integer, got {step!r}")

    # Shift the whole frame at once; this also keeps the index when the frame
    # has no columns.
    shifted = [
        data.shift(int(step)).add_suffix(f"_lag_{int(step)}") for step in lags
    ]
    return pd.concat(shifted, axis=1)

# Place the one-step lagged columns next to the normalized ones, then discard
# rows containing NaN (the first row has no predecessor to take a lag from)
lag_features = create_lag_features(df_normalized, lag=1)
df_with_lags = pd.concat([df_normalized, lag_features], axis="columns").dropna()

In [7]:
# Separate the prediction target (y) from the remaining columns (X)
target_column = 'OT'  # Assuming 'OT' is the target variable
y = df_with_lags[target_column]
X = df_with_lags.drop(target_column, axis="columns")

In [8]:
# Ordered hold-out split (no shuffling): the first 80% of rows form the
# training set, the remaining rows form the test set
train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Report the resulting shapes
print("\nShape of training data:")
for label, part in (("X_train:", X_train), ("y_train:", y_train)):
    print(label, part.shape)
print("\nShape of test data:")
for label, part in (("X_test:", X_test), ("y_test:", y_test)):
    print(label, part.shape)


Shape of training data:
X_train: (13935, 13)
y_train: (13935,)

Shape of test data:
X_test: (3484, 13)
y_test: (3484,)


In [9]:
# Write each split to its own CSV file (paths are relative to the working directory)
for filename, split in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split.to_csv(filename)

print("\nPreprocessed data saved to CSV files.")


Preprocessed data saved to CSV files.
