In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [8]:
# Read the post-EDA stock dataset from disk into a DataFrame
input_csv = 'stock_post_eda.csv'
df = pd.read_csv(input_csv)

In [9]:
# Sort the dataframe chronologically.
# Parse 'Date' to datetime first: sorting raw strings is lexicographic, which only
# matches chronological order for ISO-8601-style text (e.g. '2020-01-31'). The lag
# features and the time-ordered train/test split further down rely on true time order.
# If 'Date' is already a datetime column this conversion is a no-op.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

In [10]:
# Derive per-row price features from the existing price columns
df['Daily_Return'] = df['Closing_Price'].pct_change()  # relative change vs. the previous row

# High-low spread, computed once and reused for both derived columns
high_low_spread = df['High_Price'] - df['Low_Price']
df['Price_Range'] = high_low_spread
# Spread relative to the opening price (a plain ratio; not multiplied by 100)
df['Price_Range_Percentage'] = high_low_spread / df['Opening_Price']

In [11]:
# Add shifted copies of closing price and volume as lag features.
# Maps each new column prefix to the column it is shifted from.
lag_sources = {
    'Closing_Price_Lag': 'Closing_Price',
    'Volume_Lag': 'Volume_Millions',
}
for lag in (1, 2, 3, 5, 10):
    # The first `lag` rows of each new column are NaN (no earlier row to pull from)
    for prefix, source_column in lag_sources.items():
        df[f'{prefix}_{lag}'] = df[source_column].shift(lag)

In [12]:
# Discard incomplete rows: the leading rows have NaN in the lag columns and in
# Daily_Return. Note this removes a row if ANY column holds NaN, not only the
# engineered ones.
df = df.dropna(axis=0, how='any')

In [13]:
# Columns that are scaled and then fed into the sequence builder (order matters:
# it fixes the feature axis of the resulting arrays)
numeric_columns = [
    'Closing_Price',
    'Opening_Price',
    'High_Price',
    'Low_Price',
    'Volume_Millions',
]

In [14]:
# Normalize the features using MinMaxScaler.
# Fit the scaler on the earliest rows only, then apply it to every row. Fitting on
# the full frame would let min/max values from the test period leak into the
# training features (look-ahead bias), since the train/test split further down is
# taken in row order.
# Consequence: rows outside the fitting window may scale to values outside [0, 1].
train_fraction = 0.8  # keep in sync with the 80/20 split applied to the sequences
n_fit_rows = max(1, int(len(df) * train_fraction))

scaler = MinMaxScaler()
scaler.fit(df[numeric_columns].iloc[:n_fit_rows])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [15]:
# Prepare data for time series modeling
def create_sequences(data, sequence_length):
    """Build sliding-window samples for next-step prediction.

    Each sample ``X[i]`` is the window ``data[i : i + sequence_length]`` and its
    target ``y[i]`` is the row immediately after that window,
    ``data[i + sequence_length]``.

    Parameters
    ----------
    data : array-like
        Observations indexed along the first axis (converted with ``np.asarray``).
    sequence_length : int
        Number of consecutive rows in each input window; must be non-negative.

    Returns
    -------
    X : np.ndarray
        Shape ``(n_windows, sequence_length, *data.shape[1:])``.
    y : np.ndarray
        Shape ``(n_windows, *data.shape[1:])``.
        ``n_windows`` is ``max(len(data) - sequence_length, 0)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    data = np.asarray(data)
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Not enough rows for even one window: return empty arrays that still carry
        # the window/feature dimensions, so downstream shape checks keep working.
        trailing_shape = data.shape[1:]
        X = np.empty((0, sequence_length) + trailing_shape, dtype=data.dtype)
        y = np.empty((0,) + trailing_shape, dtype=data.dtype)
        return X, y

    X = np.array([data[start:start + sequence_length] for start in range(n_windows)])
    # The targets are simply every row from index `sequence_length` onward (copied).
    y = np.array(data[sequence_length:])
    return X, y

# Window size: each sample holds this many consecutive rows
sequence_length = 10

# Pull the selected columns out as a plain 2-D array (rows x features)
features = df[numeric_columns].to_numpy()

# X: windows of `sequence_length` rows; y: the row that follows each window
X, y = create_sequences(data=features, sequence_length=sequence_length)

In [16]:
# Ordered 80/20 split: the leading 80% of sequences train the model and the
# remainder is held out for testing (no shuffling, row order is preserved)
train_size = int(len(X) * 0.8)
train_part = slice(None, train_size)
test_part = slice(train_size, None)

X_train, y_train = X[train_part], y[train_part]
X_test, y_test = X[test_part], y[test_part]

In [17]:
# Report the dimensions of each split as a single block of text
shape_report = [
    "\nPreprocessed data shape:",
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
]
print("\n".join(shape_report))


Preprocessed data shape:
X_train shape: (7338, 10, 5)
y_train shape: (7338, 5)
X_test shape: (1835, 10, 5)
y_test shape: (1835, 5)


In [18]:
# Persist each split as a .npy file in the current working directory
arrays_to_save = {
    'X_train.npy': X_train,
    'y_train.npy': y_train,
    'X_test.npy': X_test,
    'y_test.npy': y_test,
}
for filename, array in arrays_to_save.items():
    np.save(filename, array)