In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller
import xgboost as xgb
import statsmodels.api as sm
import datetime

In [None]:
# Load the dataset (point `csv_path` at the actual file before running)
csv_path = 'path_to_file.csv'
df = pd.read_csv(csv_path)
df.head()  # Preview the first few rows as the cell output

In [None]:
# Exploratory Data Analysis (EDA)
# 1. Data Overview:

# Print column dtypes and non-null counts, then display the number of
# missing values per column as the cell output.
df.info()
df.isna().sum()

In [None]:
# Exploratory Data Analysis (EDA)
# 2. Check for Trends and Seasonality:

# Promote 'Date' to a DatetimeIndex. The guard keeps this cell re-runnable:
# once 'Date' has become the index it is no longer a column, so an unguarded
# df['Date'] lookup would raise KeyError on a second execution.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

# Put rows in chronological order. The shift()/rolling() features and the
# unshuffled train/test split further down all rely on row order being time
# order. A stable sort keeps the relative order of rows sharing a date.
df.sort_index(kind='mergesort', inplace=True)

# Decompose Close into trend, seasonal and residual components.
# NOTE(review): period=30 assumes a ~30-row seasonal cycle, and the
# multiplicative model needs strictly positive, gap-free values -- confirm
# both hold for this dataset.
decomposed = sm.tsa.seasonal_decompose(df['Close'], model='multiplicative', period=30)
decomposed.plot()
plt.show()

In [None]:
# Exploratory Data Analysis (EDA)
# 3. Feature Correlation:

# Correlate numeric columns only. From pandas 2.0 onward, DataFrame.corr()
# no longer silently drops non-numeric columns and raises on them instead,
# so select the numeric columns explicitly before computing correlations.
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Feature Engineering
# 1. Create Lagged Features:

# Close from 1, 2 and 5 rows earlier, stored as Lag_1 / Lag_2 / Lag_5.
for lag in (1, 2, 5):
    df[f'Lag_{lag}'] = df['Close'].shift(lag)

# Trailing 5-row mean of Close (the window includes the current row).
df['Rolling_Mean_5'] = df['Close'].rolling(5).mean()

In [None]:
# Exploratory Data Analysis (EDA)
# 4. Visualize the Closing Price Over Time:

# An earlier cell moves 'Date' into the index via set_index('Date'), after
# which df['Date'] raises KeyError. Take the x-axis values from the index in
# that case, and fall back to the column if 'Date' is still present.
dates = df['Date'] if 'Date' in df.columns else df.index

plt.figure(figsize=(10,6))
plt.plot(dates, df['Close'], label='Closing Price')
plt.title('Stock Closing Price Over Time')
plt.xlabel('Date')
plt.ylabel('Price')
plt.xticks(rotation=45)
plt.legend()
plt.show()

In [None]:
# Feature Engineering
# 2. Add Moving Averages (e.g., 5-day, 20-day):

# Shift by one row before averaging so each window covers only the PREVIOUS
# 5 / 20 closes. 'Close' on the current row is the prediction target, so a
# window that includes it would leak the target into the features.
# The extra leading NaN row this creates is removed by the later dropna().
prior_close = df['Close'].shift(1)
df['SMA_5'] = prior_close.rolling(window=5).mean()
df['SMA_20'] = prior_close.rolling(window=20).mean()

In [None]:
# Feature Engineering
# 3. Drop missing values created by shifting and rolling:

# Remove, in place, every row that has a NaN in any column.
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Split Data into Training and Test Sets

# Model inputs (lags and moving averages) and the target (Close).
feature_cols = ['Lag_1', 'Lag_2', 'Lag_5', 'SMA_5', 'SMA_20']
X = df[feature_cols]
y = df['Close']

# shuffle=False keeps row order, so the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Model Selection and Training
# 1. XGBoost Model:

# Gradient-boosted regressor with library-default hyperparameters,
# fitted on the training portion only.
model = xgb.XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# Model Selection and Training
# 2. Evaluate the Model:

# Predict on the held-out rows and report RMSE (square root of the MSE).
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

In [None]:
# Visualize Model Performance

# Overlay the actual test-set values and the model's predictions on one
# axis, both plotted against the test set's index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.index, y_test, label='True Values', color='blue')
ax.plot(y_test.index, y_pred, label='Predictions', color='red')
ax.set_title('True vs Predicted Closing Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()