In [None]:
# Import required libraries
import pandas as pd                       # Data loading & manipulation
import matplotlib.pyplot as plt           # Basic plotting
import seaborn as sns                    # Statistical data visualization
import plotly.express as px              # Interactive plotting
import plotly.graph_objects as go        # Advanced interactive plots
from plotly.subplots import make_subplots
import warnings                          # Warning control
# Silence every warning for the rest of the session so library notices do not
# clutter cell output.
# NOTE(review): this is a blanket filter, so warnings raised by any later cell
# are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Apply seaborn's 'whitegrid' theme to the matplotlib/seaborn figures drawn below
sns.set_theme(style='whitegrid')

In [None]:
# Load the dataset
# Location of the CSV file that holds the stock data
DATASET_PATH = '/path/to/dataset'

# Parse the CSV into a pandas DataFrame
df = pd.read_csv(DATASET_PATH)

# Quick look at the data: render the first three rows as the cell output
df.head(n=3)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Summary Statistics
# Descriptive statistics for the numerical columns: count, mean, std, min,
# the 25% / 50% / 75% quartiles and max. The call is the cell's last
# expression, so the notebook renders the result as a table.
df.describe()

# 📈 Stock Market Analysis – Exploratory Data Analysis

## 1. Price Trend Over Time

In [None]:
# --- Line Plot: Closing Price Trend Over Time ---
# Draws one line per company (Apple, Amazon, Google, Microsoft, NVIDIA) from its
# Close_<ticker> column. The date axis carries range-selector buttons and a
# range slider so the reader can zoom into a time window interactively.

# Ticker -> legend label, in the order the traces are drawn
ticker_labels = {
    'AAPL': 'Apple (AAPL)',
    'AMZN': 'Amazon (AMZN)',
    'GOOGL': 'Google (GOOGL)',
    'MSFT': 'Microsoft (MSFT)',
    'NVDA': 'NVIDIA (NVDA)',
}

# One line trace per company
plt_data = [
    go.Scatter(x=df['Date'], y=df[f'Close_{ticker}'], mode='lines', name=label)
    for ticker, label in ticker_labels.items()
]

# Look-back shortcuts shown above the plot; the last button resets to the full range
lookback_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(count=5, label="5y", step="year", stepmode="backward"),
    dict(step="all"),
]

# Date axis with both zoom controls attached
date_axis = dict(
    title='Date',
    rangeselector=dict(buttons=lookback_buttons),
    rangeslider=dict(visible=True),
    type="date",
)

layout = go.Layout(
    title='Stock Price Trend Over Time (2010–2025)',
    xaxis=date_axis,
    yaxis=dict(title='Stock Closing Price (USD)'),
    template='plotly_dark',
    height=600,
)

fig = go.Figure(data=plt_data, layout=layout)
fig.show(renderer="notebook")

## 2. Daily High-Low Volatility

In [None]:
# --- Candlestick Plot: Daily High-Low Volatility (NVIDIA) ---
# Shows NVIDIA's daily open, high, low and close prices as a candlestick chart,
# which makes day-to-day price swings easy to see.

# Parse Date before sorting: a text column sorts lexicographically, which is
# only chronological for some date formats. Parsing first guarantees the rows
# end up in true time order. The conversion is idempotent, so re-running the
# cell is safe.
df['Date'] = pd.to_datetime(df['Date'])

# Sort in place: later cells compute rolling statistics row by row and rely on
# df being in chronological order.
df.sort_values('Date', inplace=True)

# Create a candlestick chart for NVIDIA (you can repeat for other stocks)
fig = go.Figure(data=[
    go.Candlestick(
        x=df['Date'],
        open=df['Open_NVDA'],
        high=df['High_NVDA'],
        low=df['Low_NVDA'],
        close=df['Close_NVDA'],
        name='NVIDIA'
    )
])

# Update layout for better readability; the candlestick's built-in range slider
# is switched off
fig.update_layout(
    title='NVIDIA Daily Price Volatility (Candlestick Chart)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=False,
    template='plotly_dark',
    height=600
)

fig.show(renderer="notebook")

In [None]:
# --- Area Plot: Daily High-Low Range (Apple) ---
# Plots the gap between each day's high and low price as a filled area, so
# taller regions mark days with wider price swings.

# Daily trading range = high minus low, stored on df as a new column
df['Range_AAPL'] = df['High_AAPL'].sub(df['Low_AAPL'])

range_trace = go.Scatter(
    x=df['Date'],
    y=df['Range_AAPL'],
    name='Apple High-Low Range',
    mode='lines',
    fill='tozeroy',  # shade from the line down to y = 0
    line=dict(color='skyblue'),
)

fig = go.Figure(data=[range_trace])
fig.update_layout(
    title='Apple Daily Volatility (High - Low Range)',
    xaxis_title='Date',
    yaxis_title='Price Range (USD)',
    template='plotly_white',
    height=500,
)

fig.show(renderer="notebook")

In [None]:
# --- Filled Area Plot: Daily High-Low Band (Amazon) ---
# Draws Amazon's daily high and low prices as two lines and shades the gap
# between them, so the width of the band shows each day's trading range.

# Upper edge of the band: daily high
amzn_high = go.Scatter(
    x=df['Date'],
    y=df['High_AMZN'],
    mode='lines',
    name='High Price',
    line=dict(color='green'),
)

# Lower edge of the band: daily low. 'tonexty' shades from this trace up to the
# trace added before it (the high line); the fill is kept mostly transparent.
amzn_low = go.Scatter(
    x=df['Date'],
    y=df['Low_AMZN'],
    mode='lines',
    name='Low Price',
    line=dict(color='red'),
    fill='tonexty',
    fillcolor='rgba(255,0,0,0.1)',
)

# Trace order matters: the high line must come first for the fill to work
fig = go.Figure(data=[amzn_high, amzn_low])
fig.update_layout(
    title='Amazon Daily High-Low Volatility (2010–2025)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    template='plotly_white',
    height=500,
)

fig.show(renderer="notebook")

In [None]:
# --- Band Plot: Daily High-Low Volatility (Microsoft) ---
# Draws Microsoft's daily high and low prices and shades the area between them,
# so the width of the band shows each day's trading range.

# Create the band plot for Microsoft (MSFT)
fig = go.Figure()

# Upper edge of the band: the high price, drawn as a plain line.
# It must NOT carry fill='tonexty': that mode fills towards the trace added
# *before* it, and with no earlier trace Plotly falls back to filling down to
# y = 0, which would flood the whole chart instead of just the band.
fig.add_trace(go.Scatter(
    x=df['Date'],
    y=df['High_MSFT'],
    mode='lines',
    name='Microsoft High Price',
    line=dict(color='green')
))

# Lower edge of the band: the low price, shaded up to the previous (high) trace
fig.add_trace(go.Scatter(
    x=df['Date'],
    y=df['Low_MSFT'],
    mode='lines',
    name='Microsoft Low Price',
    line=dict(color='red'),
    fill='tonexty',  # Fills the area between this trace and the previous (high)
    fillcolor='rgba(255, 0, 0, 0.2)'  # Semi-transparent fill for the high-low band
))

# Update layout
fig.update_layout(
    title='Microsoft Daily High-Low Volatility (2010–2025)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    template='plotly_white',
    height=500
)

fig.show(renderer="notebook")

## 3. Moving Averages (20-day, 50-day, 200-day) 

In [None]:
# --- Line Plot: Google Moving Averages (20-day, 50-day, 200-day) ---
# Overlays three rolling means on Google's closing price. Each average smooths
# out daily fluctuations over its window, which makes the trend easier to read.

# Window length (in rows) -> line colour for that moving average
ma_styles = {20: 'orange', 50: 'green', 200: 'red'}

# Store each rolling mean on df as MA<window>
for window_days in ma_styles:
    df[f'MA{window_days}'] = df['Close_GOOGL'].rolling(window=window_days).mean()

# Start with the raw closing price as a solid line
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df['Date'],
    y=df['Close_GOOGL'],
    mode='lines',
    name='Google Close Price',
    line=dict(color='blue'),
))

# Then add one dashed line per moving average, shortest window first
for window_days, colour in ma_styles.items():
    fig.add_trace(go.Scatter(
        x=df['Date'],
        y=df[f'MA{window_days}'],
        mode='lines',
        name=f'{window_days}-day Moving Average',
        line=dict(color=colour, dash='dash'),
    ))

fig.update_layout(
    title='Google Stock Price with Moving Averages (20-day, 50-day, 200-day)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    template='plotly_dark',
    height=600,
)

fig.show(renderer="notebook")

## 4. Volume vs Price

In [None]:
# --- Dual-Axis Plot: Volume vs Price (Microsoft) ---
# Shows daily trading volume (bars, left axis) against the closing price
# (line, right axis) for Microsoft, to see how volume changes line up with
# price movements. The labels name Microsoft because the plotted columns are
# Volume_MSFT and Close_MSFT.

# Single subplot with a secondary y-axis: make_subplots creates the right-hand
# axis overlaid on the primary one, so no manual axis wiring is needed.
fig = make_subplots(specs=[[{'secondary_y': True}]])

# Volume bars on the primary (left) axis
fig.add_trace(
    go.Bar(
        x=df['Date'],
        y=df['Volume_MSFT'],
        name='Volume',
        marker_color='rgba(246, 78, 139, 0.6)',
    ),
    secondary_y=False,
)

# Closing price line on the secondary (right) axis
fig.add_trace(
    go.Scatter(
        x=df['Date'],
        y=df['Close_MSFT'],
        mode='lines',
        name='Close Price',
        line=dict(color='blue'),
    ),
    secondary_y=True,
)

# Titles, theme and size
fig.update_layout(
    title='Microsoft Stock Price vs Trading Volume',
    xaxis_title='Date',
    template='plotly_dark',
    height=600,
)
fig.update_yaxes(title_text='Volume', secondary_y=False)
fig.update_yaxes(title_text='Close Price (USD)', secondary_y=True)

fig.show(renderer="notebook")

# Blue Line: Microsoft stock's closing price.

# Pink Bars: Trading volume for Microsoft stock.

## 5. Box Plot by Year or Month

In [None]:
# --- Box Plot by Year: Closing Prices Grouped by Year ---
# One panel per stock (AAPL, AMZN, GOOGL, MSFT, NVDA), each showing the
# distribution of that stock's closing price within every calendar year.
# This makes year-to-year price variation and outliers visible.

# Ensure the Date column is in datetime format (idempotent if it already is)
df['Date'] = pd.to_datetime(df['Date'])

# Calendar year of each row, used as the grouping variable
df['Year'] = df['Date'].dt.year

# Tickers and the display names used in panel titles (same order)
companies = ['AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA']
titles = ['Apple', 'Amazon', 'Google', 'Microsoft', 'NVIDIA']

# 2 x 3 grid: six slots for five companies, flattened for simple iteration
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
flat_axes = axes.ravel()

for ax, company, title in zip(flat_axes, companies, titles):
    # Rows with a missing close for this company are left out of its panel
    data = df[['Year', f'Close_{company}']].dropna()

    sns.boxplot(data=data, x='Year', y=f'Close_{company}', ax=ax, palette='Set2')

    ax.set_title(f'{title} Closing Prices by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Price (USD)')
    # Many years share one axis; tilt the labels so they do not overlap
    ax.tick_params(axis='x', rotation=45)

# Hide the grid slots that received no plot, so no empty frame is displayed
for ax in flat_axes[len(companies):]:
    ax.set_visible(False)

# Lay out once, after all panels are drawn, then display
plt.tight_layout()
plt.show()

## 6. Heatmap of Correlation

In [None]:
# --- Heatmap of Correlation: Closing Prices Among All 5 Companies ---
# Computes the pairwise correlation of the five closing-price columns
# (AAPL, AMZN, GOOGL, MSFT, NVDA) and renders it as an annotated heatmap,
# which shows at a glance which stocks tend to move together.

# The five Close_<ticker> columns
close_columns = [f'Close_{ticker}' for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 'NVDA')]

# Pairwise correlation matrix of the closing prices
correlation_matrix = df[close_columns].corr()

# Display options for the heatmap
heatmap_options = dict(
    annot=True,                         # write the correlation value in each cell
    cmap='coolwarm',                    # colour palette
    linewidths=0.5,                     # thin separator lines between cells
    fmt='.2f',                          # two decimals in the annotations
    cbar_kws={'label': 'Correlation'},  # colour-bar caption
    square=True,                        # square cells
)

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Matrix of Stock Closing Prices', fontsize=10)
plt.tight_layout()
plt.show()

## 7. Rolling Standard Deviation (Volatility Measure)

In [None]:
# --- Rolling Standard Deviation (Volatility Measure) ---
# Plots, for each company, the standard deviation of its closing price over a
# sliding window of `window` rows, as a simple measure of how much the price
# has been moving recently.

# Rolling window length in rows (one row per trading day)
window = 30

# Closing-price columns to analyse and the line colour for each
companies = ['Close_AAPL', 'Close_AMZN', 'Close_GOOGL', 'Close_MSFT', 'Close_NVDA']
colors = ['blue', 'green', 'red', 'orange', 'purple']

# Rolling standard deviation per column; the first window-1 rows are NaN
rolling_volatility = df[companies].rolling(window).std()

# Use real dates for the x-axis so it matches its 'Date' label. Date is normally
# a column here, but the ARIMA cell moves it into the index, so fall back to
# the index when this cell is re-run afterwards.
dates = df['Date'] if 'Date' in df.columns else df.index

plt.figure(figsize=(12, 6))

# One line per company
for company, color in zip(companies, colors):
    plt.plot(dates, rolling_volatility[company], label=f'{company} Rolling Std Dev', color=color)

# Title is derived from `window` so it stays correct if the window is changed
plt.title(f'{window}-Day Rolling Standard Deviation (Volatility Measure) of Closing Prices', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Rolling Standard Deviation (Volatility)')
plt.legend()
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()

## 8. Drawdowns (Peak to Trough Loss)

In [None]:
# --- Drawdowns (Peak to Trough Loss) ---
# For each stock, plots how far the closing price sits below its running
# all-time high. 0 means the price is at a new peak; negative values are the
# loss relative to the latest peak.

# Closing-price columns to analyse and the fill colour for each
companies = ['Close_AAPL', 'Close_AMZN', 'Close_GOOGL', 'Close_MSFT', 'Close_NVDA']
colors = ['blue', 'green', 'red', 'orange', 'purple']

closes = df[companies]

# Running maximum of each column (the peak reached so far)
df_peak = closes.cummax()

# Drawdown as a fraction of the peak (0 at a peak, negative below it)
drawdowns = (closes - df_peak) / df_peak

# Use real dates for the x-axis so it matches its 'Date' label. Date is normally
# a column here, but the ARIMA cell moves it into the index, so fall back to
# the index when this cell is re-run afterwards.
dates = df['Date'] if 'Date' in df.columns else df.index

plt.figure(figsize=(12, 6))

# Shade each company's drawdown between the curve and zero. The fraction is
# scaled by 100 so the values agree with the '%' in the axis label.
for company, color in zip(companies, colors):
    plt.fill_between(dates, drawdowns[company] * 100, label=f'{company} Drawdown', color=color, alpha=0.5)

# Add title and labels
plt.title('Peak-to-Trough Drawdowns of Stock Prices', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Drawdown (%)')
plt.legend()
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()

## 9. Lag Plots & ACF/PACF

In [None]:
# ### 🧠 Forecasting Preparation: Lag Plot, ACF & PACF

# To prepare for forecasting models like **ARIMA** or **SARIMAX**, we use **Lag Plots**, **ACF**, and **PACF**. These tools are essential for identifying autocorrelation in time series data and understanding how past values influence future trends.

# ---

# #### 🔁 Steps:

# - **📉 Lag Plot:**  
#   A lag plot shows the relationship between values of a time series and its past values (lags).  
#   It's useful for visually inspecting autocorrelation and identifying non-random patterns in the data.

# - **📊 ACF (AutoCorrelation Function):**  
#   ACF measures the correlation between the time series and its own lagged versions.  
#   It helps assess how strongly current values relate to past values over multiple time steps.

# - **📈 PACF (Partial AutoCorrelation Function):**  
#   PACF shows the correlation between the time series and its lagged versions, **while controlling for shorter lags**.  
#   It’s especially useful for determining the optimal number of lags to include in ARIMA models.

# ---

# These tools provide insights into the underlying patterns of the time series and guide the selection of appropriate parameters for forecasting models.


In [None]:
# --- Lag Plots & ACF/PACF ---
# Diagnostic plots for autocorrelation, used to prepare ARIMA/SARIMAX modelling.
# This cell draws the lag plot; the ACF and PACF cells below reuse the imports,
# `df_9` and `company` defined here.

from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Date-indexed copy of the data (df itself is left unchanged)
df_9 = df.set_index('Date')

# Series to analyse (e.g., AAPL)
company = 'Close_AAPL'

# --- Lag Plot ---
# Scatter of each value against the value one step earlier
fig_lag, ax_lag = plt.subplots(figsize=(6, 4))
lag_plot(df_9[company], ax=ax_lag)
ax_lag.set_title(f'Lag Plot for {company}')
plt.tight_layout()
plt.show()

In [None]:
# --- ACF Plot ---
# ACF shows the autocorrelation of the time series at different lags.
# Relies on `df_9`, `company` and `plot_acf` from the lag-plot cell above.

# Create the axes explicitly and hand them to plot_acf. Without `ax`, plot_acf
# opens a figure of its own, which would leave a sized-but-empty figure behind
# and ignore the requested figsize.
fig, ax = plt.subplots(figsize=(10, 6))

# Drop missing values first: NaNs in the input would propagate into the
# autocorrelation estimates.
plot_acf(df_9[company].dropna(), lags=40, ax=ax)  # Adjust the lags for a better view

ax.set_title(f'ACF Plot for {company}')
plt.tight_layout()
plt.show()

In [None]:
# --- PACF Plot ---
# PACF shows partial autocorrelation, useful for ARIMA model order selection.
# Relies on `df_9`, `company` and `plot_pacf` from the lag-plot cell above.

# Create the axes explicitly and hand them to plot_pacf. Without `ax`, plot_pacf
# opens a figure of its own, which would leave a sized-but-empty figure behind
# and ignore the requested figsize.
fig, ax = plt.subplots(figsize=(10, 6))

# Drop missing values first: NaNs in the input would propagate into the
# partial-autocorrelation estimates.
plot_pacf(df_9[company].dropna(), lags=40, ax=ax)  # Adjust the lags for a better view

ax.set_title(f'PACF Plot for {company}')
plt.tight_layout()
plt.show()

# Model Training and Evaluation

Future prices of one stock only (e.g., Apple's close prices)              
➤ Use univariate forecasting models.

Future prices using all stocks together (e.g., predict NVDA using AAPL, GOOGL, etc.)
➤ Use multivariate regression models or multivariate time series models.


| Goal                                 | Technique                          | When to Use                                           | Pros                                     | Tools                   |
| ------------------------------------ | ---------------------------------- | ----------------------------------------------------- | ---------------------------------------- | ----------------------- |
| Simple and interpretable forecasting | **Prophet**                        | For business-friendly use cases, seasonal trends      | Easy to implement, good for business use | `prophet`               |
| Classic time series                  | **ARIMA/SARIMAX**                  | Univariate or exogenous variables (like news, events) | Good statistical control, explainable    | `statsmodels`           |
| Learn temporal dependencies          | **LSTM**                           | When patterns are complex and need memory (lags)      | Powerful, good for sequences             | `Keras / PyTorch`       |
| Multivariate prediction              | **Multivariate Linear Regression** | Predict one company using others                      | Easy, interpretable                      | `sklearn`               |
| Cutting-edge, attention-based        | **Transformer**                    | For high accuracy and large datasets                  | State-of-the-art, needs tuning           | `HuggingFace / PyTorch` |


## 1. ARIMA – Univariate Time Series Forecast (e.g., Apple)


In [None]:
# --- ARIMA Forecast for Apple Closing Price ---
# Fits a classical ARIMA model to Close_AAPL and projects it about three years
# (756 business days) past the last observation.

from statsmodels.tsa.arima.model import ARIMA

# Ensure datetime index. The move is done in place because later cells read
# the dates from df's index. It is guarded so that re-running this cell, when
# 'Date' is already the index, does not raise a KeyError.
if 'Date' in df.columns:
    df.set_index('Date', inplace=True)

# Select series (rows without an Apple close are dropped)
series = df['Close_AAPL'].dropna()

# Fit ARIMA(p=5, d=1, q=0)
model = ARIMA(series, order=(5, 1, 0))  # You can tune this with AIC/BIC
model_fit = model.fit()

# Forecast for 3 years (~252 business days/year)
steps = 252 * 3  # = 756 business days
forecast = model_fit.forecast(steps=steps)

# Build an explicit business-day index for the forecast, starting the day after
# the last observation, and attach the forecast values to it so the forecast
# lines up with the history on the date axis.
last_date = series.index[-1]
forecast_index = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=steps, freq='B')
forecast = pd.Series(forecast.values, index=forecast_index)

# Plot history and forecast on one date axis
plt.figure(figsize=(12, 5))
plt.plot(series, label='Actual')
plt.plot(forecast, label='3-Year Forecast', linestyle='--', color='red')
plt.title('ARIMA Forecast - Apple (3 Years Ahead)')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## 2. Prophet – Trend & Seasonality-Aware Forecast (Google)

In [None]:
# --- Prophet Forecast for Google ---
# Fits Prophet to Close_GOOGL and forecasts the next 30 business days.

from prophet import Prophet

# Prophet expects a frame with a date column 'ds' and a value column 'y'.
# Date may be a regular column or, once the ARIMA cell has run, the index of
# df; handle both so this cell does not depend on which cells ran before it.
prices = df if 'Date' in df.columns else df.reset_index()
df_prophet = prices[['Date', 'Close_GOOGL']].rename(columns={'Date': 'ds', 'Close_GOOGL': 'y'})
df_prophet['ds'] = pd.to_datetime(df_prophet['ds'])  # no-op when already datetime

# Initialize and fit
model = Prophet()
model.fit(df_prophet)

# Forecast 30 periods ahead. freq='B' keeps the future dates on business days,
# matching the trading-day history instead of also predicting weekends.
future = model.make_future_dataframe(periods=30, freq='B')
forecast = model.predict(future)

# Plot the fitted history together with the forecast
model.plot(forecast)
plt.title("Prophet Forecast - Google")
plt.tight_layout()
plt.show()

## 3. Multivariate Regression – Predict One Company Using Others (e.g., NVDA)

In [None]:
# --- Multivariate Regression ---
# Fits an ordinary linear regression that explains NVIDIA's closing price from
# the closing prices of the other four companies, then scores it on the last
# 20% of the rows.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Predictor columns and the column to predict
feature_cols = ['Close_AAPL', 'Close_AMZN', 'Close_GOOGL', 'Close_MSFT']
target_col = 'Close_NVDA'

# Keep only rows where all five closes are present
data = df[feature_cols + [target_col]].dropna()

X = data[feature_cols]
y = data[target_col]

# Ordered split (no shuffling): first 80% of the rows train, last 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit on the training rows and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Evaluation on the held-out rows
r2 = r2_score(y_test, pred)
mse = mean_squared_error(y_test, pred)
print(f'R2 Score: {r2:.3f}')
print(f'MSE: {mse:.2f}')

# Actual vs predicted over the test period
test_index = y_test.index
plt.figure(figsize=(10, 4))
plt.plot(test_index, y_test, label='Actual')
plt.plot(test_index, pred, label='Predicted', linestyle='--')
plt.title('Multivariate Regression: Predicting NVDA')
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.show()

## 4. LSTM – Deep Learning Forecast for Stock Price (e.g., Microsoft)

In [None]:
# --- LSTM Forecasting for Microsoft ---
# Trains a small LSTM to predict the next closing price of Microsoft from the
# previous TIME_STEPS closes, then compares its predictions with the actual
# prices on the most recent 20% of the samples.
# LSTM requires 3D input [samples, timesteps, features]

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

TIME_STEPS = 60        # length of the look-back window fed to the LSTM
TRAIN_FRACTION = 0.8   # share of the samples used for training


def create_dataset(series, time_steps=60):
    """Turn a 2-D array of shape (n, 1) into supervised samples.

    Each sample pairs the `time_steps` consecutive rows before position i
    (the input window) with row i itself (the target).

    Returns:
        (X, y): X has shape (n - time_steps, time_steps, 1) and y has shape
        (n - time_steps, 1) when n > time_steps.
    """
    X, y = [], []
    for i in range(time_steps, len(series)):
        X.append(series[i-time_steps:i])
        y.append(series[i])
    return np.array(X), np.array(y)


# Closing prices as a column vector, missing values removed
data = df['Close_MSFT'].dropna().values.reshape(-1, 1)

# Number of samples the series yields, and how many of them are for training
n_samples = len(data) - TIME_STEPS
if n_samples <= 0:
    raise ValueError(
        f'Need more than {TIME_STEPS} closing prices to build LSTM samples, got {len(data)}'
    )
split = int(n_samples * TRAIN_FRACTION)

# Fit the scaler ONLY on rows that occur in training windows or targets
# (the first TIME_STEPS + split rows). Fitting on the whole series would leak
# the test period's min/max into training. Test-period prices may therefore
# scale outside [0, 1], which reflects what the model would see in real use.
scaler = MinMaxScaler()
scaler.fit(data[:TIME_STEPS + split])
scaled_data = scaler.transform(data)

# Prepare sequences
X, y = create_dataset(scaled_data, TIME_STEPS)

# Chronological train-test split (no shuffling)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# Model: one LSTM layer followed by a single-unit regression head
model = Sequential([
    LSTM(50, return_sequences=False, input_shape=(X_train.shape[1], 1)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Predict on the held-out samples and map both series back to price units
pred_scaled = model.predict(X_test)
pred = scaler.inverse_transform(pred_scaled)
actual = scaler.inverse_transform(y_test)

# Plot actual vs predicted prices over the test period
plt.figure(figsize=(10, 4))
plt.plot(actual, label='Actual')
plt.plot(pred, label='Predicted', linestyle='--')
plt.title("LSTM Forecast - Microsoft")
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.show()