# Stock Market Financial Data Analysis and Time Series Forecasting using Python + Power BI

# Install Required Libraries

In [None]:
!pip install pandas numpy matplotlib seaborn statsmodels scikit-learn yfinance


# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")


# Load Dataset

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/datasets/prince7489/stock-market-dataset/random_stock_market_dataset.csv"

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()


# Data Understanding

In [None]:
# Column dtypes and non-null counts (info() prints its report directly).
df.info()

# describe() and isnull().sum() only return objects. Printing them keeps the
# output visible even when the call is not the last expression of the cell.
print(df.describe())
print(df.isnull().sum())


# Data Cleaning

In [None]:
# Parse the Date column so rows can be ordered and indexed chronologically.
df['Date'] = pd.to_datetime(df['Date'])

# Chronological order is required before forward-filling, so each gap is
# filled from the previous date rather than from an arbitrary neighbouring row.
df = df.sort_values('Date')

# Drop rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and produces the same result.
df = df.ffill()

# Use the dates as the index for the time-series operations that follow.
df = df.set_index('Date')


# Exploratory Data Analysis (EDA)

# Stock Price Trend

In [None]:
# Line chart of the closing price over the date index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Close'])
ax.set_title("Stock Closing Price Trend")
ax.set_xlabel("Date")
ax.set_ylabel("Closing Price")
plt.show()


# Volume Trend

In [None]:
# Line chart of traded volume over the date index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Volume'])
ax.set_title("Trading Volume Over Time")
plt.show()


# Monthly Average Price

In [None]:
# Average closing price per calendar month (month-end buckets).
# An explicit offset object is used instead of the 'M' string alias: newer
# pandas releases deprecate 'M' in favour of 'ME', while older releases do not
# recognise 'ME'. The offset object behaves the same on both.
monthly = df['Close'].resample(pd.offsets.MonthEnd()).mean()

plt.figure(figsize=(12,6))
monthly.plot()
plt.title("Monthly Average Closing Price")
plt.show()


# Moving Averages (Technical Analysis)

In [None]:
# Simple moving averages of the close over 50-row and 200-row windows.
# The first (window - 1) rows of each column are NaN.
for window in (50, 200):
    df[f'MA{window}'] = df['Close'].rolling(window).mean()

# Overlay the raw close and both averages on one chart.
fig, ax = plt.subplots(figsize=(14, 6))
for column, label in (('Close', 'Close'),
                      ('MA50', '50-Day MA'),
                      ('MA200', '200-Day MA')):
    ax.plot(df[column], label=label)
ax.legend()
ax.set_title("Moving Average Analysis")
plt.show()


# Feature Engineering (Financial Metrics)

In [None]:
# Simple daily return: fractional change of the close versus the previous row.
df['Daily_Return'] = df['Close'].pct_change()

# Log return: ln(close_t / close_{t-1}); additive across consecutive periods.
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

# Rolling volatility: standard deviation of daily returns over 30 rows.
df['Volatility_30'] = df['Daily_Return'].rolling(30).std()

# Rolling mean of the close over 30 rows.
df['Rolling_Mean_30'] = df['Close'].rolling(30).mean()

# Drop only the warm-up rows of the features created in this cell. A bare
# dropna() would also remove every row in which any other column is NaN --
# notably the first 199 rows of MA200 -- and can empty the frame entirely
# when the dataset has fewer than 200 rows.
feature_cols = ['Daily_Return', 'Log_Return', 'Volatility_30', 'Rolling_Mean_30']
df.dropna(subset=feature_cols, inplace=True)


# Return Distribution Analysis

In [None]:
# Histogram (100 bins) of daily returns with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Daily_Return'], bins=100, kde=True, ax=ax)
ax.set_title("Distribution of Daily Returns")
plt.show()


# Volatility Clustering

In [None]:
# Line chart of the 30-row rolling standard deviation of daily returns.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Volatility_30'])
ax.set_title("30-Day Rolling Volatility")
plt.show()


# Drawdown Analysis

In [None]:
# Compounded growth of one unit, built from the daily returns.
growth = df['Daily_Return'].add(1).cumprod()
# Highest growth value reached up to and including each row.
peak = growth.cummax()

df['Cumulative_Return'] = growth
df['Running_Max'] = peak
# Fractional distance below the running peak: 0 at a new high, negative otherwise.
df['Drawdown'] = growth.div(peak).sub(1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Drawdown'])
ax.set_title("Drawdown Analysis")
plt.show()

# The most negative drawdown is the worst peak-to-trough loss.
print("Maximum Drawdown:", df['Drawdown'].min())


# Correlation Matrix (Multi-Feature Analysis)

In [None]:
# Pairwise correlations between the price columns and volume.
price_volume_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
corr_matrix = df[price_volume_cols].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


# Rolling Correlation

In [None]:
# Correlation between close and volume inside a sliding 60-row window.
close_window = df['Close'].rolling(window=60)
rolling_corr = close_window.corr(df['Volume'])

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(rolling_corr)
ax.set_title("60-Day Rolling Correlation: Close vs Volume")
plt.show()


# Candlestick Chart

In [None]:
!pip install mplfinance



import mplfinance as mpf

mpf.plot(df.tail(100), type='candle', volume=True, style='charles')


# Technical Indicators
         ylabel_lower='Volume')


In [None]:
# Relative Strength Index using simple 14-row rolling means of gains and losses.
rsi_window = 14
price_change = df['Close'].diff()

# Split each change into its upward part and its downward part (as a positive number).
up_moves = price_change.clip(lower=0)
down_moves = -price_change.clip(upper=0)

mean_up = up_moves.rolling(rsi_window).mean()
mean_down = down_moves.rolling(rsi_window).mean()

# RSI = 100 - 100 / (1 + RS), where RS is average gain over average loss.
relative_strength = mean_up / mean_down
df['RSI'] = 100 - (100 / (1 + relative_strength))

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['RSI'])
# Reference lines at the 70 and 30 levels.
ax.axhline(70, color='r')
ax.axhline(30, color='g')
ax.set_title("RSI Indicator")
plt.show()


# Bollinger Bands

In [None]:
# Bollinger Bands: 20-row rolling mean plus/minus two rolling standard deviations.
close_window = df['Close'].rolling(20)
band_offset = 2 * close_window.std()

df['BB_Middle'] = close_window.mean()
df['BB_Upper'] = df['BB_Middle'] + band_offset
df['BB_Lower'] = df['BB_Middle'] - band_offset

# Plot the close together with the upper and lower bands.
fig, ax = plt.subplots(figsize=(14, 6))
for column in ('Close', 'BB_Upper', 'BB_Lower'):
    ax.plot(df[column])
ax.set_title("Bollinger Bands")
plt.show()


# Stationarity Test (ADF Test)

In [None]:
# Sanity check before the stationarity test: frame dimensions, first rows,
# and the number of missing values in the Close column.
print(df.shape)
print(df.head())
print(df['Close'].isna().sum())


In [None]:
# Close prices with missing values removed (the same assignment is repeated
# in the next cell, so this cell is redundant).
close_series = df['Close'].dropna()


In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the non-missing closing prices.
close_series = df['Close'].dropna()

if close_series.empty:
    print("Close column is empty after removing NaN values")
else:
    result = adfuller(close_series)
    adf_statistic, p_value = result[0], result[1]

    print("ADF Statistic:", adf_statistic)
    print("p-value:", p_value)

    # A p-value below 0.05 is reported as stationary.
    verdict = "Series is Stationary " if p_value < 0.05 else "Series is Non-Stationary "
    print(verdict)


# Outlier Detection (Z-Score Method)

In [None]:
# Standardise daily returns: (value - mean) / standard deviation.
returns = df['Daily_Return']
df['Z_Score'] = (returns - returns.mean()) / returns.std()

# Rows whose return lies more than three standard deviations from the mean.
outliers = df.loc[df['Z_Score'].abs() > 3]
print("Extreme Market Movements:")
print(outliers[['Close','Daily_Return']])


# Seasonal Analysis

In [None]:
# Calendar month (1-12) of each row, taken from the datetime index.
df['Month'] = df.index.month

# Mean daily return per calendar month, pooled across all years present.
monthly_returns = df['Daily_Return'].groupby(df['Month']).mean()

monthly_returns.plot.bar()
plt.title("Average Monthly Returns")
plt.show()


# Risk Metrics

In [None]:
# Annualised volatility: daily return standard deviation scaled by sqrt(252).
daily_vol = df['Daily_Return'].std()
annual_vol = daily_vol * np.sqrt(252)
print("Annualized Volatility:", annual_vol)

In [None]:
# Sharpe ratio: annualised excess return divided by annualised volatility.
# annual_vol comes from the annualised-volatility cell.
risk_free_rate = 0.02

annual_return = df['Daily_Return'].mean()*252
excess_return = annual_return - risk_free_rate
sharpe = excess_return / annual_vol
print("Sharpe Ratio:", sharpe)

# Market Regime Detection (Volatility Regimes)

In [None]:
# Label each row by whether its 30-row volatility is above the overall median.
rolling_vol = df['Volatility_30']
above_median = rolling_vol > rolling_vol.median()
df['Volatility_Regime'] = np.where(above_median, 'High Volatility', 'Low Volatility')

# Bar chart of how many rows fall into each regime.
sns.countplot(data=df, x='Volatility_Regime')
plt.title("Market Regime Distribution")
plt.show()


# Time Series Decomposition

In [None]:

# Diagnostics before decomposition: frame dimensions, column names, how many
# Close values are present versus missing, and the first rows.
print("Shape:", df.shape)
print("Columns:", df.columns)
print("Total Close values:", df['Close'].count())
print("NaN in Close:", df['Close'].isna().sum())
print(df.head())


In [None]:

# Close prices without missing values, and how many observations remain.
close_series = df['Close'].dropna()
print("After dropna:", len(close_series))


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

# Multiplicative decomposition of the close with a 30-row period.
close_series = df['Close'].dropna()

# Guard: skip the decomposition when fewer than 60 observations are available
# (two full periods of 30).
if len(close_series) < 60:
    print("Not enough data for decomposition. Need at least 60 observations.")
else:
    decomposition = seasonal_decompose(
        close_series,
        model='multiplicative',
        period=30,
    )
    decomposition.plot()
    plt.show()


# Train-Test Split

In [None]:
# Chronological 80/20 split of the closing prices (no shuffling).
split_at = int(len(df)*0.8)
train = df['Close'].iloc[:split_at]
test = df['Close'].iloc[split_at:]


# ARIMA Forecasting Model

In [None]:
# Inspect the first rows and the index before fitting ARIMA.
print(df.head())
print(df.index)



In [None]:
# Convert the index to datetimes; values that cannot be parsed raise an error here.
df.index = pd.to_datetime(df.index)


In [None]:

# Convert the index to datetimes, turning unparseable entries into NaT,
# then keep only the rows whose index entry is a valid timestamp.
df.index = pd.to_datetime(df.index, errors='coerce')
df = df.loc[df.index.notna()]


In [None]:
# NOTE(review): pmdarima is installed here but is not imported anywhere in the
# visible part of this file -- confirm whether it is still needed.
!pip install pmdarima


In [None]:

# (empty cell)

In [None]:
# Fit an ARIMA model with order (p=1, d=1, q=1) on the training closes.
model = ARIMA(train, order=(1,1,1))
model_fit = model.fit()


In [None]:
# Build an ARIMA(2,1,1) model object; it is not fitted in this cell, so
# model_fit still refers to the previously fitted model.
model = ARIMA(train, order=(2,1,1))


In [None]:
# Summary statistics and length of the training series.
print(train.describe())
print(len(train))


In [None]:
# Remove missing values from the training series.
train = train.dropna()


In [None]:
# ARIMA(5,1,0) with the stationarity and invertibility constraints on the
# estimated parameters switched off.
arima_options = {
    'order': (5,1,0),
    'enforce_stationarity': False,
    'enforce_invertibility': False,
}
model = ARIMA(train, **arima_options)
model_fit = model.fit()


In [None]:
# Constructs an ARIMA(1,1,0) model; the object is neither assigned nor fitted,
# so this cell does not change model or model_fit.
ARIMA(train, order=(1,1,0))


In [None]:
# Constructs an ARIMA(0,1,1) model; the object is neither assigned nor fitted,
# so this cell does not change model or model_fit.
ARIMA(train, order=(0,1,1))


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Closing prices without gaps, split chronologically 80/20.
data = df['Close'].dropna()
train_size = int(len(data)*0.8)
train = data.iloc[:train_size]
test = data.iloc[train_size:]

# ARIMA(1,1,1) with the parameter constraints relaxed, fitted on the training part.
model = ARIMA(
    train,
    order=(1,1,1),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model_fit = model.fit()

# One forecast step for every observation in the hold-out part.
forecast = model_fit.forecast(steps=len(test))

print("Model fitted successfully ✅")


In [None]:
# Row counts before and after removing missing closes.
print("Total rows in df:", len(df))
print("Close column count:", df['Close'].count())

data = df['Close'].dropna()
print("Data length after dropna:", len(data))

# Position of the 80/20 chronological split.
train_size = int(len(data) * 0.8)
print("Train size:", train_size)

train = data.iloc[:train_size]
test = data.iloc[train_size:]

for label, part in (("Train length:", train), ("Test length:", test)):
    print(label, len(part))


In [None]:
# List the column names currently in the frame.
print(df.columns)


In [None]:
# Show the available columns so the price column name can be verified.
print(df.columns)

# Closing prices without missing values (adjust the column name if it differs).
data = df['Close'].dropna()

if data.empty:
    print("❌ No data available for ARIMA")
else:
    # Chronological 80/20 split.
    train_size = int(len(data)*0.8)
    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    print("Train:", len(train))
    print("Test:", len(test))

    # Only fit when more than 10 training observations are available.
    if len(train) <= 10:
        print(" Not enough training data")
    else:
        from statsmodels.tsa.arima.model import ARIMA

        model = ARIMA(
            train,
            order=(1,1,1),
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        model_fit = model.fit()
        forecast = model_fit.forecast(steps=len(test))

        print("Model fitted successfully")


# Model Evaluation

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA



In [None]:
# Inspect the first rows, the column names, and the row count.
print(df.head())
print(df.columns)
print(len(df))


In [None]:
# Chronological 80/20 split of the closing prices by row position.
train_size = int(len(df) * 0.8)
close_prices = df['Close']
train = close_prices.iloc[:train_size]
test = close_prices.iloc[train_size:]


In [None]:
# Fit ARIMA(1,1,1) with default constraints on the training closes.
model = ARIMA(train, order=(1,1,1))
model_fit = model.fit()


In [None]:
# Cast the training series to float dtype.
train = train.astype(float)


In [None]:
# Number of distinct values in the training series.
print(train.nunique())


In [None]:
# Closing prices without missing values, split chronologically 80/20.
series = df['Close'].dropna()

train_size = int(len(series) * 0.8)

train = series.iloc[:train_size]
test = series.iloc[train_size:]

print("Train length:", len(train))
print("Test length:", len(test))

# Score an ARIMA(1,1,1) fit on the hold-out part. Root-mean-squared error is
# expressed in the same units as the closing price.
if len(train) > 10 and len(test) > 0:
    model_fit = ARIMA(train, order=(1,1,1)).fit()
    forecast = model_fit.forecast(steps=len(test))
    # Compare by position: the forecast index may not match the test index
    # when the date index carries no frequency information.
    rmse = np.sqrt(mean_squared_error(test.to_numpy(), np.asarray(forecast)))
    print("ARIMA(1,1,1) test RMSE:", rmse)
else:
    print("Not enough data to evaluate the model")


In [None]:
# Diagnostics on the current split: frame dimensions, split sizes, and the
# number of missing and distinct values in the training series.
print("DF shape:", df.shape)
print("Train length:", len(train))
print("Test length:", len(test))
print("Null values in train:", train.isna().sum())
print("Unique values:", train.nunique())


# Holt-Winters Forecasting

In [None]:
# Frame dimensions and the sizes of the current train/test split.
print("DF shape:", df.shape)
print("Train length:", len(train))
print("Test length:", len(test))


In [None]:
# First rows and dimensions of the frame.
print(df.head())
print(df.shape)


In [None]:
# Bare expression: a notebook displays the Close column as the cell result;
# it has no effect when run as a plain script.
df['Close']


In [None]:
# List the column names currently in the frame.
print(df.columns)


In [None]:
# Only rows without a closing price are unusable for the forecasts that follow.
# A bare dropna() would also discard every row in which any rolling-window
# indicator column is still NaN during its warm-up period.
df = df.dropna(subset=['Close'])


In [None]:
# 1. Check data
print(df.shape)
print(df.columns)
print(df.head())

# 2. Select the price column (adjust the name if it differs) and
# 3. remove missing values
series = df['Close'].dropna()

print("Series length:", len(series))

# 4. Chronological 80/20 split
train_size = int(len(series) * 0.8)

train = series.iloc[:train_size]
test = series.iloc[train_size:]

print("Train length:", len(train))
print("Test length:", len(test))

# 5. Exponential smoothing with an additive trend and no seasonal component,
# fitted on the training part and scored on the hold-out part.
# Plain arrays are passed so the fit does not depend on the date index
# carrying frequency information.
if len(train) > 10 and len(test) > 0:
    hw_fit = ExponentialSmoothing(train.to_numpy(), trend='add', seasonal=None).fit()
    hw_forecast = hw_fit.forecast(len(test))
    hw_rmse = np.sqrt(mean_squared_error(test.to_numpy(), np.asarray(hw_forecast)))
    print("Holt-Winters (additive trend) test RMSE:", hw_rmse)
else:
    print("Not enough data for Holt-Winters")


# Future 30-Day Forecast

In [None]:
# Refit on the complete Close history so the 30-step forecast starts after the
# last observed date. A model estimated on the training split alone would
# forecast the hold-out period, not the future.
full_series = df['Close'].dropna()
future_model_fit = ARIMA(full_series, order=(1,1,1)).fit()
future_forecast = future_model_fit.forecast(steps=30)

# Forecast dates begin one day after the last observation, so the first
# forecast point does not share a date with the final historical point.
future_dates = pd.date_range(df.index[-1] + pd.Timedelta(days=1), periods=30, freq='D')

plt.figure(figsize=(12,6))
plt.plot(df['Close'], label='Historical')
# Plot by position: the forecast's own index may not be date-based.
plt.plot(future_dates, np.asarray(future_forecast), label='Future Forecast')
plt.legend()
plt.show()



In [None]:
# Read the raw CSV again, replacing the processed frame with the original data.
df = pd.read_csv("/kaggle/input/datasets/prince7489/stock-market-dataset/random_stock_market_dataset.csv")

# Dimensions, column names, and first rows of the freshly loaded frame.
for summary in (df.shape, df.columns, df.head()):
    print(summary)

# Price column (adjust the name if it differs) with missing values removed.
series = df['Close'].dropna()

print("Series length:", len(series))
