In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep the returned value in a
# notebook-level variable (presumably the local dataset directory, going by
# the variable name -- confirm against the kagglehub documentation).
kundanbedmutha_market_trend_and_external_factors_dataset_path = kagglehub.dataset_download('kundanbedmutha/market-trend-and-external-factors-dataset')

# Simple confirmation that the download call returned without raising.
print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px


import os
# Walk Kaggle's input mount and print the full path of every file found,
# so the location of the dataset files is visible in the notebook output.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Resolve where the CSV lives. Prefer the directory returned by kagglehub in
# the first cell (this is what makes the notebook runnable outside Kaggle);
# fall back to Kaggle's mounted input path, which is also what is used when
# that optional first cell has been deleted.
DATA_FILENAME = 'Market_Trend_External.csv'
_data_dirs = [
    globals().get('kundanbedmutha_market_trend_and_external_factors_dataset_path'),
    '/kaggle/input/market-trend-and-external-factors-dataset',
]
_csv_candidates = [os.path.join(data_dir, DATA_FILENAME) for data_dir in _data_dirs if data_dir]
# If no candidate exists, keep the original Kaggle path so read_csv raises the
# usual FileNotFoundError with a recognisable location.
csv_path = next((path for path in _csv_candidates if os.path.exists(path)), _csv_candidates[-1])

df = pd.read_csv(csv_path)
print(df.shape)
# Jupyter only renders a cell's final bare expression. Previously the sample
# was computed and discarded because print() came last.
df.sample(6)

# Initial Inspection

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by
# pandas' default describe().
df.describe()

In [None]:
# Parse 'Date' into a datetime column; later cells rely on the .dt accessor
# and on sorting by this column.
df['Date'] = pd.to_datetime(df['Date'])
# Re-inspect the dtypes to confirm the conversion took effect.
df.info()

**Advanced cleaning is not required.**

# Exploratory Data Analysis (EDA)

Q1: How has the 'Close_Price' changed over time? (Line plot)

Q2: What is the distribution of 'Daily_Return_Pct'? (Histogram)

Q3: Is there a correlation between 'VIX_Close' and 'Daily_Return_Pct'? (Scatter plot)

Q4: How does the 'Economic_News_Flag' relate to the average 'Volatility_Range'? (Bar plot)

Q5: What are the monthly average 'Volume' trends? (Line plot after aggregation)

Q6: How does the 'Sentiment_Score' relate to the 'Daily_Return_Pct' on the next day? (Scatter plot with lagged return)

Q7: Compare the distribution of 'Close_Price' on days when 'Federal_Rate_Change_Flag' is 1 versus days when it is 0. (Box plots)

Q8: Which numerical variables are most strongly correlated with 'Close_Price'? (Heatmap)

In [None]:
# Q1: closing price plotted against the date column.
q1_axis_labels = {'Close_Price': 'Closing Price ($)', 'Date': 'Date'}
fig1 = px.line(
    df,
    x='Date',
    y='Close_Price',
    title='Q1) Close Price Trend Over Time (1902-2023)',
    labels=q1_axis_labels,
)
fig1.show()


# Q2: histogram of daily returns. Only rows strictly inside (-10, 10) are
# plotted; everything else is left out of this figure.
daily_return = df['Daily_Return_Pct']
within_plot_range = daily_return.gt(-10) & daily_return.lt(10)
df_filtered_returns = df[within_plot_range]

q2_axis_labels = {'Daily_Return_Pct': 'Daily Return (%)', 'count': 'Frequency'}
fig2 = px.histogram(
    df_filtered_returns,
    x='Daily_Return_Pct',
    nbins=50,
    title='Q2) Distribution of Daily Return Percentage',
    labels=q2_axis_labels,
)
# Thin white outline so neighbouring bars are visually separated.
fig2.update_traces(marker=dict(line=dict(width=1, color="white")))
fig2.show()

In [None]:
# Q3: VIX close against the daily return, with an OLS trend line.
# Rows whose VIX_Close is 100 or more are excluded from the plot.
vix_below_100 = df['VIX_Close'].lt(100)
df_filtered_vix = df.loc[vix_below_100]

q3_axis_labels = {'VIX_Close': 'VIX Closing Value', 'Daily_Return_Pct': 'Daily Return (%)'}
fig3 = px.scatter(
    df_filtered_vix,
    x='VIX_Close',
    y='Daily_Return_Pct',
    title='Q3) VIX vs. Daily Return Percentage',
    labels=q3_axis_labels,
    trendline="ols",
    opacity=0.5,
)
fig3.show()


# Q4: average 'Volatility_Range' for each value of 'Economic_News_Flag'.
avg_volatility = df.groupby('Economic_News_Flag')['Volatility_Range'].mean().reset_index()
# Turn the 0/1 flag into readable category labels for the axis and legend.
avg_volatility['Economic_News_Flag'] = avg_volatility['Economic_News_Flag'].astype(str).replace({'0': 'No News', '1': 'News Event'})

# Colour per category. With color=..., plotly express builds one single-bar
# trace per category, so the previous update_traces(marker_color=[...]) call
# handed the whole list to every trace and each bar took the first entry:
# both bars came out 'skyblue'. color_discrete_map assigns colours by label.
news_flag_colors = {'No News': 'skyblue', 'News Event': 'salmon'}

fig4 = px.bar(avg_volatility, x='Economic_News_Flag', y='Volatility_Range',
              title='Q4) Average Volatility Range by Economic News Flag',
              labels={'Economic_News_Flag': 'Economic News Flag', 'Volatility_Range': 'Average Volatility Range'},
              color='Economic_News_Flag',
              color_discrete_map=news_flag_colors)
fig4.show()

In [None]:
# Q5: mean 'Volume' per calendar month.
# Store the month of each row as a 'YYYY-MM' string on the frame.
month_period = df['Date'].dt.to_period('M')
df['Year_Month'] = month_period.astype(str)

volume_by_month = df.groupby('Year_Month')['Volume']
monthly_volume = volume_by_month.mean().reset_index()

q5_axis_labels = {'Volume': 'Average Volume', 'Year_Month': 'Month/Year'}
fig5 = px.line(
    monthly_volume,
    x='Year_Month',
    y='Volume',
    title='Q5) Monthly Average Trading Volume Trend',
    labels=q5_axis_labels,
)
# One tick every 36 months keeps the x-axis readable.
fig5.update_xaxes(dtick="M36", tickformat="%Y-%m")
fig5.show()



# Q6: sentiment score against the FOLLOWING row's return.
# shift(-1) pairs each row with whichever row comes after it, which is only
# "the next day" when rows are in chronological order. The frame has not been
# sorted at this point, so shift a Date-sorted view; the assignment realigns
# on the index, leaving the row order of df itself untouched.
df['Next_Day_Return'] = df.sort_values('Date')['Daily_Return_Pct'].shift(-1)
# The chronologically last row has no following return and is dropped here.
df_lagged = df.dropna(subset=['Next_Day_Return'])

fig6 = px.scatter(df_lagged, x='Sentiment_Score', y='Next_Day_Return',
                  title='Q6) Sentiment Score vs. Next Day\'s Return Percentage',
                  labels={'Next_Day_Return': 'Next Day Daily Return (%)', 'Sentiment_Score': 'Sentiment Score'},
                  trendline="ols",
                  opacity=0.5)
fig6.show()



# Q7: closing-price distribution split by the federal rate change flag.
# Work on a copy so relabelling the flag does not alter the shared frame.
rate_flag_labels = {'0': 'No Change', '1': 'Rate Change Day'}
df_q7 = df.copy()
df_q7['Federal_Rate_Change_Flag'] = (
    df_q7['Federal_Rate_Change_Flag'].astype(str).replace(rate_flag_labels)
)

q7_axis_labels = {'Close_Price': 'Closing Price ($)', 'Federal_Rate_Change_Flag': 'Federal Rate Change Event'}
fig7 = px.box(
    df_q7,
    x='Federal_Rate_Change_Flag',
    y='Close_Price',
    title='Q7) Close Price Distribution on Rate Change Days vs. Normal Days',
    labels=q7_axis_labels,
)
fig7.show()



# Q8: pairwise correlations between the numeric market variables.
numerical_cols = ['Open_Price', 'Close_Price', 'High_Price', 'Low_Price', 'Volume', 'Daily_Return_Pct',
                  'Volatility_Range', 'VIX_Close', 'Sentiment_Score', 'GeoPolitical_Risk_Score', 'Currency_Index']
corr_matrix = df[numerical_cols].corr()
# Single-column view ranked by correlation with Close_Price -- the direct
# answer to Q8.
corr_target = corr_matrix[['Close_Price']].sort_values(by='Close_Price', ascending=False)

# Two-decimal annotations keep the 11x11 grid legible.
fig8 = px.imshow(corr_matrix, text_auto='.2f', aspect="auto",
                 title='Q8) Correlation Matrix of Financial Variables',
                 color_continuous_scale=px.colors.diverging.RdBu)
fig8.show()

# Previously computed but never shown; as the cell's last expression it is
# rendered below the heatmap.
corr_target

# Feature Engineering and Model Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Number of most recent complete rows used for modelling.
MODEL_WINDOW = 10000

# Lagged and rolling features are only meaningful on chronologically ordered rows.
df = df.sort_values('Date')

# Target: the same-day return. The lag/rolling features below are shifted by
# one row so they only use information from earlier rows.
df['Target'] = df['Daily_Return_Pct']
df['Close_Lag1'] = df['Close_Price'].shift(1)
df['VIX_Lag1'] = df['VIX_Close'].shift(1)
df['Sentiment_Lag1'] = df['Sentiment_Score'].shift(1)
# Std of the previous 7 returns (rolling window, then shifted one row).
df['Volatility_7D'] = df['Daily_Return_Pct'].rolling(window=7).std().shift(1)

# NOTE(review): the last four features are same-row values, unlike the lagged
# ones -- confirm they are known before the day's return is realised.
features = ['Close_Lag1', 'VIX_Lag1', 'Sentiment_Lag1', 'Volatility_7D',
            'Economic_News_Flag', 'Federal_Rate_Change_Flag', 'GeoPolitical_Risk_Score', 'Currency_Index']

# Drop incomplete rows from the modelling frame only, and only for the columns
# the model uses. The earlier in-place dropna on df removed rows from the
# shared frame for good, also dropped rows because of unrelated helper columns
# (e.g. the Q6 'Next_Day_Return' NaN on the latest row), and lost additional
# leading rows every time this cell was re-run.
df_model = df.dropna(subset=features + ['Target']).tail(MODEL_WINDOW).copy()

X = df_model[features]
y = df_model['Target']

# Chronological 80/20 split: train on the older rows, test on the newest,
# so no future rows leak into training.
split_point = int(len(df_model) * 0.8)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]


# Base learners (fixed seeds keep the tree-based models reproducible).
rgr1 = LinearRegression()
rgr2 = DecisionTreeRegressor(max_depth=5, random_state=1)
rgr3 = RandomForestRegressor(n_estimators=50, random_state=1, max_depth=5, n_jobs=-1)

# Weighted-average ensemble; the random forest counts twice.
vrg = VotingRegressor(
    estimators=[('lr', rgr1), ('dt', rgr2), ('rf', rgr3)],
    weights=[1, 1, 2]
)

# VotingRegressor.fit trains a clone of every base estimator and exposes the
# fitted clones through named_estimators_.
vrg.fit(X_train, y_train)

# Reuse those fitted clones instead of training each base model a second
# time. Hyper-parameters and seeds are identical, so predictions match a
# separate fit. Rebinding keeps rgr1..rgr3 usable as fitted models afterwards.
rgr1, rgr2, rgr3 = (vrg.named_estimators_[key] for key in ('lr', 'dt', 'rf'))

models = {
    "Linear Regression": rgr1,
    "Decision Tree Regressor": rgr2,
    "Random Forest Regressor": rgr3,
    "Voting Regressor": vrg
}

# Score every (already fitted) model on the held-out, most recent rows.
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results[name] = {"RMSE": rmse, "R2 Score": r2}

print("Regression Model Evaluation Results:")
for name, metrics in results.items():
    print(f"\n--- {name} ---")
    print(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.4f}")
    print(f"R-squared (R2 Score): {metrics['R2 Score']:.4f}")

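In the recorded run, the Random Forest Regressor had the lowest RMSE ($\mathbf{17.3814}$) and the highest $R^2$ score ($\mathbf{0.0573}$), making it the best performer among the base models; even so, an $R^2$ this close to zero means it explains less than 6% of the variance in daily returns.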

# Summary

This project involved building a complete data science pipeline for financial time-series analysis, starting with data cleaning and extensive Exploratory Data Analysis (EDA) using Plotly visualizations to understand long-term price trends, return distributions, and relationships between external factors (like VIX and Sentiment Score) and market volatility. The final phase focused on regression modeling to predict the continuous Daily Return Percentage, utilizing feature engineering to create lagged and rolling indicators. A Voting Regressor ensemble, combining Linear Regression, Decision Tree Regressor, and Random Forest Regressor, was implemented and evaluated against its base models. The results showed that the Random Forest Regressor was the best individual model (lowest RMSE, highest $R^2$), though all models struggled to achieve high predictive power, explaining less than 6% of the variance, highlighting the inherent difficulty of forecasting market returns.