In [1]:
import numpy as np
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

How macro data affect S&P/NASDAQ/other indices - Tasmin \\
How earnings data affect stock movement (multiple stocks) - Rachel \\
Are there any correlations with institution holding changes - Rachel \\
How can we use past prices to predict current price using different indicators - Tasmin

In [2]:
start_date = '2015-01-01'

In [3]:
macro_data = pdr.get_data_fred(['GDP', 'CPIAUCSL', 'FEDFUNDS', 'UNRATE', 'INDPRO'], start=start_date)
macro_data = macro_data.resample('D').ffill()

In [4]:
macro_data

Unnamed: 0_level_0,GDP,CPIAUCSL,FEDFUNDS,UNRATE,INDPRO
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01,18063.529,234.747,0.11,5.7,102.7923
2015-01-02,18063.529,234.747,0.11,5.7,102.7923
2015-01-03,18063.529,234.747,0.11,5.7,102.7923
2015-01-04,18063.529,234.747,0.11,5.7,102.7923
2015-01-05,18063.529,234.747,0.11,5.7,102.7923
...,...,...,...,...,...
2025-01-28,,319.086,4.33,4.0,103.4350
2025-01-29,,319.086,4.33,4.0,103.4350
2025-01-30,,319.086,4.33,4.0,103.4350
2025-01-31,,319.086,4.33,4.0,103.4350


In [5]:
#How macro data affect S&P/NASDAQ/other indices
sp500 = yf.download('^GSPC', start=start_date, auto_adjust=False)
nasdaq = yf.download('^IXIC', start=start_date, auto_adjust=False)
dow = yf.download('^DJI', start=start_date, auto_adjust=False)

end_date = datetime.datetime.today().strftime("%Y-%m-%d")

date_range = pd.date_range(start=start_date, end=end_date, freq='D')
index_data = pd.DataFrame(index=date_range)

index_data['SP500'] = sp500['Adj Close']
index_data['NASDAQ'] = nasdaq['Adj Close']
index_data['DOW'] = dow['Adj Close']

index_data = index_data.ffill()

merged_data = pd.merge(index_data, macro_data, left_index=True, right_index=True, how='left')
merged_data = merged_data.ffill()
merged_data

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,SP500,NASDAQ,DOW,GDP,CPIAUCSL,FEDFUNDS,UNRATE,INDPRO
2015-01-01,,,,18063.529,234.747,0.11,5.7,102.7923
2015-01-02,2058.199951,4726.810059,17832.990234,18063.529,234.747,0.11,5.7,102.7923
2015-01-03,2058.199951,4726.810059,17832.990234,18063.529,234.747,0.11,5.7,102.7923
2015-01-04,2058.199951,4726.810059,17832.990234,18063.529,234.747,0.11,5.7,102.7923
2015-01-05,2020.579956,4652.569824,17501.650391,18063.529,234.747,0.11,5.7,102.7923
...,...,...,...,...,...,...,...,...
2025-03-19,5675.290039,17750.789062,41964.628906,29719.647,319.775,4.33,4.1,104.2062
2025-03-20,5662.890137,17691.630859,41953.320312,29719.647,319.775,4.33,4.1,104.2062
2025-03-21,5667.560059,17784.050781,41985.351562,29719.647,319.775,4.33,4.1,104.2062
2025-03-22,5667.560059,17784.050781,41985.351562,29719.647,319.775,4.33,4.1,104.2062


In [6]:
correlation = merged_data.corr()
print(correlation[['SP500', 'NASDAQ', 'DOW']].loc[['GDP', 'CPIAUCSL', 'FEDFUNDS', 'UNRATE', 'INDPRO']])

             SP500    NASDAQ       DOW
GDP       0.949375  0.914903  0.944844
CPIAUCSL  0.937070  0.899969  0.927277
FEDFUNDS  0.642337  0.566697  0.652450
UNRATE   -0.183569 -0.101195 -0.216295
INDPRO    0.301112  0.220106  0.334046


In [7]:
monthly_data = merged_data.resample('ME').last()
monthly_returns = monthly_data[['SP500', 'NASDAQ', 'DOW']].pct_change()
macro_changes = monthly_data[['GDP', 'CPIAUCSL', 'FEDFUNDS']].pct_change()
monthly_analysis = pd.concat([monthly_returns, macro_changes], axis = 1).dropna()

X = monthly_analysis[['GDP', 'CPIAUCSL', 'FEDFUNDS']]
for index in ['SP500', 'NASDAQ', 'DOW']:
  y = monthly_analysis[index]
  model = LinearRegression()
  model.fit(X, y)

  print(f"\nRegression for {index}:")
  for i, var in enumerate(['GDP', 'CPIAUCSL', 'FEDFUNDS']):
    print(f"{var} coefficient: {model.coef_[i]}")
  print(f"R-squared: {model.score(X, y)}")


Regression for SP500:
GDP coefficient: -0.04765965194404613
CPIAUCSL coefficient: 1.016910722317877
FEDFUNDS coefficient: -0.022774540864225257
R-squared: 0.015171962692987373

Regression for NASDAQ:
GDP coefficient: -0.05961253891379015
CPIAUCSL coefficient: 0.04494264384286316
FEDFUNDS coefficient: -0.028888746649202972
R-squared: 0.021234754474173334

Regression for DOW:
GDP coefficient: -0.11772304704288218
CPIAUCSL coefficient: 1.1163589030526317
FEDFUNDS coefficient: -0.018021019057350207
R-squared: 0.01096349753526582


In [8]:
#How can we use past prices to predict current price using different indicators
google = yf.download('GOOD', start=start_date, auto_adjust=False)
df = google.copy()
close = google['Adj Close']
df['MA_20'] = close.rolling(window = 20).mean()

delta = close.diff()
gain = delta.where(delta > 0,0)
loss = -delta.where(delta < 0,0)
avg_gain = gain.rolling(window = 14).mean()
avg_loss = loss.rolling(window = 14).mean()
rs = avg_gain/avg_loss
df['RSI'] = 100 - (100/(1+rs))

df['OBV'] = (np.sign(close.diff())) * df['Volume'].fillna(0).cumsum()

df['ROC_10'] = close.pct_change(periods = 10) * 100

df['Next_Close'] = df['Adj Close'].shift(-1)

df = df.dropna()

[*********************100%***********************]  1 of 1 completed


In [9]:
features = ['MA_20', 'RSI', 'OBV', 'ROC_10']
X = df[features]
y = df['Next_Close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Training R-squared: {train_score}")
print(f"Testing R-squared: {test_score}")

mse = np.mean((y_test - y_pred) ** 2)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

feature_importance = pd.DataFrame({
    'Feature' : features,
    'Importance' : np.abs(model.coef_)
}).sort_values(by = 'Importance', ascending = False)

print(feature_importance)



Training R-squared: 0.9888398134528051
Testing R-squared: 0.9846920381597852
Mean Squared Error: 0.13268682791644407
Root Mean Squared Error: 0.3642620319446484
  Feature    Importance
0   MA_20  9.974088e-01
3  ROC_10  5.986381e-02
1     RSI  1.076197e-02
2     OBV  1.371886e-10
