In [None]:
!pip install torch torch-geometric pandas scikit-learn matplotlib

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import ParameterGrid
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data

In [6]:
info = ['sector', 'industry', 'fullTimeEmployees', 'profitMargins',
        'operatingMargins', 'returnOnAssets', 'returnOnEquity',
        'revenueGrowth', 'earningsGrowth', 'debtToEquity', 'totalCash',
        'totalDebt', 'totalRevenue', 'bookValue', 'operatingCashflow',
        'freeCashflow', 'targetLowPrice', 'targetMeanPrice',
        'targetMedianPrice', 'recommendationMean']

data = []
stocks = ["AAPL", "MSFT", "AMZN", "GOOG", "GOOGL", "TSLA", "NVDA", "META", "UNH", "JNJ", "V", "JPM", "PG", "HD", "MA", "BAC", "XOM", "CVX", "LLY", "PFE"]

# Download closing prices
prices = yf.download(stocks, period="5d")['Close']

for stock in stocks:
  ticker = yf.Ticker(stock)
  stock_info = [stock]
  for i in info:
    try:
      stock_info.append(ticker.info[i])
    except KeyError:
      stock_info.append(None)
  # Add closing price to stock_info
  stock_info.append(prices[stock].iloc[0])
  data.append(stock_info)

df = pd.DataFrame(data, columns = ['Ticker'] + info + ['Close'])

[*********************100%***********************]  20 of 20 completed


In [10]:
df_cleaned = df.dropna()

In [11]:
df_cleaned = df.dropna(how='any').reset_index(drop=True)

In [12]:
df_cleaned.isna().sum()

Unnamed: 0,0
Ticker,0
sector,0
industry,0
fullTimeEmployees,0
profitMargins,0
operatingMargins,0
returnOnAssets,0
returnOnEquity,0
revenueGrowth,0
earningsGrowth,0


In [15]:
df_encoded = pd.get_dummies(df_cleaned.drop(columns=['Ticker']), columns=['sector', 'industry'], drop_first=True)
df_encoded = df_encoded.astype(int)

features = ['fullTimeEmployees', 'profitMargins', 'operatingMargins', 'returnOnAssets',
            'returnOnEquity', 'revenueGrowth', 'earningsGrowth', 'debtToEquity',
            'totalCash', 'totalDebt', 'totalRevenue', 'bookValue',
            'operatingCashflow', 'freeCashflow', 'targetLowPrice',
            'targetMeanPrice', 'targetMedianPrice', 'recommendationMean']

scaler = StandardScaler()
df_encoded[features] = scaler.fit_transform(df_encoded[features])

In [16]:
df_encoded.head()

Unnamed: 0,fullTimeEmployees,profitMargins,operatingMargins,returnOnAssets,returnOnEquity,revenueGrowth,earningsGrowth,debtToEquity,totalCash,totalDebt,...,industry_Credit Services,industry_Drug Manufacturers - General,industry_Healthcare Plans,industry_Home Improvement Retail,industry_Household & Personal Products,industry_Internet Content & Information,industry_Internet Retail,industry_Oil & Gas Integrated,industry_Semiconductors,industry_Software - Infrastructure
0,-0.20273,0.0,0.0,0.0,0.443398,-0.25,-0.25,0.005562,0.663202,1.316287,...,0,0,0,0,0,0,0,0,0,0
1,-0.009328,0.0,0.0,0.0,-0.394132,-0.25,-0.25,-0.334222,1.067419,1.225477,...,0,0,0,0,0,0,0,0,0,1
2,3.734593,0.0,0.0,0.0,-0.394132,-0.25,-0.25,-0.245583,1.46666,2.803591,...,0,0,0,0,0,0,1,0,0,0
3,-0.149092,0.0,0.0,0.0,-0.394132,-0.25,-0.25,-0.413997,1.809141,-0.593155,...,0,0,0,0,0,1,0,0,0,0
4,-0.149092,0.0,0.0,0.0,-0.394132,-0.25,-0.25,-0.413997,1.809141,-0.593155,...,0,0,0,0,0,1,0,0,0,0


In [17]:
X = df_encoded.drop(columns=['Close']).values
y = df_encoded['Close'].values

# Convert to torch tensors
x = torch.tensor(X, dtype=torch.float)
y = torch.tensor(y, dtype=torch.float)

In [28]:
edge_index = []

for i in range(len(df_cleaned)):
    for j in range(i + 1, len(df_cleaned)):
        if df_cleaned.iloc[i]['sector'] == df_cleaned.iloc[j]['sector']:
            edge_index.append([i, j])
            edge_index.append([j, i])

In [29]:
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
data = Data(x=x, edge_index=edge_index, y=y)

In [30]:
class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels1, hidden_channels2):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_channels=in_channels, out_channels=hidden_channels1)
        self.conv2 = GCNConv(in_channels=hidden_channels1, out_channels=hidden_channels2)
        self.fc = nn.Linear(hidden_channels2, 1)  # Output is a single value (predicted price)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.leaky_relu(x)  # Use Leaky ReLU to avoid dead neurons
        x = self.conv2(x, edge_index)
        x = F.leaky_relu(x)
        x = self.fc(x)  # Output layer for price prediction
        return x

In [32]:
def train_and_evaluate(params, data, x, y):
    model = GNN(in_channels=x.shape[1],
                hidden_channels1=params['hidden_channels1'],
                hidden_channels2=params['hidden_channels2'])

    # Define the optimizer based on the parameters
    if params['optimizer'] == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    else:
        optimizer = optim.SGD(model.parameters(), lr=params['learning_rate'])

    # Loss function
    criterion = nn.MSELoss()

    # Training loop
    model.train()
    for epoch in range(1500):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.flatten(), data.y)
        loss.backward()
        optimizer.step()

    # Switch to evaluation mode
    model.eval()
    with torch.no_grad():
        predicted_prices = model(data).flatten().detach().numpy()

    # Compute the Mean Squared Error for evaluation
    mse = np.mean((predicted_prices - y.detach().numpy()) ** 2)

    return mse

param_grid = {
    'learning_rate': [0.01, 0.001, 0.0001, 0.00001],
    'hidden_channels1': [32, 64, 128, 256, 512, 1024, 2048],
    'hidden_channels2': [16, 32, 64, 128, 256, 512, 1024],
    'optimizer': ['adam', 'sgd']
}

grid = ParameterGrid(param_grid)
results = []

for params in grid:
    mse = train_and_evaluate(params, data, x, y)
    results.append((params, mse))
    print(f"Params: {params} - MSE: {mse}")

results_df = pd.DataFrame(results, columns=['Params', 'MSE'])
best_params = results_df.loc[results_df['MSE'].idxmin()]

print("\nBest Parameters: ", best_params)

Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.01, 'optimizer': 'adam'} - MSE: 38097.01953125
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.01, 'optimizer': 'sgd'} - MSE: nan
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.001, 'optimizer': 'adam'} - MSE: 38097.01953125
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.001, 'optimizer': 'sgd'} - MSE: nan
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.0001, 'optimizer': 'adam'} - MSE: 123996.03125
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 0.0001, 'optimizer': 'sgd'} - MSE: 38097.01953125
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 1e-05, 'optimizer': 'adam'} - MSE: 138456.953125
Params: {'hidden_channels1': 32, 'hidden_channels2': 16, 'learning_rate': 1e-05, 'optimizer': 'sgd'} - MSE: 38097.01953125
Params: {'hidden_channels1': 32, 'hid

In [33]:
best_params = results_df.loc[results_df['MSE'].idxmin()]['Params']
print(best_params)

{'hidden_channels1': 64, 'hidden_channels2': 128, 'learning_rate': 0.01, 'optimizer': 'adam'}


In [39]:
best_learning_rate = best_params['learning_rate']
best_hidden_channels1 = best_params['hidden_channels1']
best_hidden_channels2 = best_params['hidden_channels2']
best_optimizer_type = best_params['optimizer']

best_model = GNN(in_channels=x.shape[1],
                hidden_channels1=best_hidden_channels1,
                hidden_channels2=best_hidden_channels2)

if best_optimizer_type == 'adam':
    best_optimizer = optim.Adam(best_model.parameters(), lr=best_learning_rate)
else:
    best_optimizer = optim.SGD(best_model.parameters(), lr=best_learning_rate)


criterion = torch.nn.MSELoss()

best_model.train()
for epoch in range(500):
    best_optimizer.zero_grad()
    output = best_model(data)
    loss = criterion(output.flatten(), data.y)
    loss.backward()
    best_optimizer.step()

best_model.eval()

with torch.no_grad():
    final_predictions = best_model(data).flatten().detach().numpy()


comparison_df = pd.DataFrame({
    'Predicted Price': final_predictions
})

In [40]:
print(comparison_df)

    Predicted Price
0        254.333313
1        254.333313
2        284.000000
3        303.333344
4        303.333344
5        284.000000
6        254.333313
7        303.333344
8        414.000000
9        414.000000
10       173.000000
11       284.000000
12       496.000000
13       134.000000
14       134.000000
15       414.000000
16       414.000000
