In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from scipy.special import expit
from skopt import gp_minimize
import time 
import traceback
import warnings
warnings.filterwarnings("ignore")

In [59]:
pd.set_option('display.max_rows', 20)

In [60]:
merged_df = pd.read_csv("sentiment_and_prices.csv")
merged_df_train, merged_df_test = train_test_split(merged_df, test_size=0.2, shuffle=False)
merged_df_test = merged_df_test.reset_index(drop=True)

Entire Architecture to go from a Dataframe containing Sentiment Scores and Prices for each stock over givem time period to the weights of the portfolio allocated to each of the 4 stocks and Cash on each day

In [61]:
# Neural Network for Diffusion Term
class DiffusionNetwork(nn.Module):
    def __init__(self):
        super(DiffusionNetwork, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.exp(self.fc3(x))  # Ensure the output is positive
        return x

diffusion_model = DiffusionNetwork()

def simulate_stock_price(S0, t, dt, T, drift):
    dt = 1  # Assuming one time step is one day
    T = t + dt
    h = T - t
    dW = np.sqrt(h) * np.random.normal()
    
    input_tensor = torch.tensor([t, S0], dtype=torch.float32).unsqueeze(0)
    diffusion = diffusion_model(input_tensor).item()
    
    S_new = S0 + (drift * S0 * h) \
            + S0 * diffusion * dW
    
    return S_new

def compute_weights(merged_df, alpha, gamma, beta, delta, epsilon, rho):
    np.random.seed(0)
    torch.manual_seed(0)
    weights = pd.DataFrame(columns=['Date', 'Weight_AMZN', 'Weight_AAPL', 'Weight_MSFT', 'Weight_NVIDIA', 'Weight_CASH'])
    last_valid_weights = None  # Store the last valid weights

    # Calculate rolling statistics
    returns = merged_df[['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].pct_change()
    volatility = returns.rolling(window=20).std()
    rolling_mean = merged_df[['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].rolling(window=20).mean()
    rolling_std = merged_df[['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].rolling(window=20).std()
    momentum = merged_df[['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].pct_change(periods=5)

    for t in range(20, len(merged_df)):
        try:
            # Extract features at time t
            sentiment_t = np.array(merged_df.loc[t, ['Sentiment_Score_AMZN', 'Sentiment_Score_AAPL', 'Sentiment_Score_MSFT', 'Sentiment_Score_NVIDIA']].astype(float))
            volatility_t = np.array(volatility.loc[t, :].astype(float))
            momentum_t = np.array(momentum.loc[t, :].astype(float))
            mean_reversion_t = ((np.array(merged_df.loc[t, ['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].astype(float)) -
                                np.array(rolling_mean.loc[t, :].astype(float))) /
                                np.array(rolling_std.loc[t, :].astype(float)))

            # Calculate drift and simulated returns
            drift = alpha * sentiment_t - gamma * volatility_t + beta * momentum_t - delta * mean_reversion_t
            simulated_returns = []
            for i, stock in enumerate(['AMZN', 'AAPL', 'MSFT', 'NVIDIA']):
                S0 = merged_df.loc[t, f'Price_{stock}']
                new_price = simulate_stock_price(S0, t, 1, t + 1, drift[i])
                simulated_return = (new_price - S0) / S0
                simulated_returns.append(simulated_return)

            # Check if any simulated return exceeds the rho threshold
            if any(abs(r) >= rho for r in simulated_returns):
                # Calculate cash weight using sigmoid function
                avg_simulated_return = np.mean(simulated_returns)
                cash_weight = expit(epsilon * avg_simulated_return)
                
                # Calculate new weights
                new_weights = np.maximum(simulated_returns, 0)
                new_weights = np.append(new_weights, cash_weight)
                new_weights /= np.sum(new_weights)  # Normalize to sum to 1
                
                last_valid_weights = new_weights
                
                # Update DataFrames
                current_date = merged_df.iloc[t].name if isinstance(merged_df.index, pd.DatetimeIndex) else merged_df.iloc[t]['Date']
                weights.loc[t, 'Date'] = current_date
                weights.loc[t, ['Weight_AMZN', 'Weight_AAPL', 'Weight_MSFT', 'Weight_NVIDIA', 'Weight_CASH']] = new_weights
            else:
                # Use last valid weights if available, otherwise use equal weights
                if last_valid_weights is not None:
                    weights_to_use = last_valid_weights
                else:
                    weights_to_use = [0.2, 0.2, 0.2, 0.2, 0.2]  # Default to equal weights

                # Update DataFrames
                current_date = merged_df.iloc[t].name if isinstance(merged_df.index, pd.DatetimeIndex) else merged_df.iloc[t]['Date']
                weights.loc[t, 'Date'] = current_date
                weights.loc[t, ['Weight_AMZN', 'Weight_AAPL', 'Weight_MSFT', 'Weight_NVIDIA', 'Weight_CASH']] = weights_to_use
                
        except Exception as e:
            print(f"An error occurred at t={t}: {e}")

    return weights




Code to Evaluate the performance of the portfolio weights

In [74]:
def evaluate_portfolio_performance_fees(optimal_weights_df, merged_df):
    initial_investment = 100000  # Starting with $100,000
    portfolio_value = initial_investment
    portfolio = {'AMZN': 0, 'AAPL': 0, 'MSFT': 0, 'NVIDIA': 0}
    portfolio_value_df = pd.DataFrame(columns=['Portfolio_Value'])

    slippage = 0.003  # 0.3% slippage
    bid_ask_spread = 0.001  # 0.1% bid-ask spread

    prev_weights = None

    for i, date in enumerate(optimal_weights_df['Date']):
        # Calculate the new value of the portfolio based on current stock prices
        for stock, quantity in portfolio.items():
            portfolio_value += quantity * merged_df.loc[merged_df['Date'] == date, f'Price_{stock}'].values[0] * (1 - bid_ask_spread / 2)
        
        # Store the current portfolio value
        portfolio_value_df.loc[i] = [portfolio_value]
        
        # Reset portfolio to zero for rebalancing
        new_portfolio = {'AMZN': 0, 'AAPL': 0, 'MSFT': 0, 'NVIDIA': 0}
        
        for stock in new_portfolio.keys():
            weight = optimal_weights_df.iloc[i][f'Weight_{stock}']
            amount_to_invest = portfolio_value * weight

            # Account for slippage and bid-ask spread only if weights have changed
            if prev_weights and prev_weights[stock] != weight:
                amount_to_invest *= (1 - slippage)  # Deduct slippage from the amount to invest

            # Update the quantity of each stock in the portfolio
            stock_price = merged_df.loc[merged_df['Date'] == date, f'Price_{stock}'].values[0] * (1 + bid_ask_spread / 2)
            new_portfolio[stock] = amount_to_invest / stock_price

            # Deduct the invested amount from the portfolio value for the next iteration
            portfolio_value -= amount_to_invest

        # Update the previous weights and portfolio
        prev_weights = {k: optimal_weights_df.iloc[i][f'Weight_{k}'] for k in new_portfolio.keys()}
        portfolio = new_portfolio

    # Calculate the percent change in the portfolio
    percent_change = ((portfolio_value_df.iloc[-1]['Portfolio_Value'] - initial_investment) / initial_investment) * 100
    
    return portfolio_value_df, percent_change


Code for hyperparameter tuning based on performance of portfolio weights

In [63]:
def objective(params):
    alpha, gamma, beta, delta, epsilon, rho = params

    try:
        start_time = time.time()  # Record the start time
        print(f"Running for parameters: alpha={alpha}, gamma={gamma}, beta={beta}, delta={delta}, epsilon={epsilon}, rho={rho}")
        
        # Run your portfolio simulation here
        weights_train = compute_weights(merged_df_train, alpha, gamma, beta, delta, epsilon, rho)
        
        # Evaluate the portfolio performance
        portfolio_value_df, percentage_change = evaluate_portfolio_performance_fees(weights_train, merged_df_train)
        
        # Get the final portfolio value
        final_portfolio_value = portfolio_value_df.iloc[-1]['Portfolio_Value']

        # Print how long this iteration took
        end_time = time.time()  # Record the end time
        print(f"Completed for parameters: alpha={alpha}, gamma={gamma}, beta={beta}, delta={delta}, epsilon={epsilon}, rho={rho}")
        print(f"Time taken: {end_time - start_time} seconds")
        print(f"Final Portfolio Value: {final_portfolio_value}")

        # We negate this value because gp_minimize tries to find the minimum value
        # and we are interested in maximizing the portfolio value
        return -final_portfolio_value
    
    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()
        return 0  # return a default value

# The bounds for each hyperparameter
bounds = [(0.1, 1.0),  # alpha
          (0.1, 1.0),  # gamma
          (0.1, 1.0),  # beta
          (0.1, 1.0),  # delta
          (-100, -0.1), # epsilon
          (0.01, 1.0)]  # rho

# Run the optimizer
res = gp_minimize(objective, bounds, n_calls=50, random_state=0)

# Best hyperparameters
best_params = res.x

print("Optimization completed.")
print(f"Best parameters: {best_params}")



Running for parameters: alpha=0.6335601564025166, gamma=0.8598391737229157, beta=0.8721510558604813, delta=0.8625265649057131, epsilon=-37.70598669108135, rho=0.3905378902197729
Completed for parameters: alpha=0.6335601564025166, gamma=0.8598391737229157, beta=0.8721510558604813, delta=0.8625265649057131, epsilon=-37.70598669108135, rho=0.3905378902197729
Time taken: 1.2295541763305664 seconds
Final Portfolio Value: 135714.79883238656
Running for parameters: alpha=0.3677811458900251, gamma=0.1510416795856989, beta=0.3453906651221019, delta=0.529898605589215, epsilon=-18.8643439953282, rho=0.4851774006513069
Completed for parameters: alpha=0.3677811458900251, gamma=0.1510416795856989, beta=0.3453906651221019, delta=0.529898605589215, epsilon=-18.8643439953282, rho=0.4851774006513069
Time taken: 1.2972710132598877 seconds
Final Portfolio Value: 126061.29633697483
Running for parameters: alpha=0.4535063164907469, gamma=0.85247088718364, beta=0.40365654437554166, delta=0.6833546848460775, 

In [64]:
best_params

[0.6590478055733751,
 0.1,
 0.9997266139295863,
 0.4261477170777742,
 -53.42758553763068,
 0.574889025734415]

In [65]:
weights_train = compute_weights(merged_df_train,0.66,0.1,1,0.43,-53,0.57)
weights_train

Unnamed: 0,Date,Weight_AMZN,Weight_AAPL,Weight_MSFT,Weight_NVIDIA,Weight_CASH
20,2007-02-01,0.064461,0.333998,0.219453,0.382089,0.0
21,2007-02-02,0.260974,0.214051,0.274282,0.250693,0.0
22,2007-02-05,0.189378,0.254387,0.337534,0.218701,0.0
23,2007-02-06,0.068865,0.31125,0.456783,0.163102,0.0
24,2007-02-07,0.011402,0.337127,0.561684,0.089787,0.0
...,...,...,...,...,...,...
406,2008-08-25,0.0,0.0,0.0,0.999513,0.000487
407,2008-08-26,0.315264,0.23373,0.297864,0.153142,0.0
408,2008-08-27,0.348802,0.201503,0.281376,0.168319,0.0
409,2008-08-28,0.348802,0.201503,0.281376,0.168319,0.0


In [66]:
weights_test = compute_weights(merged_df_test,1.0,0.15,0.1,1.0,-100,0.45)
weights_test

Unnamed: 0,Date,Weight_AMZN,Weight_AAPL,Weight_MSFT,Weight_NVIDIA,Weight_CASH
20,2008-10-01,0.336441,0.351538,0.064686,0.247335,0.0
21,2008-10-02,0.308336,0.293687,0.070428,0.32755,0.0
22,2008-10-03,0.27286,0.309474,0.067496,0.35017,0.0
23,2008-10-06,0.249023,0.227025,0.205722,0.318231,0.0
24,2008-10-07,0.256135,0.205459,0.272956,0.265449,0.0
...,...,...,...,...,...,...
98,2009-01-26,0.316649,0.073801,0.441442,0.168107,0.0
99,2009-01-27,0.426528,0.061474,0.458585,0.053413,0.0
100,2009-01-28,0.436794,0.0,0.563206,0.0,0.0
101,2009-01-29,0.372459,0.000921,0.497243,0.129377,0.0


In [67]:
# Make sure 'Date' is a column in both dataframes and not an index
weights_test.reset_index(drop=True, inplace=True)
merged_df_test.reset_index(drop=True, inplace=True)
weights_train.reset_index(drop=True, inplace=True)
merged_df_train.reset_index(drop=True, inplace=True)

In [89]:
portfolio_value_df_test, percentage_change_test = evaluate_portfolio_performance_fees(weights_test, merged_df_test)
portfolio_value_df_train, percentage_change_train = evaluate_portfolio_performance_fees(weights_train, merged_df_train)

In [93]:
print(f"Train Portfolio Performance day by day over testing time period: \n{portfolio_value_df_train}")
print(f"Train Portfolio Performance over testing time period: {percentage_change_train}")

Train Portfolio Performance day by day over testing time period: 
     Portfolio_Value
0      100000.000000
1      100034.831797
2       99522.698870
3      100307.351370
4      101136.123615
..               ...
386    146064.540220
387    142438.294312
388    142756.679849
389    143838.032025
390    140880.893725

[391 rows x 1 columns]
Train Portfolio Performance over testing time period: 40.88089372522137


In [94]:
print(f"Test Portfolio Performance day by day over testing time period: \n{portfolio_value_df_test}")
print(f"Test Portfolio Performance over testing time period: {percentage_change_test}")

Test Portfolio Performance day by day over testing time period: 
    Portfolio_Value
0     100000.000000
1      95875.843828
2      94614.862308
3      91907.942691
4      86285.552677
..              ...
78     90838.288169
79     90504.389999
80     92623.163852
81     91533.805231
82     96513.789434

[83 rows x 1 columns]
Test Portfolio Performance over testing time period: -3.486210565537287


In [79]:
import pandas as pd
import numpy as np

# Initial investment and weights
initial_investment = 100000
weights = [0.2, 0.2, 0.2, 0.2, 0.2]  # AMZN, AAPL, MSFT, NVIDIA, CASH

# Initialize portfolio value
portfolio_value = [initial_investment]

# Calculate the initial amount of money invested in each stock and cash
amounts = np.array(weights) * initial_investment 

# Buy stocks on the first day, accounting for slippage and fees
fees_and_slippage_rate = 0.001
amounts_in_stocks = amounts[:-1]  # Exclude cash
amounts_in_stocks = amounts_in_stocks * (1 - fees_and_slippage_rate)

# Calculate the number of shares bought for each stock
initial_prices = merged_df_test.loc[0, ['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].values
shares = amounts_in_stocks / initial_prices

# Update the amount in cash after buying
amounts[-1] = amounts[-1] - np.sum(amounts_in_stocks * fees_and_slippage_rate)

# Simulate the portfolio value over time
for t in range(20, len(merged_df_test)):
    current_prices = merged_df_test.loc[t, ['Price_AMZN', 'Price_AAPL', 'Price_MSFT', 'Price_NVIDIA']].values
    current_value_in_stocks = np.sum(current_prices * shares)
    current_portfolio_value = current_value_in_stocks + amounts[-1]  # Add the amount in cash
    portfolio_value.append(current_portfolio_value)

# Convert portfolio_value to a DataFrame for easier handling later
bechmark1_portfolio_value_df = pd.DataFrame({
    'Portfolio_Value': portfolio_value
})

percent_change = ((bechmark1_portfolio_value_df.iloc[-1]['Portfolio_Value'] - initial_investment) / initial_investment) * 100

In [88]:
print(f"Benchmmark Portfolio Performance day by day over testing time period: \n{bechmark1_portfolio_value_df}")
print(f"Benchmmark Portfolio Performance over testing time period: {percent_change}")



Benchmmark Portfolio Performance day by day over testing time period: 
    Portfolio_Value
0     100000.000000
1      86520.879784
2      82970.553284
3      82083.004469
4      78824.024122
..              ...
79     68800.790386
80     69424.119913
81     71160.290214
82     69768.803761
83     70691.964329

[84 rows x 1 columns]
Benchmmark Portfolio Performance over testing time period: -29.308035670798027


In [85]:
# Importing required libraries
import yfinance as yf

# Define the start and end dates
start = '2008-09-02'
end = '2009-01-30'

# Fetch historical data for S&P 500 and NASDAQ Composite
sp500_data = yf.download('^GSPC', start=start, end=end)['Adj Close']
nasdaq_data = yf.download('^IXIC', start=start, end=end)['Adj Close']

# Calculate the returns
sp500_return = (sp500_data.iloc[-1] / sp500_data.iloc[0] - 1) * 100
nasdaq_return = (nasdaq_data.iloc[-1] / nasdaq_data.iloc[0] - 1) * 100

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [86]:
print(f"S&P 500 Return over Testing Time Period: {sp500_return}")
print(f"NASDAQ Return over Testing Time Period: {nasdaq_return}")

S&P 500 Return over Testing Time Period: -33.84836615171029
NASDAQ Return over Testing Time Period: -35.815839501783685


As you can see the test portfolio vastly outperformed the S&P 500, NASDAQ, and the portfolio holding 20% in each of the 4 stocks and cash over the same time period.

In [98]:
print(f"Beat S&P 500 by {np.abs(sp500_return) - np.abs(percentage_change_test)}%")
print(f"Beat NASDAQ by {np.abs(nasdaq_return) - np.abs(percentage_change_test)}%")
print(f"Beat Benchmark Porfolio by {np.abs(percent_change) - np.abs(percentage_change_test)}%")


Beat S&P 500 by 30.362155586173%
Beat NASDAQ by 32.3296289362464%
Beat Benchmark Porfolio by 25.82182510526074%
