In [18]:
#packges
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [19]:
# Function to get historical stock prices from Yahoo Finance
def get_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data['Adj Close']


In [20]:
#  tickers (27 compagnies) and dates

tickers = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'BRK-A', 'NVDA', 'V', 'JPM', 'UNH',
           'JNJ', 'BAC', 'WMT', 'PG', 'HD', 'MA', 'XOM', 'PFE', 'DIS', 'CVX',
           'KO', 'AVGO', 'PEP', 'CSCO', 'WFC', 'COST', 'LLY', 'ADBE']
start_date = '2009-12-31'
end_date = '2021-12-31'

In [23]:
# Policy gradient algorithm for portfolio optimization
class PolicyGradientPortfolioOptimization:
    def __init__(self, tickers, start_date, end_date, initial_cash=100000):
        self.tickers = tickers
        self.start_date = start_date
        self.end_date = end_date
        self.initial_cash = initial_cash
        self.num_stocks = len(tickers)
        self.stock_prices = self.get_stock_prices()
        self.portfolio = np.zeros(self.num_stocks)
        self.cash = initial_cash
        self.discount_factor = 0.95
        self.learning_rate = 0.01
        self.num_episodes = 1000
        self.policy = np.ones((self.num_stocks + 1, self.num_stocks + 1)) / (self.num_stocks + 1)
        self.svm_models = [SVR(kernel='rbf') for _ in range(self.num_stocks)]
        self.dt_models = [DecisionTreeRegressor() for _ in range(self.num_stocks)]

    def get_stock_prices(self):
        stock_prices = []
        for ticker in self.tickers:
            stock_data = get_stock_data(ticker, self.start_date, self.end_date)
            stock_prices.append(stock_data)
        return np.array(stock_prices)

    def select_action(self, state):
        probabilities = self.policy[state]
        probabilities /= np.sum(probabilities)  # Normalize probabilities
        return np.random.choice(range(self.num_stocks + 1), p=probabilities)

    def update_policy(self, states, actions, rewards):
        for i in range(len(states)):
            state = states[i]
            action = actions[i]
            reward = rewards[i]
            state_int = int(state)
            self.policy[state_int, action] += self.learning_rate * reward

    def step(self, action):
        if action == self.num_stocks:  # Hold cash
            return
        else:
            stock_price = self.stock_prices[action, -1]
            stock_quantity = self.cash / (self.num_stocks * stock_price)
            self.portfolio[action] += stock_quantity
            self.cash -= stock_quantity * stock_price

    def train(self):
        for episode in range(self.num_episodes):
            states = []
            actions = []
            rewards = []
            state = int(self.cash / (self.initial_cash / self.num_stocks))
            for _ in range(100):  # 100 steps per episode
                action = self.select_action(state)
                actions.append(action)
                self.step(action)
                next_state = int(self.cash / (self.initial_cash / self.num_stocks))
                reward = np.sum(self.portfolio * self.stock_prices[:, -1])
                states.append(state)
                rewards.append(reward)
                state = next_state
            self.update_policy(states, actions, rewards)
            if episode % 100 == 0:
                print(f"Episode {episode}, Portfolio Value: {np.sum(self.portfolio * self.stock_prices[:, -1])}")
    def calculate_Policy_gradient_weights(self):
        returns = np.diff(np.log(self.stock_prices), axis=1)
        expected_returns = np.mean(returns, axis=1)
        cov_matrix = np.cov(returns)

        def objective(weights):
            return -np.dot(expected_returns, weights)  # Maximize expected return

        def constraint(weights):
            return np.sum(weights) - 1  # Sum of weights equals 1

        initial_weights = np.ones(self.num_stocks) / self.num_stocks  # Equal weights initially
        bounds = [(0, 1) for _ in range(self.num_stocks)]
        constraints = [{'type': 'eq', 'fun': constraint}]

        result = minimize(objective, initial_weights, bounds=bounds, constraints=constraints)
        return result.x


    def get_portfolio_weights(self):
        if np.sum(self.cash) == 0:
            cash_weight = 0
        else:
            cash_weight = self.cash / np.sum(self.cash)
        stock_weights = self.portfolio / np.sum(self.portfolio)
        return np.concatenate(([cash_weight], stock_weights))




In [24]:
# Q-learning algorithm for portfolio optimization
class QLearningPortfolioOptimization(PolicyGradientPortfolioOptimization):
    def __init__(self, tickers, start_date, end_date, initial_cash=100000):
        super().__init__(tickers, start_date, end_date, initial_cash)
        self.q_table = np.zeros((self.num_stocks + 1, self.num_stocks + 1))  # Q-table initialization
        self.epsilon = 0.1  # Epsilon for epsilon-greedy exploration

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(range(self.num_stocks + 1))  # Random action
        else:
            return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        max_next_q = np.max(self.q_table[next_state])
        self.q_table[state, action] += self.learning_rate * (reward + self.discount_factor * max_next_q - self.q_table[state, action])


In [25]:

# Policy Gradient
policy_gradient = PolicyGradientPortfolioOptimization(tickers, start_date, end_date)
policy_gradient.train()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Episode 0, Portfolio Value: 97524.06514555847
Episode 100, Portfolio Value: 100000.00000000003
Episode 200, Portfolio Value: 100000.00000000003
Episode 300, Portfolio Value: 100000.00000000003
Episode 400, Portfolio Value: 100000.00000000003
Episode 500, Portfolio Value: 100000.00000000003
Episode 600, Portfolio Value: 100000.00000000003
Episode 700, Portfolio Value: 100000.00000000003
Episode 800, Portfolio Value: 100000.00000000003
Episode 900, Portfolio Value: 100000.00000000003


In [26]:
# Q-learning
q_learning = QLearningPortfolioOptimization(tickers, start_date, end_date)
q_learning.train()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Episode 0, Portfolio Value: 97704.07138326137
Episode 100, Portfolio Value: 99999.99999999972
Episode 200, Portfolio Value: 99999.99999999972
Episode 300, Portfolio Value: 99999.99999999972
Episode 400, Portfolio Value: 99999.99999999972
Episode 500, Portfolio Value: 99999.99999999972
Episode 600, Portfolio Value: 99999.99999999972
Episode 700, Portfolio Value: 99999.99999999972
Episode 800, Portfolio Value: 99999.99999999972
Episode 900, Portfolio Value: 99999.99999999972


In [27]:
portfolio_weights_q = q_learning.get_portfolio_weights()

In [28]:
#Q_learning weights
Q_weights=pd.DataFrame(portfolio_weights_q[1:],index=tickers)
Q_weights.to_csv('Q_learning_w.csv')

In [29]:
# Calculate Policy_gradient weights
pg_weights = policy_gradient.calculate_Policy_gradient_weights()
weights=pd.DataFrame(pg_weights,index=tickers)
weights.to_csv('Policy_gradient_w.csv')
