In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yfinance as yf

In [2]:
# Download daily price history for a single NSE ticker from Yahoo Finance and
# cache it as a CSV file for the rest of the notebook.
# Any other NIFTY50 constituent (e.g. "TCS.NS", "HDFCBANK.NS") can be used as TICKER;
# list of NIFTY50 stocks: https://www1.nseindia.com/live_market/dynaContent/live_watch/equities_stock_watch.htm
TICKER = "RELIANCE.NS"

start_date = "2010-01-01"
end_date = "2019-06-30"

stock_data = yf.download(TICKER, start=start_date, end=end_date)

# Fail loudly instead of writing an empty CSV and reporting success when the
# download returned no rows.
if stock_data.empty:
    raise RuntimeError(
        f"No data downloaded for {TICKER} between {start_date} and {end_date}."
    )

# Keep the index in the file so each price stays associated with its row label
# (the trading date); readers that only need prices can select the "Close" column.
stock_data.to_csv('reliance_stock_data.csv')

print("Data collection and saving complete.")

[*********************100%%**********************]  1 of 1 completed
Data collection and saving complete.


In [3]:
# Load the cached CSV, parsing only the "Close" column, and preview the first rows.
stock_data = pd.read_csv("reliance_stock_data.csv", usecols=["Close"])
stock_data.head()

Unnamed: 0,Close
0,532.7005
1,530.323059
2,538.891846
3,547.832092
4,546.395691


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the closing prices.
stock_data.describe()

Unnamed: 0,Close
count,2339.0
mean,586.935741
std,255.480789
min,334.875702
25%,427.250092
50%,489.683289
75%,555.992249
max,1395.62085


In [5]:
# Split by row order: the first 90% of rows are used for training and the
# remaining 10% are held out for testing.
split_index = int(len(stock_data) * 0.9)
training_stock_data = stock_data.iloc[:split_index]
testing_stock_data = stock_data.iloc[split_index:]

len(training_stock_data), len(testing_stock_data)

(2105, 234)

### Q Learning Temporal Difference Agent

In [20]:
from tqdm import tqdm

class Q_Learning_Agent:
    """Tabular Q-learning agent that trades lots of a single instrument.

    * State: the current net position in lots, an integer in
      ``[-max_lots_cumulative_traded, max_lots_cumulative_traded]``.
    * Action: the signed number of lots added to the position in one step, an
      integer in ``[-max_lots_tradable, max_lots_tradable]``. An action is only
      valid if the resulting position stays inside the state range.

    Q-table layout: the value of taking action ``a`` in state ``s`` is stored at
    ``q_table[s % n_states, a % n_actions]``. Non-negative states/actions come
    first, followed by the negative ones, which is exactly the cell NumPy
    addresses for ``q_table[s][a]`` with negative indices.
    """

    def __init__(self, num_iterations=200, checkpoint=10):
        """Create an untrained agent.

        Args:
            num_iterations: number of passes over the training data in ``train``.
            checkpoint: print the episode's cumulative reward every this many
                iterations.
        """
        self.num_iterations = num_iterations
        self.checkpoint = checkpoint

        # Exploration rate at the start of every episode; `epsilon` is the
        # current (decayed) value.
        self.initial_epsilon = 0.9
        self.epsilon = self.initial_epsilon
        self.decay = 0.999  # multiplicative epsilon decay per exploratory step
        self.gamma = 0.9    # discount factor
        self.alpha = 0.9    # learning rate

        self.max_lots_tradable = 36
        self.lot_size = 50
        self.max_lots_cumulative_traded = 100
        self.transaction_cost = 0.0000335

        self.action_space = list(range(-self.max_lots_tradable, self.max_lots_tradable + 1))
        self.state_space = list(
            range(-self.max_lots_cumulative_traded, self.max_lots_cumulative_traded + 1)
        )
        self.q_table = np.zeros((len(self.state_space), len(self.action_space)))

        self.current_state = 0
        self.cumulative_reward = 0

    # ------------------------------------------------------------------ helpers

    def _state_index(self, state):
        """Return the q_table row that stores `state` (see class docstring)."""
        return state % len(self.state_space)

    def _action_index(self, action):
        """Return the q_table column(s) that store `action` (scalar or array)."""
        return action % len(self.action_space)

    def _valid_actions(self):
        """Return the actions that keep the position inside the state range."""
        actions = np.asarray(self.action_space)
        resulting = self.current_state + actions
        in_range = (resulting >= self.state_space[0]) & (resulting <= self.state_space[-1])
        return actions[in_range]

    def _greedy_action(self):
        """Return the valid action with the highest Q-value in the current state.

        Ties are resolved in favour of the lowest column index, so an untrained
        (all-zero) table yields action 0.
        """
        columns = np.sort(self._action_index(self._valid_actions()))
        q_row = self.q_table[self._state_index(self.current_state)]
        best_column = int(columns[np.argmax(q_row[columns])])
        # Columns past the largest non-negative action hold the negative
        # actions; convert the column index back into a signed lot count.
        if best_column > self.action_space[-1]:
            return best_column - len(self.action_space)
        return best_column

    def _reset_episode(self):
        """Reset position, reward accumulator and exploration rate."""
        self.current_state = 0
        self.cumulative_reward = 0
        self.epsilon = self.initial_epsilon

    @staticmethod
    def _as_prices(data):
        """Convert `data` into a 1-D float array of prices.

        Accepts anything `np.asarray` understands (list, array, Series,
        DataFrame). A 2-D input with a single column is flattened.

        Raises:
            ValueError: if the input is not one-dimensional after that.
        """
        prices = np.asarray(data, dtype=float)
        if prices.ndim == 2 and prices.shape[1] == 1:
            prices = prices[:, 0]
        if prices.ndim != 1:
            raise ValueError(
                f"Expected a single price series, got an array of shape {prices.shape}."
            )
        return prices

    # ---------------------------------------------------------------- behaviour

    def get_action(self):
        """Choose an action for the current state with an epsilon-greedy policy.

        With probability `epsilon` a valid action is drawn uniformly at random
        and `epsilon` is multiplied by `decay`; otherwise the best valid action
        according to the Q-table is returned.

        Returns:
            int: signed number of lots to trade; the resulting position is
            always inside the state range.
        """
        if np.random.uniform(0, 1) <= self.epsilon:
            self.epsilon = self.epsilon * self.decay
            return int(np.random.choice(self._valid_actions()))
        return self._greedy_action()

    def update_state(self, action):
        """Apply `action` to the current position.

        Raises:
            ValueError: if the resulting position would fall outside the state
                range (it could not be represented in the Q-table).
        """
        new_state = self.current_state + action
        if not self.state_space[0] <= new_state <= self.state_space[-1]:
            raise ValueError(
                f"Action {action} moves the position to {new_state}, outside "
                f"[{self.state_space[0]}, {self.state_space[-1]}]."
            )
        self.current_state = new_state

    def get_reward(self, action, old_val, new_val):
        """Apply `action` and return the reward for the price move old_val -> new_val.

        Side effect: `current_state` is advanced by `action`.

        The reward is
        ``lot_size * (new_position * new_val - old_position * old_val)
        - abs(action) * transaction_cost``.
        """
        # NOTE(review): summed over an episode the first term telescopes to
        # lot_size * final_position * final_price, and the transaction cost is
        # not scaled by price or lot size. Confirm this is the intended
        # objective before tuning the agent.
        reward = -self.current_state*old_val*self.lot_size
        self.update_state(action)
        reward += self.current_state*new_val*self.lot_size
        reward -= abs(action)*self.transaction_cost
        return reward

    def update_q_table(self, action, old_val, new_val):
        """Take `action`, observe the reward and apply one Q-learning update.

        Updates the entry for (state before the action, `action`), advances
        `current_state` and adds the reward to `cumulative_reward`.
        """
        row = self._state_index(self.current_state)
        column = self._action_index(action)
        q_old = self.q_table[row, column]

        reward = self.get_reward(action, old_val, new_val)  # moves current_state
        self.cumulative_reward += reward

        # Bootstrap only from actions that are actually available in the new
        # state; entries for unreachable actions are never trained.
        next_row = self.q_table[self._state_index(self.current_state)]
        next_columns = self._action_index(self._valid_actions())
        q_new = reward + self.gamma*np.max(next_row[next_columns])

        self.q_table[row, column] = (1-self.alpha)*q_old + self.alpha*q_new

    def train(self, data):
        """Run `num_iterations` training episodes over `data`.

        Args:
            data: a single price series (list, array, Series or one-column
                DataFrame).

        Returns:
            list: the cumulative reward of each episode.
        """
        prices = self._as_prices(data)
        cumulative_rewards = []
        for i in range(self.num_iterations):
            # Start every episode from a clean slate, including the first one
            # (the agent may have been used for testing before).
            self._reset_episode()
            print("Iteration: ", i+1)
            for step in tqdm(range(1, len(prices))):
                action = self.get_action()
                self.update_q_table(action, prices[step-1], prices[step])
            if (i+1)%self.checkpoint == 0:
                print("Iteration: ", i+1, " Cumulative Reward: ", self.cumulative_reward)
            cumulative_rewards.append(self.cumulative_reward)
        self._reset_episode()
        return cumulative_rewards

    def plot_progress(self, cumulative_rewards):
        """Plot the per-iteration cumulative rewards returned by `train`."""
        plt.plot(cumulative_rewards)
        plt.xlabel("Iterations")
        plt.ylabel("Cumulative Reward")
        plt.show()

    def save_q_table(self, filename):
        """Save the Q-table with `np.save` (which appends ".npy" to a path without it)."""
        np.save(filename, self.q_table)

    def load_q_table(self, filename):
        """Load a Q-table previously written by `save_q_table`.

        A path without the ".npy" suffix is accepted: if it does not exist,
        the suffix that `np.save` appends is tried.

        Raises:
            ValueError: if the stored array does not have the shape
                (number of states, number of actions).
        """
        import os

        if isinstance(filename, (str, os.PathLike)):
            path = os.fspath(filename)
            if not os.path.exists(path) and not path.endswith(".npy"):
                path += ".npy"
            filename = path
        table = np.load(filename)
        expected_shape = (len(self.state_space), len(self.action_space))
        if table.shape != expected_shape:
            raise ValueError(
                f"Q-table has shape {table.shape}, expected {expected_shape}."
            )
        self.q_table = table

    def test(self, data):
        """Run the greedy policy over `data` without updating the Q-table.

        Args:
            data: a single price series (list, array, Series or one-column
                DataFrame).

        Returns:
            tuple: (actions, rewards, cumulative_reward) where `actions` and
            `rewards` have one entry per price transition.
        """
        prices = self._as_prices(data)
        self.current_state = 0
        self.cumulative_reward = 0
        actions, rewards = [], []
        for prev_val, val in zip(prices[:-1], prices[1:]):
            action = self._greedy_action()
            actions.append(action)
            reward = self.get_reward(action, prev_val, val)
            rewards.append(reward)
            self.cumulative_reward += reward
        return actions, rewards, self.cumulative_reward

In [22]:
# Seed NumPy's global RNG, which the agent uses for its epsilon-greedy
# exploration (np.random.uniform / np.random.choice), so that a fresh
# "Restart & Run All" reproduces the same training run.
np.random.seed(42)

agent = Q_Learning_Agent(num_iterations = 200, checkpoint = 10)
cumulative_rewards = agent.train(training_stock_data)
agent.plot_progress(cumulative_rewards)

Iteration:  1


  5%|▌         | 106/2104 [01:51<35:02,  1.05s/it]


KeyboardInterrupt: 

In [9]:
# Run the agent on the held-out data and plot the action chosen at each step.
actions, rewards, cumulative_reward = agent.test(testing_stock_data)

fig, ax = plt.subplots()
ax.plot(actions)
ax.set(title="Actions on the test set", xlabel="Test step", ylabel="Action")
plt.show()

ValueError: not enough values to unpack (expected 3, got 2)

In [10]:
# Plot the reward obtained at each step of the test run.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set(title="Rewards on the test set", xlabel="Test step", ylabel="Reward")
plt.show()

NameError: name 'rewards' is not defined

In [11]:
# Report the total reward returned by agent.test() on the held-out data.
print("Cumulative Reward :", cumulative_reward)

NameError: name 'cumulative_reward' is not defined