In [None]:
#!pip install cvxpy optuna

import os
import numpy as np
import pandas as pd
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from torch.utils.tensorboard import SummaryWriter
import pickle
import cvxpy as cp
import matplotlib.pyplot as plt
import optuna

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def save_study(study, filename):
    """Pickle ``study`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is replaced.
    """
    serialized = pickle.dumps(study)
    with open(filename, mode='wb') as handle:
        handle.write(serialized)

def load_study(filename):
    """Return the object pickled in ``filename``, or ``None`` if it is not a file.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    this program wrote itself.
    """
    if not os.path.isfile(filename):
        return None
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

def save_checkpoint(agent, episode, filename):
    """Write a resumable training checkpoint to ``filename`` with ``torch.save``.

    The checkpoint is a dict holding the agent's ``state_dict``, the state of
    ``agent.optimizer_actor`` and ``agent.optimizer_critic``, and the given
    ``episode`` value.
    """
    checkpoint = {}
    checkpoint['agent_state_dict'] = agent.state_dict()
    checkpoint['optimizer_actor_state_dict'] = agent.optimizer_actor.state_dict()
    checkpoint['optimizer_critic_state_dict'] = agent.optimizer_critic.state_dict()
    checkpoint['episode'] = episode
    torch.save(checkpoint, filename)

def load_checkpoint(agent, filename):
    """Restore agent and optimizer state from ``filename`` when possible.

    Loading is all-or-nothing: if any part of the checkpoint cannot be
    applied (for example it was written for a differently sized network),
    the agent and both optimizers are rolled back to the state they had on
    entry. This matters because ``Module.load_state_dict`` copies every
    shape-compatible tensor before it raises, so a failed load would
    otherwise leave the agent partially overwritten with stale weights.

    Args:
        agent: object exposing ``state_dict``/``load_state_dict`` plus
            ``optimizer_actor`` and ``optimizer_critic`` attributes.
        filename: path of a checkpoint written by ``save_checkpoint``.

    Returns:
        The value stored under ``'episode'`` on success, or ``0`` when the
        file does not exist or could not be loaded.
    """
    import copy

    if not os.path.isfile(filename):
        return 0

    # Deep copies, because state_dict() hands back tensors that share storage
    # with the live parameters.
    snapshot_agent = copy.deepcopy(agent.state_dict())
    snapshot_actor = copy.deepcopy(agent.optimizer_actor.state_dict())
    snapshot_critic = copy.deepcopy(agent.optimizer_critic.state_dict())

    try:
        checkpoint = torch.load(filename, map_location=device)
        agent.load_state_dict(checkpoint['agent_state_dict'])
        agent.optimizer_actor.load_state_dict(checkpoint['optimizer_actor_state_dict'])
        agent.optimizer_critic.load_state_dict(checkpoint['optimizer_critic_state_dict'])
        episode = checkpoint['episode']
    except Exception as e:
        # Deliberately broad: an unusable checkpoint must never abort training.
        # Undo whatever was applied before the failure.
        agent.load_state_dict(snapshot_agent)
        agent.optimizer_actor.load_state_dict(snapshot_actor)
        agent.optimizer_critic.load_state_dict(snapshot_critic)
        print("Checkpoint exists but failed to load due to error:", e, "Training from scratch.")
        return 0

    print(f"Checkpoint loaded from episode {episode}")
    return episode

def save_agent(agent, filename):
    """Write only the agent's ``state_dict`` (no optimizer state) to ``filename``."""
    weights = agent.state_dict()
    torch.save(weights, filename)

def load_agent(agent, filename):
    """Load a ``state_dict`` saved at ``filename`` into ``agent``.

    Does nothing when ``filename`` is not an existing file. Tensors are
    mapped onto the module-level ``device`` while loading.
    """
    if not os.path.isfile(filename):
        return
    saved_weights = torch.load(filename, map_location=device)
    agent.load_state_dict(saved_weights)

# Raw dataset; every column named below is expected to be present in it.
data = pd.read_csv('input.csv')

# Column groups. The stock columns come first in `features`, so positions
# 0..len(stock_headers)-1 of a feature row are the stock columns.
stock_headers = [
    'INFY', 'BSOFT', 'BBOX', 'ACCELYA', 'HBLPOWER',
    'BOSCHLTD', 'NCC', 'AUROPHARMA', 'NATCOPHARM', 'SHRIRAMFIN',
    'HINDUNILVR', 'SBIN', 'DRREDDY', 'BHARTIARTL', 'ONGC',
]
bond_headers = ['IN5Y', 'IN10Y']
macro_headers = [
    'Inflation', 'GDP', 'Unemployment', 'Repo Rate',
    'Corporate Tax rate', 'IIP', 'Exchange Rate',
]
tech_indicators = [
    'SMA_20', 'EMA_20', 'EMA_50', 'RSI',
    'BB_High', 'BB_Low', 'BB_Mid',
    'MACD', 'MACD_Signal', 'MACD_Diff',
    'ATR', 'Stoch', 'Stoch_Signal',
    'SMA_20_x_RSI', 'SMA_20_x_MACD', 'RSI_x_MACD',
]
features = stock_headers + bond_headers + macro_headers + tech_indicators

# Row-order split: the first 90% of rows are used for training, the rest are held out.
split_idx = int(len(data) * 0.9)
train_data = data.iloc[:split_idx].copy()
test_data = data.iloc[split_idx:].copy()

# Clean the training features: infinities become NaN, then gaps are filled
# forward and any leading gaps are filled backward.
train_data.loc[:, features] = train_data[features].replace([np.inf, -np.inf], np.nan)
train_data.loc[:, features] = train_data[features].ffill().bfill()

# Feature matrix for the environment, and the return covariance of the stock columns.
state_data = train_data[features].to_numpy()
price_data = train_data[stock_headers]
daily_returns_stocks = price_data.pct_change().dropna()
cov_matrix = daily_returns_stocks.cov().to_numpy()

class PortfolioEnv(gym.Env):
    """Gym environment that walks once through the rows of ``state_data``.

    The observation is the current row. An action is one weight per entry of
    ``stock_indices``; the realised return of each stock is taken from the
    ratio of its value in the next row to its value in the current row.

    Args:
        state_data: 2-D array of shape ``(n_steps, n_features)``.
        stock_indices: column positions in ``state_data`` that hold the
            tradable stock values.
        cov_matrix: covariance matrix used to estimate portfolio volatility
            for the reward.
        risk_free_rate: value subtracted from the single-step portfolio
            return in the reward's Sharpe-style ratio.
            NOTE(review): the default 0.03 looks like an annualised figure
            while the return it is subtracted from spans one row — confirm
            the intended units.
    """

    def __init__(self, state_data, stock_indices, cov_matrix, risk_free_rate=0.03):
        super(PortfolioEnv, self).__init__()
        self.state_data = state_data
        self.stock_indices = stock_indices
        self.n_steps, self.n_features = state_data.shape
        self.action_space = spaces.Box(low=0, high=1, shape=(len(stock_indices),), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.n_features,), dtype=np.float32)
        self.current_step = 0
        self.done = False
        self.risk_free_rate = risk_free_rate
        self.cov_matrix = cov_matrix
        self.cumulative_return = 1.0

    def reset(self):
        """Rewind to the first row and return it as the initial observation."""
        self.current_step = 0
        self.done = False
        self.cumulative_return = 1.0
        return self.state_data[self.current_step]

    def step(self, action_weights):
        """Apply ``action_weights`` over the transition to the next row.

        Returns:
            ``(next_observation, reward, done, info)``. ``done`` becomes
            ``True`` on the step that consumes the last available transition,
            and that step still carries a real reward and a
            ``'portfolio_return'`` entry in ``info``. Calling ``step`` again
            after that returns a zero reward and an ``info`` holding only
            ``'cumulative_return'``.
        """
        last_step = self.n_steps - 1
        if self.done or self.current_step >= last_step:
            # No row is left to transition into.
            self.done = True
            return self.state_data[self.current_step], 0, self.done, {'cumulative_return': self.cumulative_return}

        weights = np.asarray(action_weights, dtype=np.float64)
        price_today = self.state_data[self.current_step, self.stock_indices]
        price_next = self.state_data[self.current_step + 1, self.stock_indices]
        returns = (price_next / price_today) - 1

        portfolio_return = np.dot(weights, returns)
        portfolio_volatility = np.sqrt(np.dot(weights, np.dot(self.cov_matrix, weights)))
        # The small constant keeps the ratio finite for zero volatility.
        sharpe_ratio = (portfolio_return - self.risk_free_rate) / (portfolio_volatility + 1e-10)
        reward = sharpe_ratio - 0.01 * portfolio_volatility

        self.cumulative_return *= (1 + portfolio_return)
        info = {'portfolio_return': portfolio_return, 'cumulative_return': self.cumulative_return}

        self.current_step += 1
        # Flag termination on the final real transition instead of on an
        # extra zero-reward step; the original also skipped the transition
        # from row n-2 to row n-1.
        self.done = self.current_step >= last_step
        return self.state_data[self.current_step], reward, self.done, info

class ReplayBuffer:
    """Fixed-capacity transition store that overwrites its oldest slot once full."""

    def __init__(self, max_size=1000000):
        self.max_size = max_size
        self.ptr = 0  # slot that the next overwrite will target
        self.buffer = []

    def add(self, state, action, reward, next_state, done):
        """Store one transition, replacing the entry at ``ptr`` when at capacity."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) >= self.max_size:
            self.buffer[self.ptr] = transition
        else:
            self.buffer.append(transition)
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns a 5-tuple of arrays: states, actions, rewards, next states
        and done flags.
        """
        chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[position] for position in chosen]
        states, actions, rewards, next_states, dones = zip(*batch)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(np.array(column) for column in columns)

class MultiHeadAttention(nn.Module):
    """Bank of independent linear projections whose outputs are concatenated.

    Each of the ``num_heads`` heads is an ``nn.Linear`` mapping ``input_dim``
    to ``output_dim // num_heads`` features. ``output_dim`` must be an exact
    multiple of ``num_heads``. No query/key/value attention weights are
    computed in this module.
    """

    def __init__(self, input_dim, num_heads, output_dim):
        super().__init__()
        self.num_heads = num_heads
        self.output_dim = output_dim
        self.head_dim = output_dim // num_heads
        assert self.head_dim * num_heads == self.output_dim
        heads = nn.ModuleList()
        for _ in range(num_heads):
            heads.append(nn.Linear(input_dim, self.head_dim))
        self.attention_heads = heads

    def forward(self, x):
        """Apply every head to ``x`` and join the results along the last axis."""
        projected = []
        for head in self.attention_heads:
            projected.append(head(x))
        return torch.cat(projected, dim=-1)

class PPOAgent(nn.Module):
    """PPO actor-critic that shares a ``MultiHeadAttention`` feature layer.

    The actor outputs a mean and a log standard deviation per action
    dimension; actions are a ``tanh`` of a sample from that Gaussian. The
    critic outputs one value per state.

    Args:
        state_dim: number of input features per state.
        action_dim: number of action components.
        hidden_dim: width of the two hidden layers of actor and critic.
        num_heads, output_dim_base: the shared feature layer produces
            ``output_dim_base * num_heads`` features.
        lr: Adam learning rate for both optimizers.
        eps_clip: PPO clipping range for the probability ratio.
        gamma: discount factor used by ``compute_returns``.
        entropy_weight: initial coefficient of the entropy bonus.
    """

    # Bounds applied to the actor's log-std output wherever a policy is built.
    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def __init__(self, state_dim, action_dim, hidden_dim, num_heads, output_dim_base, lr, eps_clip, gamma, entropy_weight):
        super(PPOAgent, self).__init__()
        feature_dim = output_dim_base * num_heads
        self.attention = MultiHeadAttention(state_dim, num_heads, feature_dim)
        self.actor = self.build_network(feature_dim, action_dim * 2, hidden_dim, final_activation=None)
        self.critic = self.build_network(feature_dim, 1, hidden_dim, final_activation=None)
        # The shared feature layer is optimised together with the actor;
        # previously its parameters belonged to no optimizer and were never
        # updated.
        actor_side_params = list(self.attention.parameters()) + list(self.actor.parameters())
        self.optimizer_actor = optim.Adam(actor_side_params, lr=lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor, step_size=100, gamma=0.9)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic, step_size=100, gamma=0.9)
        self.eps_clip = eps_clip
        self.gamma = gamma
        self.entropy_weight = entropy_weight
        self.entropy_decay_rate = 0.995
        self.regularization_weight = 0.001

    def build_network(self, input_dim, output_dim, hidden_dim, final_activation=None):
        """Return a two-hidden-layer ReLU MLP, optionally ending in ``final_activation``."""
        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, output_dim)]
        if final_activation is not None:
            layers.append(final_activation)
        return nn.Sequential(*layers)

    def _policy_distribution(self, features):
        """Build the Gaussian policy for ``features`` with a clamped log-std.

        Used by both ``select_action`` and ``update`` so the old and new
        log-probabilities come from identically parameterised distributions.
        """
        mean, log_std = self.actor(features).chunk(2, dim=-1)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        return Normal(mean, log_std.exp())

    def select_action(self, state):
        """Sample an action for a single ``state``.

        Returns:
            ``(action, log_prob, entropy)``: ``action`` is a NumPy array of
            ``tanh``-squashed values; ``log_prob`` and ``entropy`` are
            gradient-free tensors of shape ``(1,)``.
        """
        state_tensor = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0).to(device)
        # Rollout statistics are treated as constants by ``update``, so no
        # autograd graph needs to be kept for every environment step.
        with torch.no_grad():
            normal = self._policy_distribution(self.attention(state_tensor))
            action = torch.tanh(normal.sample())
            # NOTE(review): the density is evaluated at the squashed action,
            # not at the raw Gaussian sample. ``update`` mirrors this, so the
            # PPO ratio is self-consistent; evaluating at the raw sample would
            # need this method to return it as well.
            log_prob = normal.log_prob(action).sum(-1)
            entropy = normal.entropy().sum(-1)
        return action.squeeze(0).cpu().numpy(), log_prob, entropy

    def compute_returns(self, rewards, dones):
        """Return the discounted return of every step as a float tensor.

        The running return is reset after any step whose ``done`` flag is set.
        """
        returns = []
        running = 0.0
        for r, d in zip(reversed(rewards), reversed(dones)):
            running = r + self.gamma * running * (1.0 - float(d))
            returns.append(running)
        returns.reverse()  # built back-to-front; avoids O(n^2) front inserts
        return torch.FloatTensor(returns)

    def update(self, states, actions, log_probs, returns, entropies):
        """Run ten PPO epochs over one collected trajectory.

        Args:
            states: sequence of state vectors.
            actions: sequence of actions as returned by ``select_action``.
            log_probs: sequence of rollout log-probability tensors.
            returns: discounted returns, one per step.
            entropies: accepted for backward compatibility and unused; the
                entropy bonus is computed from the current policy because
                rollout entropies carry no gradient.
        """
        states_tensor = torch.as_tensor(np.array(states), dtype=torch.float32).to(device)
        actions_tensor = torch.as_tensor(np.array(actions), dtype=torch.float32).to(device)
        # Flatten to shape (T,). Stacking the per-step (1,) tensors gives
        # (T, 1), which broadcast against the (T,) new log-probs into a
        # (T, T) ratio matrix.
        old_log_probs = torch.stack(list(log_probs)).detach().reshape(-1).to(device)
        returns_tensor = torch.as_tensor(returns, dtype=torch.float32).reshape(-1).to(device)

        # Advantages come from the critic as it was before this update.
        with torch.no_grad():
            baseline = self.critic(self.attention(states_tensor)).squeeze(-1)
            advantages = returns_tensor - baseline

        for _ in range(10):
            # Recomputed every epoch because the parameters change each step.
            features = self.attention(states_tensor)
            normal = self._policy_distribution(features)
            new_log_probs = normal.log_prob(actions_tensor).sum(-1)
            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            # Regress live critic outputs onto the returns; the former loss
            # used detached values and gave the critic no gradient.
            values = self.critic(features).squeeze(-1)
            critic_loss = ((returns_tensor - values) ** 2).mean()
            entropy_loss = -normal.entropy().sum(-1).mean() * self.entropy_weight
            l2_reg = sum(torch.norm(param) for param in self.actor.parameters())
            loss = actor_loss + 0.5 * critic_loss + entropy_loss + self.regularization_weight * l2_reg
            self.optimizer_actor.zero_grad()
            self.optimizer_critic.zero_grad()
            loss.backward()
            self.optimizer_actor.step()
            self.optimizer_critic.step()
            print(f"Update: Actor Loss={actor_loss.item():.4f}, Critic Loss={critic_loss.item():.4f}, Entropy Loss={entropy_loss.item():.4f}, Total Loss={loss.item():.4f}")
        self.scheduler_actor.step()
        self.scheduler_critic.step()
        torch.cuda.empty_cache()

    def decay_entropy_weight(self):
        """Multiply the entropy coefficient by ``entropy_decay_rate``."""
        self.entropy_weight *= self.entropy_decay_rate

def objective(trial):
    """Optuna objective: briefly train a PPO agent and score its rewards.

    A fresh environment and agent are built from the hyperparameters the
    trial suggests. The agent is updated after every episode so that the
    learning-related hyperparameters (``lr``, ``eps_clip``, ``gamma``,
    ``entropy_weight``) actually affect the score; without the update every
    trial would only measure a randomly initialised policy.

    Returns:
        The negated mean per-episode reward sum (the study minimises it).
    """
    hidden_dim = trial.suggest_int("hidden_dim", 32, 256)
    lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
    eps_clip = trial.suggest_float("eps_clip", 0.1, 0.3)
    gamma = trial.suggest_float("gamma", 0.95, 0.999)
    entropy_weight = trial.suggest_float("entropy_weight", 1e-4, 1e-1, log=True)
    num_heads = trial.suggest_int("num_heads", 2, 8)
    output_dim_base = trial.suggest_int("output_dim_base", 8, 32)

    env_opt = PortfolioEnv(state_data, stock_indices=range(len(stock_headers)), cov_matrix=cov_matrix)
    state_dim = state_data.shape[1]
    action_dim = len(stock_headers)
    agent = PPOAgent(state_dim, action_dim, hidden_dim, num_heads, output_dim_base, lr, eps_clip, gamma, entropy_weight).to(device)

    episodes = 50
    total_reward = 0.0
    for ep in range(episodes):
        state = env_opt.reset()
        states, actions, log_probs, ep_rewards, dones, entropies = [], [], [], [], [], []
        done = False
        while not done:
            action, log_prob, entropy = agent.select_action(state)
            next_state, reward, done, info = env_opt.step(action)
            states.append(state)
            actions.append(action)
            log_probs.append(log_prob)
            ep_rewards.append(reward)
            dones.append(done)
            entropies.append(entropy)
            state = next_state
        # Same per-episode learning step that train_ppo performs. It is done
        # here directly so a trial never reads or writes checkpoint files.
        returns = agent.compute_returns(ep_rewards, dones)
        agent.update(states, actions, log_probs, returns, entropies)
        agent.decay_entropy_weight()
        total_reward += np.sum(ep_rewards)
    avg_reward = total_reward / episodes
    return -avg_reward

def run_hyperparameter_optimization(n_trials=20):
    """Run an in-memory Optuna study over ``objective`` and return the best parameters.

    The study minimises the objective for ``n_trials`` trials; the winning
    parameter dict is printed and returned.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_trial.params
    print("Best hyperparameters:", best_params)
    return best_params

def train_ppo(env, agent, episodes=1000, writer=None, save_interval=10, checkpoint_path="ppo_checkpoint.pth"):
    """Train ``agent`` on ``env`` with one PPO update per episode.

    Training resumes from ``checkpoint_path`` when ``load_checkpoint`` can
    restore it. Checkpoints store the number of episodes already completed,
    so a resumed run continues with the next episode instead of repeating
    the one that was just saved. A checkpoint is also written after the
    final episode.

    Args:
        env: environment with ``reset()`` and a 4-tuple ``step(action)``.
        agent: agent exposing ``select_action``, ``compute_returns``,
            ``update`` and ``decay_entropy_weight``.
        episodes: total number of episodes to reach, counting resumed ones.
        writer: optional TensorBoard-style writer with ``add_scalar``.
        save_interval: checkpoint whenever the episode index is a multiple
            of this value; ``0`` or ``None`` disables periodic checkpoints.
        checkpoint_path: file used for both loading and saving.
    """
    start_episode = load_checkpoint(agent, checkpoint_path)
    for ep in range(start_episode, episodes):
        state = env.reset()
        states, actions, log_probs, rewards, dones, entropies = [], [], [], [], [], []
        done = False
        while not done:
            action, log_prob, entropy = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            states.append(state)
            actions.append(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            dones.append(done)
            entropies.append(entropy)
            state = next_state
        returns = agent.compute_returns(rewards, dones)
        agent.update(states, actions, log_probs, returns, entropies)
        episode_reward = np.sum(rewards)
        print(f"Episode {ep}: RL Reward Sum = {episode_reward:.4f}, Portfolio Value = {info.get('cumulative_return', np.nan):.4f}")
        if writer is not None:
            writer.add_scalar("RL Reward", episode_reward, ep)
            writer.add_scalar("Portfolio Value", info.get('cumulative_return', 0), ep)
        agent.decay_entropy_weight()

        completed = ep + 1
        periodic_save = bool(save_interval) and ep % save_interval == 0
        if periodic_save or completed == episodes:
            # Store the count of finished episodes so a resume starts at `completed`.
            save_checkpoint(agent, completed, checkpoint_path)
    print("Training complete.")

def mean_variance_optimization(expected_returns, cov_matrix, risk_aversion=1):
    """Solve a long-only, fully invested mean-variance problem.

    Maximises ``expected_returns @ w - risk_aversion * w' cov_matrix w``
    subject to ``sum(w) == 1`` and ``w >= 0``.

    Args:
        expected_returns: 1-D sequence of expected returns per asset.
        cov_matrix: square covariance matrix matching ``expected_returns``.
        risk_aversion: weight of the variance penalty.

    Returns:
        A NumPy array of non-negative weights that sum to one.

    Raises:
        ValueError: if the solver returns no solution or a degenerate one.
    """
    expected_returns = np.asarray(expected_returns, dtype=float)
    n = len(expected_returns)
    w = cp.Variable(n)
    utility = cp.Maximize(expected_returns.T @ w - risk_aversion * cp.quad_form(w, cov_matrix))
    constraints = [cp.sum(w) == 1, w >= 0]
    problem = cp.Problem(utility, constraints)
    problem.solve()
    if w.value is None:
        # Infeasible, unbounded or failed solves leave the variable unset.
        raise ValueError(f"Mean-variance optimization did not produce a solution (status: {problem.status})")
    # Solvers satisfy w >= 0 only up to tolerance; remove tiny negative noise
    # before normalising.
    weights = np.clip(np.asarray(w.value, dtype=float), 0.0, None)
    total = np.sum(weights)
    if not np.isfinite(total) or total <= 0:
        raise ValueError("Mean-variance optimization returned degenerate weights")
    return weights / total

# Mean-variance baseline weights, estimated from the stock columns of 'MVO.csv'.
mvo_file_path = 'MVO.csv'
mvo_data = pd.read_csv(mvo_file_path)
# Period-over-period returns; rows containing NaN (including the first) are dropped.
daily_returns_mvo = mvo_data.loc[:, stock_headers].pct_change().dropna()
expected_returns_mvo = daily_returns_mvo.mean().to_numpy()
cov_matrix_mvo = daily_returns_mvo.cov().to_numpy()
mvo_weights = mean_variance_optimization(
    expected_returns=expected_returns_mvo,
    cov_matrix=cov_matrix_mvo,
)
print("MVO Weights:", mvo_weights)
print("Sum of MVO Weights:", mvo_weights.sum())

def combine_weights(ppo_weights, mvo_weights, alpha):
    """Blend two weight vectors linearly.

    ``alpha`` is the share given to ``mvo_weights``: 0 returns the PPO
    weights unchanged and 1 returns the MVO weights.
    """
    ppo_part = (1 - alpha) * ppo_weights
    mvo_part = alpha * mvo_weights
    return ppo_part + mvo_part

def evaluate_combined_weights(env, ppo_agent, alpha_values):
    """Run one full episode per alpha using blended PPO/MVO weights.

    At every step the agent's action is truncated to ``len(stock_headers)``
    entries and blended with the module-level ``mvo_weights``.

    Returns:
        A list of ``(alpha, final_value)`` tuples, where ``final_value`` is
        the ``'cumulative_return'`` reported by the last step (NaN if absent).
    """
    n_stocks = len(stock_headers)
    results = []
    for alpha in alpha_values:
        state = env.reset()
        finished = False
        while not finished:
            ppo_weights, _, _ = ppo_agent.select_action(state)
            blended = combine_weights(ppo_weights[:n_stocks], mvo_weights, alpha)
            state, _reward, finished, info = env.step(blended)
        final_value = info.get('cumulative_return', np.nan)
        print(f"Alpha {alpha:.2f}: Final Portfolio Value = {final_value:.4f}")
        results.append((alpha, final_value))
    return results

def calculate_performance_metrics(returns):
    """Summarise a series of per-period returns.

    Args:
        returns: 1-D sequence (list or array) of per-period returns.

    Returns:
        A dict with these keys:
        - ``'ROI'``: sum of the returns.
        - ``'Sharpe Ratio'``: mean divided by standard deviation.
        - ``'Sortino Ratio'``: mean divided by the standard deviation of the
          negative returns.
        - ``'Maximum Drawdown'``: largest fall of the cumulative-sum curve
          below its running peak, with the curve starting at 0.
        - ``'Calmar Ratio'``: ROI divided by the maximum drawdown.
        Every ratio adds ``1e-10`` to its denominator. An empty input yields
        0.0 for every metric.
    """
    returns = np.asarray(returns, dtype=float).ravel()
    if returns.size == 0:
        return {
            'ROI': 0.0,
            'Sharpe Ratio': 0.0,
            'Sortino Ratio': 0.0,
            'Maximum Drawdown': 0.0,
            'Calmar Ratio': 0.0,
        }

    mean_return = np.mean(returns)
    metrics = {}
    metrics['ROI'] = np.sum(returns)
    metrics['Sharpe Ratio'] = mean_return / (np.std(returns) + 1e-10)

    downside_returns = returns[returns < 0]
    # np.std of an empty array is NaN; with no losing period the downside
    # deviation is zero instead.
    downside_std = np.std(downside_returns) if downside_returns.size else 0.0
    metrics['Sortino Ratio'] = mean_return / (downside_std + 1e-10)

    # A drawdown is measured from the highest point reached so far, not from
    # the global maximum, which may occur after the trough.
    equity_curve = np.concatenate(([0.0], np.cumsum(returns)))
    running_peak = np.maximum.accumulate(equity_curve)
    metrics['Maximum Drawdown'] = np.max(running_peak - equity_curve)
    metrics['Calmar Ratio'] = metrics['ROI'] / (metrics['Maximum Drawdown'] + 1e-10)
    return metrics

def backtest(env, agent, alpha_values):
    """Compute performance metrics of the blended PPO/MVO strategy per alpha.

    For each alpha one full episode is run; at every step the agent's action
    is truncated to ``len(stock_headers)`` entries and blended with the
    module-level ``mvo_weights``.

    Returns:
        A dict mapping each alpha to the metrics dict produced by
        ``calculate_performance_metrics``.
    """
    backtest_results = {}
    for alpha in alpha_values:
        state = env.reset()
        done = False
        returns_list = []
        while not done:
            ppo_weights, _, _ = agent.select_action(state)
            ppo_stock_weights = ppo_weights[:len(stock_headers)]
            combined = combine_weights(ppo_stock_weights, mvo_weights, alpha)
            next_state, reward, done, info = env.step(combined)
            # Only record steps that realised a return. A step whose info has
            # no 'portfolio_return' must not add a fabricated 0 to the series.
            if 'portfolio_return' in info:
                returns_list.append(info['portfolio_return'])
            state = next_state
        if not returns_list:
            # Degenerate episode with no transition: keep the former single-zero series.
            returns_list = [0]
        metrics = calculate_performance_metrics(np.array(returns_list))
        backtest_results[alpha] = metrics
        print(f"Alpha {alpha:.2f}: Metrics = {metrics}")
    return backtest_results

def equal_weight_portfolio(stock_headers):
    """Return (and print) a 1/N weight vector with one entry per header."""
    count = len(stock_headers)
    weights = np.full(count, 1.0) / count
    print("Equal Weight Portfolio Weights:", weights)
    print("Sum of Weights:", weights.sum())
    return weights

def minimum_variance_portfolio(daily_returns):
    """Compute long-only, fully invested minimum-variance weights.

    Minimises ``w' C w`` subject to ``sum(w) == 1`` and ``w >= 0``, where
    ``C`` is the covariance of the columns of ``daily_returns``.

    Args:
        daily_returns: DataFrame with one column of returns per asset.

    Returns:
        A NumPy array of non-negative weights that sum to one (also printed).

    Raises:
        ValueError: if the solver returns no solution or a degenerate one.
    """
    cov_matrix_local = daily_returns.cov().values
    n = len(daily_returns.columns)
    w = cp.Variable(n)
    variance = cp.Minimize(cp.quad_form(w, cov_matrix_local))
    constraints = [cp.sum(w) == 1, w >= 0]
    problem = cp.Problem(variance, constraints)
    problem.solve()
    if w.value is None:
        # Infeasible or failed solves leave the variable unset.
        raise ValueError(f"Minimum variance optimization did not produce a solution (status: {problem.status})")
    # Remove tiny negative solver noise before normalising.
    weights = np.clip(np.asarray(w.value, dtype=float), 0.0, None)
    total = np.sum(weights)
    if not np.isfinite(total) or total <= 0:
        raise ValueError("Minimum variance optimization returned degenerate weights")
    weights = weights / total
    print("Minimum Variance Portfolio Weights:", weights)
    print("Sum of Weights:", np.sum(weights))
    return weights

def evaluate_static_portfolio(env, weights):
    """Hold ``weights`` fixed for one full episode and return its metrics.

    Args:
        env: environment with ``reset()`` and a 4-tuple ``step(action)``
            whose ``info`` dict may carry ``'portfolio_return'``.
        weights: weight vector passed unchanged to every ``env.step`` call.

    Returns:
        The metrics dict produced by ``calculate_performance_metrics``.
    """
    env.reset()
    done = False
    portfolio_returns = []
    while not done:
        _, _, done, info = env.step(weights)
        # Only record steps that realised a return; a missing key must not
        # add a fabricated 0 to the series.
        if 'portfolio_return' in info:
            portfolio_returns.append(info['portfolio_return'])
    if not portfolio_returns:
        # Degenerate episode with no transition: keep the former single-zero series.
        portfolio_returns = [0]
    return calculate_performance_metrics(np.array(portfolio_returns))

if __name__ == '__main__':
    # Pipeline: tune hyperparameters, train the final agent, then compare
    # the PPO/MVO blend with two static baselines.
    writer = SummaryWriter(log_dir='./tensorboard_logs')
    try:
        # NOTE(review): every evaluation below reuses this environment, which
        # is built from the training split; `test_data` is not used anywhere
        # in the visible code. Confirm that in-sample evaluation is intended.
        env = PortfolioEnv(state_data, stock_indices=range(len(stock_headers)), cov_matrix=cov_matrix)
        print("Starting Hyperparameter Optimization...")
        best_params = run_hyperparameter_optimization(n_trials=20)
        print("Using Best Hyperparameters:", best_params)
        state_dim = state_data.shape[1]
        action_dim = len(stock_headers)
        ppo_agent = PPOAgent(state_dim, action_dim, best_params["hidden_dim"], best_params["num_heads"], best_params["output_dim_base"], best_params["lr"], best_params["eps_clip"], best_params["gamma"], best_params["entropy_weight"]).to(device)
        num_episodes = 1000
        print("Starting Final Training...")
        train_ppo(env, ppo_agent, episodes=num_episodes, writer=writer, save_interval=10, checkpoint_path="ppo_checkpoint.pth")
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        writer.close()

    # Final portfolio value for each PPO/MVO blend.
    alpha_values = np.linspace(0, 1, 11)
    combined_results = evaluate_combined_weights(env, ppo_agent, alpha_values)
    results_df = pd.DataFrame(combined_results, columns=['Alpha', 'Final Portfolio Value'])
    results_df.to_csv('combined_weights_results.csv', index=False)
    plt.figure(figsize=(10,6))
    plt.plot(results_df['Alpha'], results_df['Final Portfolio Value'], marker='o')
    plt.title('Final Portfolio Value for Different Alpha Values (PPO-MVO Combination)')
    plt.xlabel('Alpha')
    plt.ylabel('Final Portfolio Value')
    plt.grid(True)
    plt.savefig('combined_weights_performance.png')
    plt.show()

    # Per-alpha performance metrics.
    backtest_results = backtest(env, ppo_agent, alpha_values)
    backtest_df = pd.DataFrame(backtest_results).T
    # The index holds the alpha of each row; writing it keeps the CSV rows identifiable.
    backtest_df.to_csv('backtest_results.csv', index_label='Alpha')
    backtest_df.plot(kind='bar', figsize=(12,8))
    plt.title('Backtest Performance Metrics for Different Alpha Values')
    plt.xlabel('Alpha')
    plt.ylabel('Metrics')
    plt.legend(loc='best')
    plt.savefig('backtest_performance_metrics.png')
    plt.show()

    # Static baselines.
    equal_weights = equal_weight_portfolio(stock_headers)
    equal_weight_metrics = evaluate_static_portfolio(env, equal_weights)
    min_variance_weights = minimum_variance_portfolio(daily_returns_mvo)
    min_variance_metrics = evaluate_static_portfolio(env, min_variance_weights)

    metrics_list = ['ROI','Sharpe Ratio','Sortino Ratio','Maximum Drawdown','Calmar Ratio']
    # Use the key object that backtest() stored (the second alpha, i.e. 0.1)
    # rather than a float literal that only matches if the grid reproduces it exactly.
    reference_alpha = alpha_values[1]
    ppo_mvo_metrics = backtest_results[reference_alpha]
    combined_metrics = {'PPO-MVO': ppo_mvo_metrics, 'Equal Weight': equal_weight_metrics, 'Minimum Variance': min_variance_metrics}
    combined_metrics_df = pd.DataFrame(combined_metrics, index=metrics_list)
    for metric in metrics_list:
        plt.figure(figsize=(10,6))
        combined_metrics_df.loc[metric].plot(kind='bar', legend=True)
        plt.title(f'Comparison of {metric} Across Portfolio Techniques', fontsize=14)
        plt.xlabel('Portfolio Technique', fontsize=12)
        plt.ylabel(metric, fontsize=12)
        plt.xticks(rotation=0)
        plt.grid(True)
        plt.show()


[I 2025-03-23 21:09:47,691] A new study created in memory with name: no-name-c273abc6-deb8-4c0b-b307-9a3750f83dab


MVO Weights: [ 1.99919355e-22  1.16838239e-01  1.29674746e-01  2.69604640e-01
  1.01358024e-01  1.43953446e-22  2.20026418e-23  4.09092310e-22
  3.82524351e-01  8.73035286e-23 -1.21645007e-23  2.03881335e-22
  2.13469639e-22  1.49301532e-22  1.44913988e-22]
Sum of MVO Weights: 1.0
Starting Hyperparameter Optimization...


[I 2025-03-23 21:10:04,438] Trial 0 finished with value: 608.8915634011253 and parameters: {'hidden_dim': 101, 'lr': 0.0008026126483857272, 'eps_clip': 0.1952275225762871, 'gamma': 0.9915759104524109, 'entropy_weight': 0.047265766398529724, 'num_heads': 4, 'output_dim_base': 18}. Best is trial 0 with value: 608.8915634011253.
[I 2025-03-23 21:10:22,980] Trial 1 finished with value: 571.003688211817 and parameters: {'hidden_dim': 159, 'lr': 0.00015865847923221107, 'eps_clip': 0.15045202901488505, 'gamma': 0.9731701421968795, 'entropy_weight': 0.006550534205638664, 'num_heads': 5, 'output_dim_base': 16}. Best is trial 1 with value: 571.003688211817.
[I 2025-03-23 21:10:43,223] Trial 2 finished with value: 593.0885285262061 and parameters: {'hidden_dim': 60, 'lr': 2.1449005747090203e-05, 'eps_clip': 0.2661656760604869, 'gamma': 0.9974223791053068, 'entropy_weight': 0.0003232665877144886, 'num_heads': 8, 'output_dim_base': 23}. Best is trial 1 with value: 571.003688211817.
[I 2025-03-23 21

Best hyperparameters: {'hidden_dim': 159, 'lr': 0.00015865847923221107, 'eps_clip': 0.15045202901488505, 'gamma': 0.9731701421968795, 'entropy_weight': 0.006550534205638664, 'num_heads': 5, 'output_dim_base': 16}
Starting Final Training...


RuntimeError: Error(s) in loading state_dict for PPOAgent:
	Unexpected key(s) in state_dict: "attention.attention_heads.5.weight", "attention.attention_heads.5.bias", "attention.attention_heads.6.weight", "attention.attention_heads.6.bias", "attention.attention_heads.7.weight", "attention.attention_heads.7.bias". 
	size mismatch for attention.attention_heads.0.weight: copying a param with shape torch.Size([28, 40]) from checkpoint, the shape in current model is torch.Size([16, 40]).
	size mismatch for attention.attention_heads.0.bias: copying a param with shape torch.Size([28]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for attention.attention_heads.1.weight: copying a param with shape torch.Size([28, 40]) from checkpoint, the shape in current model is torch.Size([16, 40]).
	size mismatch for attention.attention_heads.1.bias: copying a param with shape torch.Size([28]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for attention.attention_heads.2.weight: copying a param with shape torch.Size([28, 40]) from checkpoint, the shape in current model is torch.Size([16, 40]).
	size mismatch for attention.attention_heads.2.bias: copying a param with shape torch.Size([28]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for attention.attention_heads.3.weight: copying a param with shape torch.Size([28, 40]) from checkpoint, the shape in current model is torch.Size([16, 40]).
	size mismatch for attention.attention_heads.3.bias: copying a param with shape torch.Size([28]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for attention.attention_heads.4.weight: copying a param with shape torch.Size([28, 40]) from checkpoint, the shape in current model is torch.Size([16, 40]).
	size mismatch for attention.attention_heads.4.bias: copying a param with shape torch.Size([28]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for actor.0.weight: copying a param with shape torch.Size([67, 224]) from checkpoint, the shape in current model is torch.Size([159, 80]).
	size mismatch for actor.0.bias: copying a param with shape torch.Size([67]) from checkpoint, the shape in current model is torch.Size([159]).
	size mismatch for actor.2.weight: copying a param with shape torch.Size([67, 67]) from checkpoint, the shape in current model is torch.Size([159, 159]).
	size mismatch for actor.2.bias: copying a param with shape torch.Size([67]) from checkpoint, the shape in current model is torch.Size([159]).
	size mismatch for actor.4.weight: copying a param with shape torch.Size([30, 67]) from checkpoint, the shape in current model is torch.Size([30, 159]).
	size mismatch for critic.0.weight: copying a param with shape torch.Size([67, 224]) from checkpoint, the shape in current model is torch.Size([159, 80]).
	size mismatch for critic.0.bias: copying a param with shape torch.Size([67]) from checkpoint, the shape in current model is torch.Size([159]).
	size mismatch for critic.2.weight: copying a param with shape torch.Size([67, 67]) from checkpoint, the shape in current model is torch.Size([159, 159]).
	size mismatch for critic.2.bias: copying a param with shape torch.Size([67]) from checkpoint, the shape in current model is torch.Size([159]).
	size mismatch for critic.4.weight: copying a param with shape torch.Size([1, 67]) from checkpoint, the shape in current model is torch.Size([1, 159]).