# Model 3: Improvement

The purpose of this model is to use reinforcement learning to improve the prediction capabilities of Model 2.

## Preliminary Setup

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import random
import nbimporter
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

### ACTOR Model Architecture

In [None]:
class Actor(nn.Module):
    """Policy network: one hidden layer of 128 units, five action outputs.

    The forward pass returns a softmax distribution over the five actions
    (the last dimension sums to 1), not raw logits.
    """

    def __init__(self, num_features):
        super().__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 5)

    def forward(self, x):
        hidden = self.fc1(x).relu()  # ReLU on the hidden layer
        action_scores = self.fc2(hidden)
        return action_scores.softmax(dim=-1)

In [None]:
# class Actor(nn.Module):
    
#     def __init__(self, num_features):
#         super(Actor, self).__init__()
#         self.fc1 = nn.Linear(num_features, 128)
#         self.fc2 = nn.Linear(128, 5)

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
        
#         return F.softmax(x, dim=-1)

### ACTOR Functions

In [None]:
def train_actor(model, optimizer, X_train, y_train, epochs):
    """Train the actor with full-batch gradient steps and return the loss history.

    Args:
        model: Network whose forward pass returns class *probabilities*
            (as Actor.forward does via softmax), one row per sample.
        optimizer: Torch optimizer bound to ``model``'s parameters.
        X_train: Float tensor of input features.
        y_train: Long tensor of target class indices.
        epochs: Number of full-batch updates to perform.

    Returns:
        list[float]: The loss value recorded at every epoch.
    """
    loss_list = []

    for epoch in range(epochs):
        optimizer.zero_grad()  # Reset gradients
        probabilities = model(X_train)  # Forward propagation

        # The model already outputs softmax probabilities. F.cross_entropy
        # expects raw logits and would apply log-softmax a second time, which
        # flattens the gradients. Take the log ourselves and use NLL instead.
        # The clamp guards against log(0) for probabilities that underflow.
        log_probabilities = torch.log(probabilities.clamp_min(1e-12))
        loss = F.nll_loss(log_probabilities, y_train)

        loss.backward()  # Backward propagation
        optimizer.step()  # Update parameters

        loss_value = loss.item()
        loss_list.append(loss_value)

        print(f"Epoch {epoch}: Loss = {loss_value}")

    return loss_list

In [None]:
def evaluate_actor(model, X_train, y_train, X_test, y_test):
    """Print and return the actor's accuracy on the train and test splits.

    The predicted class of each sample is the index of the largest model
    output along dimension 1.

    Returns:
        tuple: ``(train_accuracy, test_accuracy)``.
    """
    with torch.no_grad():
        train_predicted = torch.max(model(X_train), 1).indices
        test_predicted = torch.max(model(X_test), 1).indices

    # Score the predictions against the true labels
    train_accuracy = accuracy_score(y_train, train_predicted)
    print(f'Train Accuracy: {train_accuracy}')

    test_accuracy = accuracy_score(y_test, test_predicted)
    print(f'Test Accuracy: {test_accuracy}')

    return train_accuracy, test_accuracy

In [None]:
def predict_action(actor, X_input, valid):
    """Choose an action index for one state, restricted to valid moves.

    Invalid actions are masked to ``-inf``. With probability 1/5
    (``random.randint(1, 5) == 1``) a valid action is drawn uniformly at
    random; otherwise the valid action with the highest actor output is taken.

    Args:
        actor: Callable returning per-action scores for ``X_input``.
        X_input: Input passed straight to ``actor``.
        valid: Boolean mask (array-like) marking which actions are allowed.

    Returns:
        int: Index of the selected action.
    """
    with torch.no_grad():
        scores = actor(X_input)

    allowed = torch.tensor(valid, dtype = torch.bool)
    minus_inf = torch.tensor(float('-inf')).to(scores.dtype)
    masked_scores = torch.where(allowed, scores, minus_inf)

    explore = random.randint(1, 5) == 1

    if not explore:
        # Greedy branch: best-scoring action among the unmasked ones
        return torch.max(masked_scores, 1).indices.item()

    # Exploration branch: uniform draw over the column indices left unmasked
    _, candidate_actions = torch.where(masked_scores != float('-inf'))
    return np.random.choice(candidate_actions).item()

In [None]:
def plot_loss(loss_list):
    """Plot the recorded per-epoch training loss and show the figure."""
    plt.plot(loss_list)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.title('Training Loss per Epoch')
    plt.show()

### CRITIC Model Architecture

In [None]:
class Critic(nn.Module):
    """Value network: one hidden layer of 128 units and a single sigmoid output.

    The output lies in (0, 1) and is trained against binary labels in
    ``train_critic``.
    """

    def __init__(self, num_features):
        super().__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        hidden = self.fc1(x).relu()  # ReLU on the hidden layer
        return self.fc2(hidden).sigmoid()

### CRITIC Functions

In [None]:
def train_critic(model, optimizer, X_train, y_train, epochs):
    """Train the critic with full-batch binary cross-entropy and return the loss history.

    Args:
        model: Network returning one probability per sample, shape ``(N, 1)``.
        optimizer: Torch optimizer bound to ``model``'s parameters.
        X_train: Float tensor of input features.
        y_train: Tensor of 0/1 labels, shape ``(N,)``; cast to float here.
        epochs: Number of full-batch updates to perform.

    Returns:
        list[float]: The loss value recorded at every epoch.
    """
    loss_list = []

    y_train_float = y_train.float()

    for epoch in range(epochs):
        optimizer.zero_grad()  # Reset gradients
        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also collapse the batch dimension when N == 1, producing a 0-d
        # tensor that binary_cross_entropy rejects against a (1,) target.
        outputs = model(X_train).squeeze(-1)  # Forward propagation
        loss = F.binary_cross_entropy(outputs, y_train_float)  # Binary cross-entropy loss
        loss.backward()  # Backward propagation
        optimizer.step()  # Update parameters

        loss_value = loss.item()
        loss_list.append(loss_value)

        print(f"Epoch {epoch}: Loss = {loss_value}")

    return loss_list

In [None]:
def evaluate_critic(model, X_train, y_train, X_test, y_test):
    """Print and return the critic's accuracy on the train and test splits.

    A sample is predicted as class 1 when the critic's output exceeds 0.5.

    Returns:
        tuple: ``(train_accuracy, test_accuracy)``.
    """
    with torch.no_grad():
        # squeeze(-1) removes only the trailing output dimension, so a split
        # containing a single sample stays 1-D instead of collapsing to a
        # 0-d tensor.
        train_outputs = model(X_train).squeeze(-1)
        test_outputs = model(X_test).squeeze(-1)

        # Threshold set to 0.5
        train_predicted = (train_outputs > 0.5).float()
        test_predicted = (test_outputs > 0.5).float()

    # Calculate train and test accuracy
    train_accuracy = accuracy_score(y_train, train_predicted)
    print(f'Train Accuracy: {train_accuracy}')

    test_accuracy = accuracy_score(y_test, test_predicted)
    print(f'Test Accuracy: {test_accuracy}')

    return train_accuracy, test_accuracy

In [None]:
def predict_prob_success(critic, X_input):
    """Return the critic's output for a single input as a Python float.

    The critic must produce exactly one element for ``X_input``, because the
    value is extracted with ``.item()``.
    """
    with torch.no_grad():
        return critic(X_input).item()

### Pretraining Data Preprocessing

In [None]:
# Load the raw pretraining samples; the last column is used as the label below.
pretrain_data = pd.read_csv('Data/Model3/pretraining_data_raw.csv')

In [None]:
# Drop rows that are exact duplicates of an earlier row.
pretrain_data = pretrain_data.drop_duplicates()

In [None]:
# Features are every column except the last; the last column holds the label.
# Take an explicit copy of the features: X_pt is modified in place further
# down (temporary 'sum_crew' column, normalised crew columns), and doing that
# on a slice of pretrain_data triggers pandas' chained-assignment warning.
X_pt = pretrain_data.iloc[:, :-1].copy()
y_pt = pretrain_data.iloc[:, -1]

In [None]:
# Display the feature frame for inspection.
X_pt

In [None]:
# n_components = 200
# pca = PCA(n_components=n_components)
# pca.fit(X_pt)
# X_pt = pca.transform(X_pt)
# X_pt

In [None]:
# X_pt = pd.DataFrame(X_pt, columns=[f'PC{i+1}' for i in range(X_pt.shape[1])])

In [None]:
# Display the raw label column for inspection.
y_pt

In [None]:
# X_pt = X_pt.drop(columns=['crew_stay'])

In [None]:
# Rescale the four neighbour crew values so that they sum to 1 in each row.
crew_columns = ['crew_up', 'crew_down', 'crew_left', 'crew_right']
X_pt['sum_crew'] = X_pt[crew_columns].sum(axis=1)

# Rows whose crew values sum to 0 are left unchanged (avoids dividing by zero).
for col in crew_columns:
    X_pt.loc[X_pt['sum_crew'] != 0, col] = X_pt[col] / X_pt['sum_crew']

# 'sum_crew' was only a temporary helper column.
X_pt.drop('sum_crew', axis=1, inplace=True)

X_pt

In [None]:
# Labels are stored as strings holding a Python literal; parse each one, then
# use the position of its first 1 as the class index.
# NOTE(review): presumably each label is a one-hot list of length 5 — confirm
# against the CSV.
y_pt = y_pt.apply(ast.literal_eval)
y_pt = y_pt.apply(lambda x: x.index(1))
y_pt

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_pt_train, X_pt_test, y_pt_train, y_pt_test = train_test_split(X_pt, y_pt, test_size = 0.2, random_state = 42)

In [None]:
# Convert both splits to tensors: float32 features and int64 (long) class indices.
X_pt_train_tensor = torch.tensor(X_pt_train.values, dtype=torch.float32)
y_pt_train_tensor = torch.tensor(y_pt_train.values, dtype=torch.long)

X_pt_test_tensor = torch.tensor(X_pt_test.values, dtype=torch.float32)
y_pt_test_tensor = torch.tensor(y_pt_test.values, dtype=torch.long)

In [None]:
# Display the training feature tensor for inspection.
X_pt_train_tensor

In [None]:
# Display the training label tensor for inspection.
y_pt_train_tensor

### Initialize + Pretrain ACTOR

In [None]:
# Size the actor's input layer from the number of feature columns and attach an Adam optimizer.
num_features = X_pt_train.shape[1]
actor = Actor(num_features)
optimizer = torch.optim.Adam(actor.parameters(), lr = 0.001) # Set learning rate to 0.001

In [None]:
# actor_rand = Actor(num_features)
# optimizer_rand = torch.optim.Adam(actor_rand.parameters(), lr = 0.001)

In [None]:
# Pretrain the actor on the labelled data for 1200 full-batch epochs.
loss_list = train_actor(actor, optimizer, X_pt_train_tensor, y_pt_train_tensor, epochs=1200)

In [None]:
# Plot the pretraining loss curve.
plot_loss(loss_list)

In [None]:
# Report train/test accuracy of the pretrained actor.
evaluate_actor(actor, X_pt_train_tensor, y_pt_train_tensor, X_pt_test_tensor, y_pt_test_tensor)

### Generate Initial ACTOR Data for CRITIC Training

In [None]:
import Bot1

In [None]:
# file_name = 'Data/Model3/actor_data.csv'

In [None]:
%run Bot1.ipynb

In [None]:
# create_grid is not defined in this notebook; presumably it comes from the
# Bot1 notebook executed above. The resulting grid and open_cells are read as
# globals by create_valid_matrix and Actor_Bot1.
grid, open_cells = create_grid() # Fixed grid orientation

In [None]:
def is_valid(x, y, move, grid, open_cells):
    """Tell whether ``move`` from cell ``(x, y)`` lands on an open cell.

    'stay' is always valid; any unrecognised move name is invalid. ``grid``
    is accepted but not consulted.

    NOTE(review): here 'up' means ``y + 1`` and 'down' means ``y - 1`` (as in
    predict_to_move), whereas determine_probabilities, determine_d_crew and
    determine_d_alien treat 'up' as ``y - 1``. Confirm which convention the
    training data follows.
    """
    if move == 'stay':
        return True

    steps = (
        ('up', (x, y + 1)),
        ('down', (x, y - 1)),
        ('left', (x - 1, y)),
        ('right', (x + 1, y)),
    )
    for name, target in steps:
        if move == name:
            return target in open_cells

    return False

In [None]:
def predict_to_move(bot, prediction):
    """Translate an action index into the cell the bot would move to.

    Index 0 adds 1 to y, 1 subtracts 1 from y, 2 subtracts 1 from x and
    3 adds 1 to x. Any other value leaves the bot where it is and returns
    ``bot`` itself.
    """
    offsets = {0: (0, 1), 1: (0, -1), 2: (-1, 0), 3: (1, 0)}
    step = offsets.get(prediction)

    if step is None:
        return bot

    dx, dy = step
    return (bot[0] + dx, bot[1] + dy)

In [None]:
def create_valid_matrix(X):
    """Build a boolean array of move validity, one row per row of ``X``.

    The first two columns of ``X`` are read as the bot's x and y. Each output
    row holds the validity of up, down, left, right and stay, in that order,
    checked against the module-level ``grid`` and ``open_cells``.
    """
    moves = ('up', 'down', 'left', 'right', 'stay')

    def validity_row(i):
        x, y = X.iloc[i, 0], X.iloc[i, 1]
        return [is_valid(x, y, move, grid, open_cells) for move in moves]

    return np.array([validity_row(i) for i in range(len(X))])

In [None]:
def determine_probabilities(bot, matrix):
    """Look up ``matrix`` at the bot's four neighbours and at its own cell.

    Returns the values in the order up, down, left, right, stay; a cell
    missing from ``matrix`` contributes 0.

    NOTE(review): 'up' is ``y - 1`` here, while is_valid and predict_to_move
    use ``y + 1`` for 'up'. Confirm the intended convention.
    """
    x, y = bot[0], bot[1]
    cells = ((x, y - 1), (x, y + 1), (x - 1, y), (x + 1, y), bot)
    return [matrix.get(cell, 0) for cell in cells]

In [None]:
def determine_d_crew(ship, bot, alpha, d_lookup_table, crew_list, crew_matrix, open_cells):
    """Inverse distance from each candidate move to the most likely crew cell.

    The target is the highest-valued key of ``crew_matrix`` that is neither
    the bot's cell nor one of its four neighbours. For every move (up, down,
    left, right, stay) whose destination is open (or is the bot's own cell)
    and holds no crew member, ``crew_sensor`` is called to refresh
    ``d_lookup_table`` and ``1 / distance`` to the target is recorded. Every
    other move gets 0.

    Returns:
        tuple: ``(d_list, d_lookup_table)``. ``d_list`` is ``[0] * 5`` when no
        target cell exists.
    """
    x, y = bot[0], bot[1]
    # Order: up, down, left, right, stay ('up' is y - 1 in this function)
    destinations = [(x, y - 1), (x, y + 1), (x - 1, y), (x + 1, y), bot]
    nearby = set(destinations)

    distant_crew = {cell: p for cell, p in crew_matrix.items() if cell not in nearby}

    if not distant_crew:
        return [0] * 5, d_lookup_table

    target = max(distant_crew, key=distant_crew.get)

    d_list = []

    for cell in destinations:
        reachable = cell in open_cells or cell == bot
        if not reachable or cell in crew_list:
            d_list.append(0)
            continue

        # crew_sensor is used for the lookup-table update it returns; the
        # sensor reading itself is discarded
        _, d_lookup_table = crew_sensor(ship, cell, alpha, d_lookup_table, crew_list)
        distances = d_lookup_table.get(cell)
        d_list.append(1 / distances[target[0], target[1]])

    return d_list, d_lookup_table

In [None]:
def determine_d_alien(ship, bot, alpha, d_lookup_table, alien_list, crew_list, alien_matrix, open_cells):
    """Inverse distance from each candidate move to the most likely alien cell.

    The target is the highest-valued key of ``alien_matrix`` that is neither
    the bot's cell nor one of its four neighbours. For every move (up, down,
    left, right, stay) whose destination is open (or is the bot's own cell)
    and holds neither a crew member nor an alien, ``crew_sensor`` is called to
    refresh ``d_lookup_table`` and ``1 / distance`` to the target is recorded.
    Every other move gets 0.

    Returns:
        tuple: ``(d_list, d_lookup_table)``. ``d_list`` is ``[0] * 5`` when no
        target cell exists.
    """
    x, y = bot[0], bot[1]
    # Order: up, down, left, right, stay ('up' is y - 1 in this function)
    destinations = [(x, y - 1), (x, y + 1), (x - 1, y), (x + 1, y), bot]
    nearby = set(destinations)

    distant_aliens = {cell: p for cell, p in alien_matrix.items() if cell not in nearby}

    if not distant_aliens:
        return [0] * 5, d_lookup_table

    target = max(distant_aliens, key=distant_aliens.get)

    d_list = []

    for cell in destinations:
        reachable = cell in open_cells or cell == bot
        if not reachable or cell in crew_list or cell in alien_list:
            d_list.append(0)
            continue

        # crew_sensor is used for the lookup-table update it returns; the
        # sensor reading itself is discarded
        _, d_lookup_table = crew_sensor(ship, cell, alpha, d_lookup_table, crew_list)
        distances = d_lookup_table.get(cell)
        d_list.append(1 / distances[target[0], target[1]])

    return d_list, d_lookup_table

In [None]:
def predict_with_params(actor, bot, alien_matrix, crew_matrix, d_crew, d_alien, alien_detected, crew_detected):
    """Assemble the 23-feature state row for the bot and return the actor's next cell.

    Column order: bot position, five alien values, four crew values, five
    inverse crew distances, five inverse alien distances, then the two sensor
    flags. The crew columns are rescaled to sum to 1 (when their sum is
    non-zero) before the row is passed to the actor.

    Returns:
        The cell chosen by ``predict_action``, converted with ``predict_to_move``.
    """
    alien_probs = determine_probabilities(bot, alien_matrix)
    crew_probs = determine_probabilities(bot, crew_matrix)

    moves = ['up', 'down', 'left', 'right', 'stay']

    # Insertion order of this dict fixes the column order seen by the network
    features = {'bot_x': bot[0], 'bot_y': bot[1]}
    for i, move in enumerate(moves):
        features[f'alien_{move}'] = alien_probs[i]
    for i, move in enumerate(moves[:4]):
        features[f'crew_{move}'] = crew_probs[i]
    for i, move in enumerate(moves):
        features[f'd_crew_{move}'] = np.float32(d_crew[i])
    for i, move in enumerate(moves):
        features[f'd_alien_{move}'] = np.float32(d_alien[i])
    features['alien_detected'] = 1 if alien_detected else 0
    features['crew_detected'] = 1 if crew_detected else 0

    X = pd.DataFrame([features])

    # Validity is derived from the bot position held in the first two columns
    valid = create_valid_matrix(X)

    # Normalise the crew columns by their row sum, skipping all-zero rows
    crew_columns = ['crew_up', 'crew_down', 'crew_left', 'crew_right']
    X['sum_crew'] = X[crew_columns].sum(axis=1)
    has_crew_mass = X['sum_crew'] != 0
    for col in crew_columns:
        X.loc[has_crew_mass, col] = X[col] / X['sum_crew']
    X.drop('sum_crew', axis=1, inplace=True)

    X_tensor = torch.tensor(X.values, dtype=torch.float32)

    prediction = predict_action(actor, X_tensor, valid)
    return predict_to_move(bot, prediction)

In [None]:
def Actor_Bot1(k, alpha, max_iter, timeout, actor, filename):
    """Run the actor-driven bot until ``max_iter`` outcomes are reached and log every step.

    An outcome is either a win (``win_count`` rises after ``move_bot``) or a
    loss (``marker == 1`` or ``move >= timeout``). At each step the bot's
    features and chosen action are appended to a per-episode frame. When the
    episode ends, its rows go into the full log, with ``successful`` set to 1
    for a win and left at 0 for a loss. The full log is written to
    ``filename`` at the end.

    The simulation helpers used here (reset_grid, place_bot, place_crew,
    place_alien, the sensors, the matrix updates, move_bot, move_aliens) are
    not defined in this file; presumably they come from the Bot1 notebook.

    Args:
        k: Passed to the alien placement, alien sensor and alien-matrix helpers.
        alpha: Passed to the crew sensor and crew-matrix update.
        max_iter: Number of wins plus losses after which the loop stops.
        timeout: Number of moves after which an episode is counted as a loss.
        actor: Model used by ``predict_with_params`` to choose each move.
        filename: CSV path the step log is written to (mode 'w', so any
            existing file is overwritten).

    Returns:
        tuple: ``(average moves per win (integer division), win fraction, win_count)``.

    NOTE(review): after a loss the world is reset and the loop continues, but
    ``alien_detected`` / ``crew_detected`` are not re-read, so the first step
    of the next episode uses the previous episode's sensor values.
    NOTE(review): because the CSV is opened with mode 'w', repeated calls with
    the same ``filename`` keep only the last call's data — confirm intended.
    """
    global grid, open_cells
    
    grid, open_cells = reset_grid(grid, open_cells)
    bot, ship, open_cells = place_bot(grid, open_cells)

    crew_list = []
    alien_list = []
    d_lookup_table = {}
    
    data_log = [] # Data Log Initialization (unused; rows are accumulated in cur_df / df instead)

    crew_list, ship = place_crew(ship, open_cells, crew_list)
    alien_list, ship = place_alien(ship, open_cells, alien_list, bot, k)

    alien_matrix = initialize_alienmatrix(open_cells, bot, k)
    crew_matrix = initialize_crewmatrix(open_cells, crew_list, bot)
    
    alien_detected = alien_sensor(alien_list, bot, k) # Initially Run Alien Sensor
    crew_detected, d_lookup_table = crew_sensor(ship, bot, alpha, d_lookup_table, crew_list) # Initially Run Crew Sensor
    
    win_count = 0
    loss_count = 0
    move = 0
    win_move_count = []
    marker = 0
    
    # df holds the rows of finished episodes; cur_df holds the episode in progress
    df = pd.DataFrame()
    cur_df = pd.DataFrame()
    
    while (win_count + loss_count) < max_iter:
        # neighbors / open_moves are computed but not used further in this function
        neighbors = check_valid_neighbors(len(ship), bot[0], bot[1])
        open_moves = [neigh for neigh in neighbors if (grid[neigh] != 1)]
        open_moves.append(bot)
        
        # Data Collection Process
        
        alien_probs = determine_probabilities(bot, alien_matrix)
        crew_probs = determine_probabilities(bot, crew_matrix)
        d_crew, d_lookup_table = determine_d_crew(ship, bot, alpha, d_lookup_table, crew_list, crew_matrix, open_cells) # Inverse distance from each candidate move to the highest-probability crew cell
        d_alien, d_lookup_table = determine_d_alien(ship, bot, alpha, d_lookup_table, alien_list, crew_list, alien_matrix, open_cells) # Inverse distance from each candidate move to the highest-probability alien cell
        
        next_move = predict_with_params(actor, bot, alien_matrix, crew_matrix, d_crew, d_alien, alien_detected, crew_detected) # Predict using trained network
        
        # Drop the trailing 'stay' entry so only the four neighbour values remain
        crew_probs.pop()
        
        # Normalise the neighbour values to sum to 1 (skipped when they are all zero)
        if sum(crew_probs) > 0:
            crew_probs = [c / sum(crew_probs) for c in crew_probs]
        
        # Convert relative move to string      
        # ('up' is a larger y here, matching predict_to_move)
        if next_move[0] > bot[0]:
            next_move_str = 'right'
        elif next_move[0] < bot[0]:
            next_move_str = 'left'
        elif next_move[1] > bot[1]:
            next_move_str = 'up'
        elif next_move[1] < bot[1]:
            next_move_str = 'down'
        else:
            next_move_str = 'stay'
        
        actions = {'up': 0, 'down': 1, 'left': 2, 'right': 3, 'stay': 4}
        best_move_encoded = actions[next_move_str]
        
        # One logged row: 23 state features, the chosen action, and the outcome flag
        log_entry = {
            'bot_x': bot[0],
            'bot_y': bot[1],
            
            'alien_up': alien_probs[0],
            'alien_down': alien_probs[1],
            'alien_left': alien_probs[2],
            'alien_right': alien_probs[3],
            'alien_stay': alien_probs[4],
            
            'crew_up': crew_probs[0],
            'crew_down': crew_probs[1],
            'crew_left': crew_probs[2],
            'crew_right': crew_probs[3],
            
            'd_crew_up': np.float32(d_crew[0]),
            'd_crew_down': np.float32(d_crew[1]),
            'd_crew_left': np.float32(d_crew[2]),
            'd_crew_right': np.float32(d_crew[3]),
            'd_crew_stay': np.float32(d_crew[4]),
            
            'd_alien_up': np.float32(d_alien[0]),
            'd_alien_down': np.float32(d_alien[1]),
            'd_alien_left': np.float32(d_alien[2]),
            'd_alien_right': np.float32(d_alien[3]),
            'd_alien_stay': np.float32(d_alien[4]),
            
            'alien_detected': 1 if alien_detected else 0,
            'crew_detected': 1 if crew_detected else 0,
            
            'chosen_action': best_move_encoded,
            
            # Defaults to 0; set to 1 for the whole episode if it ends in a win
            'successful': 0
        }
#         data_log.append(log_entry)

        cur_df = pd.concat([cur_df, pd.DataFrame([log_entry])], ignore_index=True)

#         log_entry = {
#             'bot_x': bot[0],
#             'bot_y': bot[1],
            
#             'alien_up': alien_probs[0],
#             'alien_down': alien_probs[1],
#             'alien_left': alien_probs[2],
#             'alien_right': alien_probs[3],
#             'alien_stay': alien_probs[4],
            
#             'crew_up': crew_probs[0],
#             'crew_down': crew_probs[1],
#             'crew_left': crew_probs[2],
#             'crew_right': crew_probs[3],
            
#             'alien_detected': 1 if alien_detected else 0,
#             'crew_detected': 1 if crew_detected else 0,
            
#             'chosen_action': best_move_encoded
#         }
#         data_log.append(log_entry)
        
        prev_win_count = win_count
        bot, crew_list, ship, open_cells, win_count, marker = move_bot(ship, bot, next_move, crew_list, alien_list, open_cells, win_count, 1)
        move += 1

        # Loss: move_bot flagged the bot (marker == 1) or the move budget ran out
        if marker == 1 or move >= timeout:
            loss_count += 1
            print(f"ACTOR captured! Win Count: {win_count}, Loss Count: {loss_count}")
            
            # Archive the episode (successful stays 0) and empty the episode frame
            df = pd.concat([df, cur_df], ignore_index=True)
            cur_df.drop(cur_df.index, axis=0, inplace=True)
            cur_df.drop(cur_df.columns, axis=1, inplace=True)
            
            # Start a fresh episode
            # NOTE(review): the sensors are not re-run here, so alien_detected /
            # crew_detected still hold the previous episode's readings.
            grid, open_cells = reset_grid(grid, open_cells)
            bot, ship, open_cells = place_bot(grid, open_cells)
            crew_list = []
            alien_list = []
            d_lookup_table = {}

            crew_list, ship = place_crew(ship, open_cells, crew_list)
            alien_list, ship = place_alien(ship, open_cells, alien_list, bot, k)

            alien_matrix = initialize_alienmatrix(open_cells, bot, k)
            crew_matrix = initialize_crewmatrix(open_cells, crew_list, bot)
            marker = 0
            move = 0

            continue

        # Win: move_bot incremented win_count
        if win_count > prev_win_count:
            print(f"Crew saved! Win Count: {win_count}, Loss Count: {loss_count}")
            
            # Mark every step of this episode as successful before archiving it
            cur_df['successful'] = 1
            df = pd.concat([df, cur_df], ignore_index=True)
            cur_df.drop(cur_df.index, axis=0, inplace=True)
            cur_df.drop(cur_df.columns, axis=1, inplace=True)
            
            win_move_count.append(move)
            move = 0
            d_lookup_table = {}
            alien_matrix = initialize_alienmatrix(open_cells, bot, k)
            crew_matrix = initialize_crewmatrix(open_cells, crew_list, bot)
        
        print(f"ACTOR: {bot}, Crew: {crew_list}, Aliens: {alien_list}")

        alien_matrix, crew_matrix = update_afterbotmove(bot, alien_matrix, crew_matrix)

        # Alien turn
        marker, alien_list, ship = move_aliens(ship, alien_list, bot) # Move alien randomly

        # Loss after the aliens moved (same condition as after the bot's move)
        if marker == 1 or move >= timeout:
            loss_count += 1
            print(f"ACTOR captured! Win Count: {win_count}, Loss Count: {loss_count}")

            df = pd.concat([df, cur_df], ignore_index=True)
            cur_df.drop(cur_df.index, axis=0, inplace=True)
            cur_df.drop(cur_df.columns, axis=1, inplace=True)
            
            # Start a fresh episode (same reset sequence, and same stale-sensor
            # caveat, as in the loss branch above)
            grid, open_cells = reset_grid(grid, open_cells)
            bot, ship, open_cells = place_bot(grid, open_cells)
            crew_list = []
            alien_list = []
            d_lookup_table = {}

            crew_list, ship = place_crew(ship, open_cells, crew_list)
            alien_list, ship = place_alien(ship, open_cells, alien_list, bot, k)

            alien_matrix = initialize_alienmatrix(open_cells, bot, k)
            crew_matrix = initialize_crewmatrix(open_cells, crew_list, bot)
            marker = 0
            move = 0

            continue
        
        alien_matrix = update_afteralienmove(ship, alien_list, alien_matrix) # Update after alien move
        
        alien_detected = alien_sensor(alien_list, bot, k) # Run Alien Sensor
        crew_detected, d_lookup_table = crew_sensor(ship, bot, alpha, d_lookup_table, crew_list) # Run Crew Sensor
        
        alien_matrix = update_alienmatrix(alien_matrix, alien_detected, bot, k) # Update based on alien sensor

        crew_matrix = update_crewmatrix(crew_matrix, crew_detected, d_lookup_table, bot, alpha) # Update based on crew sensor
    
    # Keep any rows of an episode that was still in progress when the loop ended
    df = pd.concat([df, cur_df], ignore_index=True)
        
#     df = pd.DataFrame(data_log)
    # mode='w' replaces whatever was previously stored at filename
    df.to_csv(filename, mode='w', index=False, header=True)

    # max(1, ...) guards both ratios against division by zero when nothing was won / played
    return sum(win_move_count) // max(1, len(win_move_count)), (win_count / max(1, (win_count + loss_count))), win_count

In [None]:
def Actor_Bot1_Simulation(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename):
    """Average the three Actor_Bot1 metrics over repeated runs for every (k, alpha) pair.

    Returns:
        tuple: three dicts keyed by k (average rescue moves, probability of
        crew rescue, average crew saved). Each maps to a list holding one
        averaged value per alpha, in ``alpha_values`` order.

    NOTE(review): every run passes the same ``filename`` to Actor_Bot1, which
    writes it with mode 'w', so only the final run's log survives — confirm
    this is intended.
    """
    avg_rescue_moves_mbot1 = {k: [] for k in k_values}
    prob_crew_rescue_mbot1 = {k: [] for k in k_values}
    avg_crew_saved_mbot1 = {k: [] for k in k_values}

    for k in k_values:
        for alpha in alpha_values:
            runs = []
            for _ in range(num_simulations):
                moves, rescue_prob, saved = Actor_Bot1(k, alpha, max_iter, timeout, actor, filename)
                runs.append((moves, rescue_prob, saved))

            avg_metric1_mbot1 = sum(run[0] for run in runs) / num_simulations
            avg_metric2_mbot1 = sum(run[1] for run in runs) / num_simulations
            avg_metric3_mbot1 = sum(run[2] for run in runs) / num_simulations

            print(f"ACTOR: k={k}, Alpha={alpha}\nAverage Rescue Moves={avg_metric1_mbot1}\nProbability of Crew Rescue={avg_metric2_mbot1}\nAverage Crew Saved={avg_metric3_mbot1}\n")

            avg_rescue_moves_mbot1[k].append(avg_metric1_mbot1)
            prob_crew_rescue_mbot1[k].append(avg_metric2_mbot1)
            avg_crew_saved_mbot1[k].append(avg_metric3_mbot1)

    return avg_rescue_moves_mbot1, prob_crew_rescue_mbot1, avg_crew_saved_mbot1

In [None]:
def test_simulation_actor(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename):
    """Run Actor_Bot1_Simulation, round the rescue probabilities to 3 places, print and return the metrics.

    Returns:
        tuple: ``(average rescue moves, rounded rescue probabilities, average
        crew saved)``, each a dict keyed by k.
    """
    results = Actor_Bot1_Simulation(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename)
    avg_rescue_moves_mbot1, prob_crew_rescue_mbot1, avg_crew_saved_mbot1 = results

    rounded_probs = {}
    for k, probs in prob_crew_rescue_mbot1.items():
        rounded_probs[k] = [round(prob, 3) for prob in probs]
    prob_crew_rescue_mbot1 = rounded_probs

    print(f"ACTOR:\nAverage Rescue Moves = {avg_rescue_moves_mbot1}\nProbability of Crew Rescue = {prob_crew_rescue_mbot1}\nAverage Crew Saved = {avg_crew_saved_mbot1}\n")

    return avg_rescue_moves_mbot1, prob_crew_rescue_mbot1, avg_crew_saved_mbot1

In [None]:
# Simulation settings for generating the initial actor data set.
alpha_values = [0.004]  # alpha values to sweep (passed to the crew sensor)
k_values = [3]          # k values to sweep (passed to the alien sensor / placement)
max_iter = 1            # wins + losses per Actor_Bot1 call
timeout = 10000         # moves after which an episode is counted as a loss
num_simulations = 10    # Actor_Bot1 calls averaged per (k, alpha) pair

In [None]:
# CSV that Actor_Bot1 writes its step log to; critic_loop reads the same path.
filename = 'Data/Model3/actor_data.csv'

In [None]:
# Run the pretrained actor in simulation to produce the first actor data set and its metrics.
metric1_mbot1, metric2_mbot1, metric3_mbot1 = test_simulation_actor(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename)

In [None]:
# Show average rescue moves, rescue probability and average crew saved (each a dict keyed by k).
print(metric1_mbot1, metric2_mbot1, metric3_mbot1)

### CRITIC Loop Functions

In [None]:
def preprocess_critic(filename):
    """Load the actor's step log and prepare train/test data for the critic.

    The last CSV column is the label; every other column (including the
    chosen action) is a feature. The split is 80/20 with a fixed seed.

    Returns:
        tuple: ``(X, y, X_train, y_train, X_train_tensor, y_train_tensor,
        X_test_tensor, y_test_tensor)``. Feature tensors are float32 and
        label tensors are long.
    """
    actor_data = pd.read_csv(filename)
    X, y = actor_data.iloc[:, :-1], actor_data.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    def as_tensor(frame, dtype):
        return torch.tensor(frame.values, dtype=dtype)

    X_train_tensor = as_tensor(X_train, torch.float32)
    y_train_tensor = as_tensor(y_train, torch.long)
    X_test_tensor = as_tensor(X_test, torch.float32)
    y_test_tensor = as_tensor(y_test, torch.long)

    return X, y, X_train, y_train, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor

In [None]:
def init_train_critic(X_train_tensor, y_train_tensor, critic, optimizer):
    """Train the critic for 500 epochs and return the per-epoch loss list."""
    return train_critic(critic, optimizer, X_train_tensor, y_train_tensor, epochs=500)

In [None]:
def generate_critic_dataset(critic, X):
    """Relabel every logged state with the valid action the critic rates highest.

    For each row, every valid move is substituted as ``chosen_action`` and
    scored by the critic; the move with the highest predicted success
    probability becomes the row's new label. The result is also written to
    'Data/Model3/critic_data.csv'.

    Args:
        critic: Model scoring a (state features + chosen_action) row.
        X: DataFrame whose last column is the logged action and whose first
            two columns are the bot's x and y (read by create_valid_matrix).

    Returns:
        pandas.DataFrame: the state features plus a ``chosen_action`` column
        holding the critic's preferred action.
    """
    valid = create_valid_matrix(X)
    X_critic = []
    X_actor = X.iloc[:, :-1]  # state features without the logged action

    # `valid` is a positional array, so it is indexed with the enumerate
    # counter rather than the DataFrame's index label; the two differ
    # whenever X does not carry a default RangeIndex.
    for position, (_, row) in enumerate(X_actor.iterrows()):
        # Start below any achievable probability so an action is always
        # selected, even if the critic outputs exactly 0.0 for every move.
        best_action, highest_prob = None, float('-inf')

        for move, move_is_valid in enumerate(valid[position]):
            if not move_is_valid:
                continue

            candidate = row.copy()
            candidate['chosen_action'] = move
            candidate_tensor = torch.tensor(np.array([candidate.values]), dtype=torch.float32)
            prob_success = predict_prob_success(critic, candidate_tensor)

            if prob_success > highest_prob:
                highest_prob, best_action = prob_success, move

        X_critic.append(list(row.values) + [best_action])

    critic_data = pd.DataFrame(X_critic, columns = list(X_actor.columns) + ['chosen_action'])
    critic_data.to_csv('Data/Model3/critic_data.csv', index=False)

    return critic_data

In [None]:
def critic_loop(critic, critic_optimizer):
    """One critic phase: train on the actor's log, plot the loss, and relabel the data.

    Returns:
        The DataFrame produced by ``generate_critic_dataset``.
    """
    actor_filename = 'Data/Model3/actor_data.csv' # Use data generated by ACTOR

    X, _, _, _, X_train_tensor, y_train_tensor, _, _ = preprocess_critic(actor_filename)

    plot_loss(init_train_critic(X_train_tensor, y_train_tensor, critic, critic_optimizer))

    return generate_critic_dataset(critic, X)

In [None]:
# critic = Critic(24)
# critic_optimizer = torch.optim.Adam(critic.parameters(), lr = 0.001)

# critic_data = critic_loop(critic, critic_optimizer)
# critic_data

### ACTOR Loop Functions

In [None]:
def preprocess_actor(filename):
    """Load the critic-labelled data and prepare train/test data for the actor.

    The last CSV column is the label; every other column is a feature. The
    split is 80/20 with a fixed seed.

    Returns:
        tuple: ``(X, y, X_train, y_train, X_train_tensor, y_train_tensor,
        X_test_tensor, y_test_tensor)``. Feature tensors are float32 and
        label tensors are long.
    """
    critic_data = pd.read_csv(filename)
    X, y = critic_data.iloc[:, :-1], critic_data.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    def as_tensor(frame, dtype):
        return torch.tensor(frame.values, dtype=dtype)

    X_train_tensor = as_tensor(X_train, torch.float32)
    y_train_tensor = as_tensor(y_train, torch.long)
    X_test_tensor = as_tensor(X_test, torch.float32)
    y_test_tensor = as_tensor(y_test, torch.long)

    return X, y, X_train, y_train, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor

In [None]:
def init_train_actor(X_train_tensor, y_train_tensor, actor, optimizer):
    """Train the actor for 1000 epochs and return the per-epoch loss list."""
    return train_actor(actor, optimizer, X_train_tensor, y_train_tensor, epochs=1000)

In [None]:
def update_actor_dataset(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename):
    """Re-run the actor simulation (which rewrites ``filename``) and return its three metric dicts."""
    return test_simulation_actor(alpha_values, k_values, max_iter, timeout, num_simulations, actor, filename)

In [None]:
def actor_loop(actor, actor_optimizer):
    """One actor phase: train on the critic-labelled data, plot the loss, then regenerate the actor log.

    Returns:
        tuple: the three metric dicts returned by ``update_actor_dataset``.
    """
    critic_filename = 'Data/Model3/critic_data.csv'
    actor_filename = 'Data/Model3/actor_data.csv'

    _, _, _, _, X_train_tensor, y_train_tensor, _, _ = preprocess_actor(critic_filename)

    plot_loss(init_train_actor(X_train_tensor, y_train_tensor, actor, actor_optimizer))

    # Fixed simulation settings for the data-regeneration run
    alpha_values = [0.004]
    k_values = [3]
    max_iter = 1
    timeout = 10000
    num_simulations = 10

    return update_actor_dataset(alpha_values, k_values, max_iter, timeout, num_simulations, actor, actor_filename)

In [None]:
# actor = Actor(23)
# actor_optimizer = torch.optim.Adam(actor.parameters(), lr = 0.001)

# metric1_mbot1, metric2_mbot1, metric3_mbot1 = actor_loop(actor, actor_optimizer)
# metric1_mbot1, metric2_mbot1, metric3_mbot1

### Reinforcement Learning

In [None]:
def plot_metrics(metric1, metric2, metric3, num_iter):
    """Plot the three per-iteration metrics, save the figure with a timestamped name, and show it.

    Args:
        metric1: Average rescue moves, one value per iteration.
        metric2: Probability of crew rescue, one value per iteration.
        metric3: Average crew saved, one value per iteration.
        num_iter: Number of iterations; each metric must have this many values.
    """
    # Imported locally because this file's import cell does not import
    # datetime (or os). The function should not depend on another notebook
    # having put them into the global namespace.
    import datetime
    import os

    plt.figure(figsize=(10, 6))
    plt.plot(range(num_iter), metric1, label='Average Rescue Moves')
    plt.plot(range(num_iter), metric2, label='Probability of Crew Rescue')
    plt.plot(range(num_iter), metric3, label='Average Crew Saved')
    plt.xlabel('Iteration')
    plt.ylabel('Value')
    plt.title('Reinforcement Learning Metrics')
    plt.legend()

    # savefig does not create missing directories, so create the target first
    plots_dir = 'Data/Model3/Plots'
    os.makedirs(plots_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = f'{plots_dir}/plot_{timestamp}.png'

    plt.savefig(filename)

    plt.show()

In [None]:
def reinforcement_learning(num_iter, num_features_critic, num_features_actor):
    """Alternate critic and actor phases for ``num_iter`` rounds and plot the metrics.

    Each round trains the critic on the latest actor log, relabels that log,
    trains the actor on the relabelled data and regenerates the log.

    NOTE(review): a fresh Actor is created here, so weights of any previously
    trained actor are not reused. The metrics are read with key 3, which
    matches the ``k_values = [3]`` hard-coded in actor_loop.

    Returns:
        tuple: ``(actor, critic)`` after the final round.
    """
    critic = Critic(num_features_critic)
    critic_optimizer = torch.optim.Adam(critic.parameters(), lr = 0.001)

    actor = Actor(num_features_actor)
    actor_optimizer = torch.optim.Adam(actor.parameters(), lr = 0.001)

    metric1, metric2, metric3 = [], [], []

    for _ in range(num_iter):
        critic_loop(critic, critic_optimizer)
        rescue_moves, rescue_prob, crew_saved = actor_loop(actor, actor_optimizer)

        # Keep the first (only) alpha's value for k = 3 from each metric dict
        for history, by_k in ((metric1, rescue_moves), (metric2, rescue_prob), (metric3, crew_saved)):
            history.append(by_k.get(3)[0])

        print(metric1, metric2, metric3)

    plot_metrics(metric1, metric2, metric3, num_iter)

    return actor, critic

In [None]:
# Run 10 critic/actor rounds. The critic takes 24 inputs (23 state features
# plus the chosen action) and the actor takes the 23 state features. This
# rebinds the global `actor`, replacing the pretrained one.
actor, critic = reinforcement_learning(10, 24, 23)