## Milling Tool Wear Maintenance Policy using the REINFORCE algorithm

- Ver. 4.1: 01-May-2023 - Improved plots
- Ver. 4.5: 01-May-2023 - Add Stable-Baselines algo.
- Ver. 4.6: 02-May-2023 - Experiment: RF vs SB-3 PPO, with rewards scaled by 1/10^6
- Ver. 4.8: 06-May-2023 - Auto run SB3 algos.
- Ver. 5.0: 06-May-2023 - Re-factor. Functions conversions to enable test runs.
- Ver. 5.1: 07-May-2023 - PPO behaviour. For SB-3 models, corrected test-set reshuffling and retraining of models every time
- Ver. 5.2: 07-May-2023 - Test cases: sample of 40 (constant), drawn _without_ replacement

In [22]:
import numpy as np
import pandas as pd

import milling_tool_environment
import utilities
from milling_tool_environment import MillingTool_V2
from utilities import store_results, plot_learning_curve, single_axes_plot, two_axes_plot, two_variable_plot, plot_error_bounds, test_script, write_test_results
from reinforce_classes import PolicyNetwork, Agent

import datetime
# Capture the notebook start time once; the date and time strings derived from
# it are embedded in the output file names built in the configuration cell.
dt = datetime.datetime.now()
dt_d, dt_t = (dt.strftime(fmt) for fmt in ('%d-%b-%Y', '%H_%M_%S'))

In [23]:
import importlib
# Re-execute the project-local modules so that on-disk edits are picked up
# without restarting the kernel.
importlib.reload(milling_tool_environment)
importlib.reload(utilities)

# importlib.reload() only updates the module objects themselves; names that
# were bound earlier with `from <module> import <name>` keep pointing at the
# pre-reload objects. Re-import them so the rest of the notebook actually uses
# the reloaded class and helper functions.
from milling_tool_environment import MillingTool_V2
from utilities import store_results, plot_learning_curve, single_axes_plot, two_axes_plot, two_variable_plot, plot_error_bounds, test_script, write_test_results

<module 'utilities' from 'D:\\Rajesh\\ResearchLab\\LG_\\utilities.py'>

In [24]:
# --- Run identification ---
VERSION = '5_2_1'
TRAIN_ROUNDS = 40 # Suggested 40

# --- Milling operation constants ---
WEAR_THRESHOLD = 3.0 # mm
WEAR_THRESHOLD_NORMALIZED = 1.0 # initial value; recomputed from the data once the CSV is loaded
MILLING_OPERATIONS_MAX = 300 # Suggested 300
ADD_NOISE = 0.0
BREAKDOWN_CHANCE = 0.0

# --- Policy network learning parameters ---
gamma = 0.99
alpha = 0.01
EPISODES = 600 # Train for N episodes. # Suggested 600
TEST_CASES = 40

# --- Input / output locations ---
# Every artefact is written under RESULTS_DIR; per-run files carry the
# date/time stamp captured at notebook start-up.
DATA_FILE = 'Tool_Wear_VB.csv'
RESULTS_DIR = 'results_plots'
RUN_STAMP = f'{dt_d}_{dt_t}'
PREFIX = f'{RESULTS_DIR}/V_{VERSION}_Tool_Wear_{RUN_STAMP}'
RESULTS_FILE = f'{RESULTS_DIR}/V_{VERSION}_test_results.csv'
RF_TRAINING_FILE = f'{RESULTS_DIR}/V_{VERSION}_RF_training_{RUN_STAMP}.csv'
RF_TRAINING_FILE

'results_plots/V_5_2_1_RF_training_08-May-2023_12_26_51.csv'

In [25]:
## Read data
# Keep only the three columns used in this notebook.
df = pd.read_csv(DATA_FILE)[['time', 'VB_mm', 'ACTION_CODE']]
n_records = len(df.index)

# Min-max scaling. The wear threshold is mapped with the same min/max as the
# VB_mm column so that it stays comparable with the normalized wear values.
wear_column = df['VB_mm']
WEAR_MIN, WEAR_MAX = wear_column.min(), wear_column.max()
WEAR_THRESHOLD_NORMALIZED = (WEAR_THRESHOLD-WEAR_MIN)/(WEAR_MAX-WEAR_MIN)

column_min = df.min()
column_span = df.max() - column_min
df_normalized = (df - column_min) / column_span
# The action code must not be rescaled: restore the raw column.
df_normalized['ACTION_CODE'] = df['ACTION_CODE']
print(f'Tool wear data imported ({n_records} records). WEAR_THRESHOLD_NORMALIZED: {WEAR_THRESHOLD_NORMALIZED:4.3f} \n\n')

## Visualize the data
# Plain Python lists of the raw (un-normalized) columns for plotting.
x, y1, y2 = (df[column].values.tolist() for column in ('time', 'VB_mm', 'ACTION_CODE'))

# two_axes_plot(x, y1, y2, title='Tool Wear (mm) data', x_label='Time', y1_label='Tool Wear (mm)', y2_label='Action code (1=Replace)', xticks=10,threshold=WEAR_THRESHOLD)

Tool wear data imported (121 records). WEAR_THRESHOLD_NORMALIZED: 0.168 




## REINFORCE RL Algorithm

In [None]:
# Each round trains a brand-new REINFORCE agent on a brand-new environment,
# stores the training curves, then evaluates the agent on a random test sample.
for training_round in range(TRAIN_ROUNDS):
    print('-'*60, f'\n ==== ROUND: {training_round} ====')

    # Per-round bookkeeping (reset every round)
    rewards_history, loss_history, training_stats = [], [], []

    env = MillingTool_V2(df_normalized, WEAR_THRESHOLD_NORMALIZED, MILLING_OPERATIONS_MAX, ADD_NOISE, BREAKDOWN_CHANCE)

    # Network dimensions are taken from the environment's spaces
    input_dim, output_dim = env.observation_space.shape[0], env.action_space.n
    agent_RF = Agent(input_dim, output_dim, alpha, gamma)

    for episode in range(EPISODES):
        state = env.reset()

        # Roll out one trajectory, capped at MILLING_OPERATIONS_MAX steps
        for t in range(MILLING_OPERATIONS_MAX):
            action = agent_RF.act(state)
            state, reward, done, info = env.step(action)
            agent_RF.rewards.append(reward)
            if done:
                break

        # One policy update per episode, from the rewards gathered above
        loss = agent_RF.learn()
        total_reward = sum(agent_RF.rewards)

        # Episode statistics; .item() extracts the plain number from the loss
        rewards_history.append(total_reward)
        loss_history.append(loss.item())

        # On-policy algorithm: the episode's data is discarded after the update
        agent_RF.onpolicy_reset()

    # Persist this round's training curves
    eps = list(range(EPISODES))
    store_results(RF_TRAINING_FILE, training_round, eps, rewards_history, env.ep_tool_replaced_history)

    # Draw TEST_CASES distinct indices (no replacement), in ascending order
    test_cases = np.sort(np.random.choice(env.df_length-1, TEST_CASES, replace=False))

    results = test_script(training_round, df_normalized, 'REINFORCE', EPISODES, env, agent_RF, test_cases, DATA_FILE, RESULTS_FILE)
    write_test_results(results, RESULTS_FILE)

------------------------------------------------------------ 
 ==== ROUND: 0 ====
REINFORCE algorithm results saved to results_plots/V_5_2_1_RF_training_08-May-2023_12_26_51.csv
--------------------------------------------------
Algo.	Normal cases	Error %	Replace cases	Error %	Overall  error%
REINFORCE		24	0.125		16	0.062	0.100
--------------------------------------------------
- Test results written to file: results_plots/V_5_2_1_test_results.csv
------------------------------------------------------------ 
 ==== ROUND: 1 ====
REINFORCE algorithm results saved to results_plots/V_5_2_1_RF_training_08-May-2023_12_26_51.csv
--------------------------------------------------
Algo.	Normal cases	Error %	Replace cases	Error %	Overall  error%
REINFORCE		25	0.080		15	0.067	0.075
--------------------------------------------------
- Test results written to file: results_plots/V_5_2_1_test_results.csv
------------------------------------------------------------ 
 ==== ROUND: 2 ====
REINFORCE algo

In [None]:
# NOTE: this cell plots `rewards_history` and `env` as left behind by the
# training cell, i.e. the data of the LAST training round only.
x = list(range(EPISODES))

## Moving average for rewards
ma_window_size = 10
rewards = pd.Series(rewards_history)
moving_avg = rewards.rolling(ma_window_size).mean()
moving_avg_lst = moving_avg.tolist()
y1, y2 = rewards, moving_avg_lst

# Per-episode reward together with its moving average
filename = f'{PREFIX}_Avg_episode_rewards.png'
two_variable_plot(x, y1, y2, 'Avg. rewards per episode', 'Episode', 'Avg. Rewards', 'Moving Avg.', 10, filename)

plot_error_bounds(x, y1)

# Number of milling operations in each episode
filename = f'{PREFIX}_Episode_Length.png'
single_axes_plot(x, env.ep_length_history, 'Episode length', 'Episode', 'No of milling operations', 10, 0.0, filename)

# Number of tool replacements in each episode
filename = f'{PREFIX}_Tool_Replacements.png'
single_axes_plot(x, env.ep_tool_replaced_history, 'Tool replacements per episode', 'Episode', 'Replacements', 10, 0.0, filename)

## Stable-Baselines Algorithms

In [None]:
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import A2C, PPO, DQN

In [None]:
# Train and test each Stable-Baselines-3 algorithm for TRAIN_ROUNDS independent
# rounds, mirroring the REINFORCE experiment.
algos = ['A2C','DQN','PPO']
# Algorithm name -> SB3 model class
SB_ALGO_CLASSES = {'A2C': A2C, 'DQN': DQN, 'PPO': PPO}

for SB_ALGO in algos:
    for training_round in range(TRAIN_ROUNDS):
        # A fresh environment AND a fresh model per round. Previously the model
        # was built once per algorithm, before this loop, on whatever `env` an
        # earlier cell had left in the kernel: the environment created here was
        # never trained on, and training accumulated across rounds instead of
        # restarting (unlike the REINFORCE rounds, which rebuild the agent).
        env = MillingTool_V2(df_normalized, WEAR_THRESHOLD_NORMALIZED, MILLING_OPERATIONS_MAX, ADD_NOISE, BREAKDOWN_CHANCE)
        agent_SB = SB_ALGO_CLASSES[SB_ALGO.upper()]('MlpPolicy', env)

        print(f'\n\n{SB_ALGO} - Round {training_round}: Training and Testing Stable-Baselines-3 {SB_ALGO} algorithm')
        # NOTE(review): SB3's `total_timesteps` counts environment steps, not
        # episodes, so this budget is not the same as EPISODES episodes of
        # REINFORCE training - confirm that this is the intended comparison.
        agent_SB.learn(total_timesteps=EPISODES)

        # Draw TEST_CASES distinct indices (no replacement), in ascending order
        test_cases = np.random.choice(env.df_length-1, TEST_CASES, replace=False)
        test_cases = np.sort(test_cases)
        results = test_script(training_round, df_normalized, SB_ALGO, EPISODES, env, agent_SB, test_cases, DATA_FILE, RESULTS_FILE)
        write_test_results(results, RESULTS_FILE)

print('\n\n ================= END OF PROGRAM =================')

In [None]:
%%time
# Scratch loop timed by the cell magic above: squares 0..99 and prints the
# last square computed.
for n in range(100):
    k = n ** 2

print(k)