## Reinforcement Learning for Predictive Maintenance

- Author: Rajesh Siraskar
- Date: 09-Jan-2022

#### Elective Objectives:
1. Basics: RL
2. Python basics: numpy, pandas and matplotlib
3. Advanced: PyTorch, RL programming, TensorBoard
4. Understand OpenAI gym and create **custom env.** 
5. Put together an RL loop - non-functional/dummy
6. Show training curves in TensorBoard
7. **Propose ideas for a functional model**

#### Approach/Ideas:
1. Action: Is one of No-action, Maintenance-Regular, Maintenance-Special, Replace
2. Cost: Associate cost of taking these: Replace = 100, Special=40, Regular=20
3. After each maintenance: Increase RUL: Replace = x10, Special=x5, Regular=x2
4. Doing regular in downward trend of operation -> RUL increase only 10% regular
5. Reward: Most life, least cost so for example: R = RUL/(Cost+1) i.e. as RUL high and cost low -> reward MOST 
6. What one expects: Regular maintenance done in normal cycle
7. Optimal Policy = When to maintain with least cost

- Running TensorBoard: 
    - conda activate RL_OptimalControl
    - tensorboard --logdir E:\Projects\RL_for_Predictive_Maintenance\tensorboard\.

In [None]:
import gym
import json

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import A2C, PPO, DQN

#### Constants

In [None]:
# Training budget; assigned to EPISODES and passed to `model.learn` further down.
EPISODES_MAX = 4000

# Raw string: the path contains a literal backslash, and '\d' inside a normal
# string literal is an invalid escape sequence that newer Python versions warn
# about. The resulting value is unchanged.
# NOTE(review): the backslash separator is Windows-style - confirm whether the
# notebook must also run on other platforms.
BEARING_FAILURE_DATA_FILE = r'data\data.csv'
TRAINING_RESULTS_FILE = 'Test_run_A2C_09-Feb.csv'

## Environment: Custom class based on the OpenAI Gym abstract class (`gym.Env`)
### Bearing failure environment

In [None]:
import random
import math
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np

# --- Limits and tuning values used by the environment -----------------------
MAX_LIFE = 470            # divisor for the RUL feature; RUL value set by a replacement
INITIAL_RUL = MAX_LIFE
MAX_VIBRATION = 2         # divisor for the vibration feature
MAX_REWARD = 10           # upper end of the declared reward range
MAX_STEPS = 20000
RUL_WINDOW = 10
LAMBDA = 0.01             # added to the cost in the reward denominator

# --- Discrete action codes ---------------------------------------------------
(
    NO_ACTION,
    MAINTENANCE_REGULAR,
    MAINTENANCE_SPECIAL,
    MAINTENANCE_REPLACE,
) = range(4)

# --- Per-step logs, appended to by the environment on every step ------------
a_cost, a_action_recommended, a_actions, a_action_text = [], [], [], []
a_rul, a_rewards, a_events, a_time = [], [], [], []

class BearingEnv(gym.Env):
    """Bearing maintenance environment built on the OpenAI gym ``Env`` API.

    Each step applies the chosen maintenance action to the running RUL and
    maintenance cost, rewards the agent with ``RUL / (cost + LAMBDA)`` and
    then reads the next row of the supplied data frame (columns ``RUL``,
    ``vibration``, ``temperature`` and ``ACTION_CODE``).

    Per-step values are also appended to the module-level ``a_*`` lists so
    that a results table can be assembled after training.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        """Store the data frame and declare the action/observation spaces.

        Args:
            df: Data frame holding the ``RUL``, ``vibration``, ``temperature``
                and ``ACTION_CODE`` columns. Rows are addressed with ``.loc``
                using integer step numbers starting at 0, so a default
                0..N-1 index is assumed (TODO confirm against the data file).
        """
        super().__init__()

        self.df = df
        self.maintenance_cost = 0
        self.maintenance_events = 0
        self.rul = 0
        self.reward = 0
        self.current_action = NO_ACTION
        # Defined up front so the attribute exists even before reset().
        self.current_step = 0

        self.reward_range = (0, MAX_REWARD)

        # NOTE(review): these bounds are raw-unit limits, whereas
        # `_next_observation` emits scaled RUL/vibration and an unscaled
        # temperature - confirm the intended observation bounds.
        high = np.array(
            [
                MAX_LIFE,
                MAX_VIBRATION,
                MAINTENANCE_REPLACE,
            ],
            dtype=np.float32,
        )

        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        self.state = None

    def _next_observation(self):
        """Build the observation for ``self.current_step`` and refresh ``self.rul``.

        Returns:
            1-D array ``[RUL / MAX_LIFE, vibration / MAX_VIBRATION, temperature]``.
        """
        step = self.current_step
        rul = self.df.loc[step, 'RUL']

        # RUL and vibration are scaled by their maxima; temperature is passed
        # through as stored in the data frame.
        frame = np.array([
            rul / MAX_LIFE,
            self.df.loc[step, 'vibration'] / MAX_VIBRATION,
            self.df.loc[step, 'temperature'],
        ])

        # The running RUL is re-synchronised with the data set on every step.
        self.rul = rul

        return frame.flatten()

    def _take_action(self, action):
        """Apply ``action`` to the RUL/cost bookkeeping and compute the reward.

        Args:
            action: One of NO_ACTION, MAINTENANCE_REGULAR, MAINTENANCE_SPECIAL
                or MAINTENANCE_REPLACE.

        Raises:
            ValueError: If ``action`` is not one of the four known codes.
        """
        if action == NO_ACTION:
            # Normal state: RUL and cost are left untouched.
            action_text = 'None'

        elif action == MAINTENANCE_REGULAR:
            # 1% increase in life
            self.rul *= 1.01
            self.maintenance_cost += 1
            self.maintenance_events += 1
            action_text = 'Maintenance-Regular'

        elif action == MAINTENANCE_SPECIAL:
            # 5% increase in life
            self.rul *= 1.05
            self.maintenance_cost += 4
            self.maintenance_events += 1
            action_text = 'Maintenance-Special'

        elif action == MAINTENANCE_REPLACE:
            # Replacement restores the full life.
            self.rul = MAX_LIFE
            self.maintenance_cost += 20
            self.maintenance_events += 1
            action_text = '* REPLACE *'

        else:
            # Previously an unknown code fell through and failed later with an
            # UnboundLocalError on `action_text`.
            raise ValueError(f'Unknown action code: {action!r}')

        # Remember the action so render() reports what was actually done.
        self.current_action = action

        a_action_text.append(action_text)
        # LAMBDA keeps the denominator non-zero while the cost is still 0.
        self.reward = self.rul / (self.maintenance_cost + LAMBDA)

        print('{0:<20} | RUL: {1:>8.2f} | Cost: {2:>8.2f} | Maintenance events: {3:>3d} | Reward: {4:>12.3f}'.
              format(action_text, self.rul, self.maintenance_cost, self.maintenance_events, self.reward))

        self.state = (self.rul, self.maintenance_cost, action)

    def step(self, action):
        """Execute one time step within the environment.

        Args:
            action: Action code chosen by the agent.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``info`` is an empty dict.
        """
        self._take_action(action)
        reward = self.reward

        # Advance through the data unless the last row has been reached.
        data_exhausted = self.current_step >= len(self.df) - 1
        if not data_exhausted:
            self.current_step += 1

        # The episode ends when the data runs out OR the post-action RUL
        # leaves the (10, MAX_LIFE) band. The two conditions are combined so
        # that the RUL check can no longer reset an end-of-data `done`.
        rul_out_of_range = self.rul <= 10 or self.rul >= MAX_LIFE
        done = bool(data_exhausted or rul_out_of_range)

        obs = self._next_observation()

        a_time.append(self.current_step)
        a_rewards.append(self.reward)
        a_rul.append(self.rul)
        a_cost.append(self.maintenance_cost)
        a_actions.append(action)
        a_events.append(self.maintenance_events)

        # From database extract recommended action
        recommended_action = self.df.loc[self.current_step: self.current_step, 'ACTION_CODE'].values[0]
        a_action_recommended.append(recommended_action)

        return obs, reward, done, {}

    def reset(self):
        """Restart the episode at the first data row.

        Returns:
            Float32 array of three uniform random values in [0, 1).
            NOTE(review): this is not derived from the data frame - confirm
            whether the first real observation should be returned instead.
        """
        self.state = np.random.uniform(low=-0.0, high=1.0, size=(3,))

        # Episodes always start at the first row of the data frame.
        self.current_step = 0

        self.rul = self.df.loc[self.current_step: self.current_step, 'RUL'].values[0]
        print('\n --- RESET. Starting RUL = ', self.rul)

        # Reset the bookkeeping to its initial state
        self.maintenance_cost = 0
        self.maintenance_events = 0
        self.reward = 0
        self.current_action = NO_ACTION

        return np.array(self.state, dtype=np.float32)

    def render(self, mode='human', close=False):
        """Print the latest action code, RUL, cost and reward.

        ``mode`` and ``close`` are accepted but not used.
        """
        print('>> {0:<20} | RUL: {1:>8.2f} | Cost: {2:>8.2f} | Reward: {3:>12.3f}'.
              format(self.current_action, self.rul, self.maintenance_cost, self.reward))

### Check environment definition stability

In [None]:
# from stable_baselines3.common.env_checker import check_env
# env = BearingEnv(df)
# check_env(env)

### Read data

In [None]:
# Load the bearing data set. BearingEnv reads its 'RUL', 'vibration',
# 'temperature' and 'ACTION_CODE' columns.
df = pd.read_csv(BEARING_FAILURE_DATA_FILE)

In [None]:
# Training budget; passed as `total_timesteps` to `model.learn` in the training cell.
EPISODES = EPISODES_MAX
# Step count for the post-training test roll-out (assumed from the name - TODO confirm).
RUN_TEST = 20

### Train the agent

In [None]:
# The algorithms require a vectorized environment to run
# DummyVecEnv is given a list of factory callables; a single BearingEnv is wrapped here.
env = DummyVecEnv([lambda: BearingEnv(df)])

# A2C is the active algorithm; the DQN and PPO lines are alternatives with the same arguments.
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log="./tensorboard/")
#model = DQN('MlpPolicy', env, verbose=1, tensorboard_log="./tensorboard/")
#model = PPO('MlpPolicy', env, verbose=1, tensorboard_log="./tensorboard/")

# NOTE: per the Stable-Baselines3 documentation `total_timesteps` counts
# environment steps, not episodes, despite the EPISODES name.
model.learn(total_timesteps=EPISODES)

### Store results in a Pandas data-frame

In [None]:
# Assemble the per-step logs filled in by BearingEnv into one table,
# one row per environment step. Each list is copied before use.
df_results = pd.DataFrame(
    {
        'Time-step': list(a_time),
        'Recommended_Action': list(a_action_recommended),
        'Action': list(a_action_text),
        'Action_Code': list(a_actions),
        'Rewards': list(a_rewards),
        'RUL': list(a_rul),
        'MaintenanceEvents': list(a_events),
        'Cost': list(a_cost),
    }
)
df_results.head()

In [None]:
# Rescale each metric by its own maximum so the three series share a
# comparable range.
for metric in ('Rewards', 'RUL', 'Cost'):
    df_results[metric] = df_results[metric] / df_results[metric].max()

### Convert to CSV for Analysis notebook

In [None]:
# Write the (normalised) results table to disk for the analysis notebook.
# With pandas' default arguments the row index is written as the first column.
df_results.to_csv(TRAINING_RESULTS_FILE)

### Test agent

In [None]:
# Roll the trained policy forward for RUN_TEST steps, printing the
# environment status after each one. RUN_TEST replaces the previously
# hard-coded step count of 20 (same value).
obs = env.reset()
for i in range(RUN_TEST):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()