In [2]:
import pandas as pd
import numpy as np
from collections import OrderedDict, deque, Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import randint
import random

random.seed(6)

In [3]:
history = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/switching_with_reinforcement_bachelors/history.csv')
submit = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/switching_with_reinforcement_bachelors/submit.csv')

In [4]:
submit.head()

Unnamed: 0,state,action0,action1,action2
0,393a1,34,33,33
1,2c674,34,33,33
2,eb3be,34,33,33
3,5aca6,34,33,33
4,9512d,34,33,33


In [5]:
print(history.shape)
print('Всего уникальных записей в таблице: ' + str(history.drop_duplicates().shape[0]))
history

(305000, 4)
Всего уникальных записей в таблице: 1813


Unnamed: 0,state,action,reward,next_state
0,785f3,1,-5,785f3
1,0c75c,0,-1,da287
2,aa820,2,-1,aa820
3,e52d0,1,-2,e52d0
4,3055e,1,-2,3055e
...,...,...,...,...
304995,37b73,0,-1,a6c2a
304996,27e16,0,1,81f27
304997,6fb47,2,-3,6fb47
304998,81f27,2,-7,81f27


In [6]:
ind2state = {}
state2ind = {}
for i, state in enumerate(submit.state.values):
  ind2state[i] = state
  state2ind[state] = i

In [7]:
class environment:
  def __init__(self, history):
    self.states = history['state'].unique()
    self.env = {}

    for state in tqdm(self.states):
        next_states = history[history['state'] == state].drop_duplicates()

        self.env[state] = {}
        try:
            self.env[state][0] = [1.0, next_states[next_states['action'] == 0]['next_state'].values[0], int(next_states[next_states['action'] == 0]['reward'].values)]
        except:
            self.env[state][0] = [0.0, state, -100]

        try:
            self.env[state][1] = [1.0, next_states[next_states['action'] == 1]['next_state'].values[0], int(next_states[next_states['action'] == 1]['reward'].values)]
        except:
            self.env[state][1] = [0.0, state, -100]

        try:
            self.env[state][2] = [1.0, next_states[next_states['action'] == 2]['next_state'].values[0], int(next_states[next_states['action'] == 2]['reward'].values)]
        except:
            self.env[state][2] = [0.0, state, -100]
    
    self.cur_state = self.states[0]
    
  def reset(self, state=None):
    if state == None:
        rand = randint(0, len(self.states) - 1)
        self.cur_state = self.states[rand]
    else:
        self.cur_state = state
        
    return self.cur_state
  
  def sample(self):
    p0 = self.env[self.cur_state][0][0]
    p1 = self.env[self.cur_state][1][0]
    p2 = self.env[self.cur_state][2][0]

    if p0 == 0:
      action = np.random.choice(np.arange(0, 3), p=[0., 0.5, 0.5])
    elif p1 == 0:
      action = np.random.choice(np.arange(0, 3), p=[0.5, 0., 0.5])      
    elif p2 == 0:
      action = np.random.choice(np.arange(0, 3), p=[0.5, 0.5, 0.])
    else:
      action = np.random.choice(np.arange(0, 3), p=[0.34, 0.33, 0.33])
    
    return action

  def step(self, action):
    next_state = self.env[self.cur_state][action][1]
    reward = self.env[self.cur_state][action][2]

    self.cur_state = next_state

    return next_state, reward


env = environment(history)

100%|██████████| 610/610 [00:28<00:00, 21.24it/s]


In [8]:
import numpy as np
q_table = np.zeros([610, 3])

In [9]:
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 1.0
epsilon = 0.1

for i in tqdm(range(1, 60001)):
    state = env.reset()

    epochs, reward, = 0, 0
    while epochs < 100:
        if random.uniform(0, 1) < epsilon:
            action = env.sample() # Explore action space
        else:
            action = np.argmax(q_table[state2ind[state]]) # Exploit learned values

        next_state, reward = env.step(action)

        old_value = q_table[state2ind[state], action]
        next_max = np.max(q_table[state2ind[next_state]])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state2ind[state], action] = new_value

        state = next_state

        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

100%|██████████| 60000/60000 [02:27<00:00, 405.70it/s]

Episode: 60000
Training finished.






In [10]:
"""Evaluate agent's performance after Q-learning"""

total_penalties = 0
episodes = 100

for state in tqdm(env.states):
    state = env.reset(state)
    for _ in range(episodes):
        epochs, penalties, reward = 0, 0, 0
        while epochs < 100:
            action = np.argmax(q_table[state2ind[state]])
            state, reward = env.step(action)

            epochs += 1

            penalties += reward


        total_penalties += penalties

print(f"Average rewards per state: {total_penalties / (len(env.states) * episodes * 100)}")

100%|██████████| 610/610 [00:26<00:00, 22.63it/s]

Average rewards per state: 3.1770588524590164





In [11]:
"""Evaluate agent's random performance"""

total_penalties = 0
episodes = 100

for state in tqdm(env.states):
    state = env.reset(state)
    for _ in range(episodes):
        epochs, penalties, reward = 0, 0, 0
        while epochs < 100:
            action = env.sample()
            state, reward = env.step(action)

            epochs += 1

            penalties += reward


        total_penalties += penalties

print(f"Average rewards per episode: {total_penalties / (len(env.states) * episodes * 100)}")

100%|██████████| 610/610 [03:10<00:00,  3.20it/s]

Average rewards per episode: -3.1826581967213117





In [None]:
submission = pd.DataFrame(q_table, columns=['action0', 'action1', 'action2'])
submission['state'] = submission.index
submission['state'] = submission['state'].replace(ind2state)
submission = submission[['state','action0', 'action1', 'action2']]

minimum = submission[['action0', 'action1', 'action2']].min(axis=1)
submission['action0'] = submission.action0 - minimum
submission['action1'] = submission.action1 - minimum
submission['action2'] = submission.action2 - minimum

sum = submission[['action0', 'action1', 'action2']].sum(axis=1)
submission['action0'] = round(submission.action0 * 100 / sum).apply(int)
submission['action1'] = round(submission.action1 * 100 / sum).apply(int)
submission['action2'] = round(submission.action2 * 100 / sum).apply(int)

submission.to_csv('submit.csv', index=False)

#Детерминированный алгоритм переходов

In [None]:
submission['action0'] = submission['action0'].apply(lambda x: 100 if x >= 50 else 0)
submission['action1'] = submission['action1'].apply(lambda x: 100 if x >= 50 else 0)
submission['action2'] = submission['action2'].apply(lambda x: 100 if x >= 50 else 0)

submission.to_csv('determine_submit.csv', index=False)
submission.head()

Unnamed: 0,state,action0,action1,action2
0,393a1,100,0,0
1,2c674,100,0,0
2,eb3be,100,0,0
3,5aca6,100,0,0
4,9512d,100,0,0
