In [None]:
import numpy as np
class ReLU:
  """Rectified linear unit layer: passes positive inputs, zeroes the rest."""

  def __init__(self):
    # No trainable parameters; empty lists keep the common layer interface.
    self.params = []
    self.grads = []
    # Activation produced by the last `forward` call, reused by `backward`.
    self.out = None

  def forward(self, x):
    """Return `max(0, x)` element-wise and cache it for the backward pass."""
    self.out = np.maximum(0, x)
    return self.out

  def backward(self, dout):
    """Let `dout` through only where the cached activation is positive."""
    gate = np.where(self.out > 0, 1, 0)
    return dout * gate

In [None]:
import numpy as np
class Sigmoid:
  """Logistic sigmoid activation layer."""

  def __init__(self):
    # No trainable parameters; empty lists keep the common layer interface.
    self.params = []
    self.grads = []
    # Activation produced by the last `forward` call, reused by `backward`.
    self.out = None

  def forward(self, x):
    """Return `1 / (1 + exp(-x))` element-wise and cache it."""
    self.out = 1 / (1 + np.exp(-x))
    return self.out

  def backward(self, dout):
    """Scale `dout` by the sigmoid derivative `y * (1 - y)` of the cached output."""
    y = self.out
    return dout * (y) * (1.0 - y)

In [None]:
class MSELoss:
  """Mean squared error between the Q-value of the taken action and its target.

  Only the output column selected by each sample's action contributes to the
  loss; every other column is masked out with a one-hot matrix.
  """

  def __init__(self):
    # No trainable parameters; empty lists keep the common layer interface.
    self.params=[]
    self.grads=[]
    # Caches written by `forward` and read by `backward`.
    self.x=None
    self.t=None
    self.actions=None

  def forward(self,x,t,actions):
    """Compute the masked squared error averaged over the batch.

    Args:
      x: 2-D array of predictions, one row per sample and one column per action.
      t: 1-D sequence with one scalar target per sample.
      actions: sequence with one action index per sample (converted to int).

    Returns:
      Sum of squared errors on the selected columns divided by `len(x)`.
    """
    x=np.asarray(x)
    # The number of actions comes from the prediction width instead of a fixed 2.
    n_actions=x.shape[1]
    # Repeat each target across all action columns so it lines up with `x`.
    t=np.repeat(np.asarray(t).reshape(-1,1),n_actions,axis=1)

    # One-hot mask of the taken actions, built with a single fancy-index write.
    action_idx=np.asarray(actions).astype(int)
    onehot_actions=np.zeros((len(action_idx),n_actions))
    onehot_actions[np.arange(len(action_idx)),action_idx]=1

    self.x=x
    self.t=t
    self.actions=onehot_actions
    square=(x-t)*(x-t)*onehot_actions
    out=square.sum()/len(x)
    return out

  def backward(self,dout):
    """Return d(loss)/dx scaled by the upstream gradient `dout`.

    The division by the batch size mirrors the averaging done in `forward`, so
    the returned array is the gradient of the value `forward` reports. Learning
    rates tuned against an un-averaged gradient may need to be rescaled.
    """
    return 2*dout*(self.x-self.t)*self.actions/len(self.x)

In [None]:
class Affine:
  """Fully connected layer computing `x @ W + b`."""

  def __init__(self, W, b):
    # Parameters and gradients live in parallel lists: [weight, bias].
    self.params = [W, b]
    self.grads = [np.zeros_like(W), np.zeros_like(b)]
    # Input of the last `forward` call, needed for the weight gradient.
    self.x = None

  def forward(self, x):
    """Apply the affine map to `x` and remember `x` for the backward pass."""
    weight, bias = self.params
    result = np.matmul(x, weight) + bias
    self.x = x
    return result

  def backward(self, dout):
    """Store the weight and bias gradients and return the gradient w.r.t. the input."""
    weight, _bias = self.params
    grad_input = np.matmul(dout, weight.T)
    grad_weight = np.matmul(self.x.T, dout)
    grad_bias = np.sum(dout, axis=0)

    # Write in place so any external reference to the gradient arrays stays valid.
    weight_slot, bias_slot = self.grads
    weight_slot[...] = grad_weight
    bias_slot[...] = grad_bias

    return grad_input



In [None]:
import numpy as np
class NeuralNet():
  """Multi-layer perceptron: Affine+Sigmoid hidden stages and a linear Affine output."""

  def __init__(self,input_dim,output_dim,hidden_dim=(8,4)):
    """Build the layer stack with weights and biases drawn from `np.random.rand`.

    Args:
      input_dim: width of the input vectors.
      output_dim: width of the network output.
      hidden_dim: non-empty sequence with the width of each hidden layer.
    """
    # Consecutive pairs of this list are the (fan_in, fan_out) of each Affine layer.
    widths=[input_dim,hidden_dim[0],*hidden_dim[1:],output_dim]
    final_index=len(widths)-2

    self.layers=[]
    for index,(fan_in,fan_out) in enumerate(zip(widths[:-1],widths[1:])):
      # Draw the weight before the bias, layer by layer.
      weight=np.random.rand(fan_in,fan_out)
      bias=np.random.rand(fan_out)
      self.layers.append(Affine(weight,bias))
      # Every Affine except the last one is followed by a Sigmoid.
      if index<final_index:
        self.layers.append(Sigmoid())
    self.loss_fn=MSELoss()

    # Flat views over the layers' own parameter/gradient arrays (shared references).
    self.params=[param for layer in self.layers for param in layer.params]
    self.grads=[grad for layer in self.layers for grad in layer.grads]

  def predict(self,x):
    """Run `x` through every layer in order and return the final output."""
    self.batch_size=len(x)
    activation=x
    for layer in self.layers:
      activation=layer.forward(activation)
    return activation

  def forward(self,x,t,actions):
    """Return the loss of the prediction for `x` against targets `t` for `actions`."""
    return self.loss_fn.forward(self.predict(x),t,actions)

  def backward(self,dout=1):
    """Back-propagate from the loss through the layers in reverse order."""
    grad=self.loss_fn.backward(dout)
    for layer in self.layers[::-1]:
      grad=layer.backward(grad)


# Optimizers

In [None]:
class semi_GD():
  """Plain gradient-descent optimizer acting on a model's `params`/`grads` lists."""

  def __init__(self,model,learning_rate):
    self.model=model
    self.lr=learning_rate

  def update(self):
    """Subtract `lr * grad` from every parameter, in place."""
    params=self.model.params
    grads=self.model.grads
    for idx in range(len(params)):
      step=self.lr*grads[idx]
      params[idx]-=step

class Momentum():
  """Gradient descent with a momentum (velocity) term per parameter.

  Args:
    model: object exposing parallel lists `params` and `grads` of arrays.
    lr: learning rate applied to the velocity.
    momentum: factor by which the previous velocity is carried over.
  """

  def __init__(self,model,lr,momentum):
    self.lr=lr
    self.momentum=momentum
    self.model=model
    # One velocity buffer per parameter, shaped like its gradient.
    self.v=[]
    for i in range(len(self.model.params)):
      self.v.append(np.zeros_like(self.model.grads[i]))

  def update(self):
    """Refresh each velocity and move the matching parameter in place."""
    for i in range(len(self.model.params)):
      self.v[i]=self.momentum*self.v[i]+self.model.grads[i]
      # Use this parameter's own velocity; multiplying lr by the whole
      # list `self.v` raises TypeError for a float learning rate.
      self.model.params[i]-=self.lr*self.v[i]

class RMSProp():
  """RMSProp optimizer: scales each step by a running average of squared gradients."""

  def __init__(self,model,lr,gamma):
    self.model=model
    self.lr=lr
    self.gamma=gamma
    # Running average of squared gradients, one buffer per parameter.
    self.s=[np.zeros_like(self.model.grads[i]) for i in range(len(self.model.params))]

  def update(self):
    """Update the squared-gradient averages and step every parameter in place."""
    epsilon=1e-6  # added inside the square root to avoid dividing by zero
    decay=self.gamma
    for idx in range(len(self.model.params)):
      grad=self.model.grads[idx]
      self.s[idx]=decay*self.s[idx]+(1-decay)*grad*grad
      scale=self.lr/np.sqrt(self.s[idx]+epsilon)
      self.model.params[idx]-=scale*grad



In [None]:
import gym
# Create the Blackjack environment; `env` is reused by the training cells further down.
env=gym.make('Blackjack-v1')
# Print one randomly sampled observation to show what a state looks like.
print(env.observation_space.sample())
env.reset()
# Take a single step with action 1 and unpack the four values `step` returns.
# NOTE(review): the recorded output prints `False {}` for the last two values, so with
# this gym version they look like (done, info) rather than (terminated, truncated) — confirm.
s,r,terminated,truncated=env.step(1)
print(s,r,terminated,truncated)

(22, 5, 1)
(21, 1, False) 0.0 False {}


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


# Q-learning using a neural network
  Train function

In [None]:
from tqdm import tqdm
def epsilon_greedy(Q,epsilon=0.01):
  """Pick an action index from the value vector `Q` with epsilon-greedy exploration.

  A uniform draw above `epsilon` selects the arg-max of `Q`; otherwise an
  index is chosen uniformly at random from `range(len(Q))`.
  """
  draw=np.random.rand()
  if draw>epsilon:
    # Exploit: best-valued action.
    return np.argmax(Q)
  # Explore: uniformly random action.
  return np.random.choice(len(Q))

def train(env,strategy,model,optimizer,max_episode,max_epoch,batch_size,gamma):
  """Play episodes and fit `model` with Q-learning on each full batch of transitions.

  Args:
    env: environment whose `reset()` returns an observation and whose
      `step(action)` returns a 4-tuple `(next_state, reward, done, info)`.
    strategy: callable that maps a vector of Q-values to an action index
      (for example `epsilon_greedy`).
    model: object providing `predict(states)`, `forward(states, targets, actions)`
      and `backward()`.
    optimizer: object providing `update()`; called once per gradient step.
    max_episode: number of episodes to play.
    max_epoch: number of gradient steps taken on each collected batch.
    batch_size: number of transitions gathered before a training round.
    gamma: discount factor used in the target `reward + gamma * max Q(next)`.
  """
  batch=[]
  counter=0
  for episode in tqdm(range(max_episode)):
    state,done=env.reset(),False
    state=np.array(list(state)).reshape(1,-1)

    while True:
      Qs=model.predict(state)[0]
      # Honor the policy passed by the caller instead of a hard-coded one.
      action=strategy(Qs)
      next_state,reward,done,_=env.step(action)
      next_state=np.array(list(next_state)).reshape(1,-1)
      batch.append([state[0],action,reward,next_state[0],float(done)])

      # Train on the batch once enough transitions have been collected.
      # Known problem (author's note): the gradient only seems to update on the
      # first training pass; from the second pass onward it no longer changes.
      # NOTE(review): possibly saturated hidden activations on the raw,
      # unscaled observations — verify.
      if len(batch)==batch_size:
        counter+=1
        states=np.array([x[0] for x in batch])
        actions=[x[1] for x in batch]
        rewards=np.array([x[2] for x in batch])
        next_states=np.array([x[3] for x in batch])
        dones=np.array([x[4] for x in batch])
        for epoch in range(max_epoch):
          # Targets are recomputed every step from the current model.
          Qns=model.predict(next_states)
          Qns_max=Qns.max(axis=1)
          # (1-dones) removes the bootstrap term on terminal transitions.
          Q_targets=rewards+gamma*Qns_max*(1-dones)
          model.forward(states,Q_targets,actions)
          model.backward()
          optimizer.update()
        if counter%100==0:
          print(f'{counter}th batch loss={model.forward(states,Q_targets,actions)}')
        batch=[]
      if done:
        break
      state=next_state


# Deep Q-learning using gradient descent

In [None]:
import numpy as np
def learned_policy(model):
  """Print the greedy action of `model` for a grid of Blackjack states.

  The grid covers usable ace in {0, 1}, player card sum 12..21 and dealer
  showing card 1..10 (Blackjack's dealer card runs 1..10 with 1 = ace, so 0
  is not a reachable value). Each state is passed to `model.predict` as a
  1x3 array and the arg-max of the returned values is printed.

  Args:
    model: object whose `predict(array)` returns an array of action values.
  """
  for usable_ace in range(2):
    print('usable ace=',usable_ace)
    for player_sum in range(12,22):
      print('Card sum=',player_sum)
      # Dealer cards start at 1 (ace) and end at 10, not 0..9.
      for dealer_card in range(1,11):
        state=[player_sum,dealer_card,usable_ace]
        Qs=model.predict(np.array(state).reshape(1,-1))
        print(f'best action for state:{state}:{Qs.argmax()}')
      print('\n')
    print('\n')




In [None]:
# Hyperparameters for the plain gradient-descent run.
# Number of episodes to play.
max_episode=10**5
# Gradient steps taken on every collected batch.
max_epoch=40
# Transitions gathered before each training round.
batch_size=128
# Passed to `train` as its last argument, `gamma` (the discount factor in the target).
decay_rate=1
# Learning rate handed to the optimizer.
lr=1e-4

# Network with 3 inputs and 2 outputs, using the default hidden sizes.
model_GD=NeuralNet(3,2)
optimizer=semi_GD(model_GD,lr)

# `epsilon_greedy` is supplied as the `strategy` argument of `train`.
train(env,epsilon_greedy,model_GD,optimizer,max_episode,max_epoch,batch_size,decay_rate)
# Print the greedy action the trained model picks for each enumerated state.
learned_policy(model_GD)

 13%|█▎        | 13222/100000 [00:03<00:23, 3655.72it/s]

100th batch loss=0.917914183712629


 26%|██▌       | 25849/100000 [00:07<00:18, 4079.27it/s]

200th batch loss=0.968452554257319


 39%|███▉      | 38825/100000 [00:10<00:15, 3922.59it/s]

300th batch loss=0.8395742208400955


 51%|█████▏    | 51317/100000 [00:13<00:12, 3812.40it/s]

400th batch loss=0.9005250322548581


 64%|██████▍   | 64401/100000 [00:16<00:08, 4123.39it/s]

500th batch loss=0.8193616167693974


 77%|███████▋  | 77056/100000 [00:20<00:05, 4121.02it/s]

600th batch loss=0.8905757560677321


 90%|████████▉ | 89879/100000 [00:23<00:02, 4071.17it/s]

700th batch loss=0.9391243946425842


100%|██████████| 100000/100000 [00:25<00:00, 3899.31it/s]

usable ace= 0
Card sum= 12
best action for state:[12, 0, 0]:0
best action for state:[12, 1, 0]:0
best action for state:[12, 2, 0]:0
best action for state:[12, 3, 0]:0
best action for state:[12, 4, 0]:0
best action for state:[12, 5, 0]:0
best action for state:[12, 6, 0]:0
best action for state:[12, 7, 0]:0
best action for state:[12, 8, 0]:0
best action for state:[12, 9, 0]:0


Card sum= 13
best action for state:[13, 0, 0]:0
best action for state:[13, 1, 0]:0
best action for state:[13, 2, 0]:0
best action for state:[13, 3, 0]:0
best action for state:[13, 4, 0]:0
best action for state:[13, 5, 0]:0
best action for state:[13, 6, 0]:0
best action for state:[13, 7, 0]:0
best action for state:[13, 8, 0]:0
best action for state:[13, 9, 0]:0


Card sum= 14
best action for state:[14, 0, 0]:0
best action for state:[14, 1, 0]:0
best action for state:[14, 2, 0]:0
best action for state:[14, 3, 0]:0
best action for state:[14, 4, 0]:0
best action for state:[14, 5, 0]:0
best action for state:[14, 6, 0]:




# Deep Q-learning using RMSProp


In [None]:
# Hyperparameters for the RMSProp run.
# Number of episodes to play.
max_episode=10**7
# Gradient steps taken on every collected batch.
max_epoch=40
# Transitions gathered before each training round.
batch_size=1024
# Passed to `train` as its last argument, `gamma` (the discount factor in the target).
decay_rate=1
# Learning rate handed to the optimizer.
lr=0.01
# Passed to RMSProp as `gamma`, the weight kept from the previous squared-gradient average.
RMSProp_gamma=0.99

# Network with 3 inputs and 2 outputs, using the default hidden sizes.
model_RMS=NeuralNet(3,2)
optimizer=RMSProp(model_RMS,lr,RMSProp_gamma)

# `epsilon_greedy` is supplied as the `strategy` argument of `train`.
train(env,epsilon_greedy,model_RMS,optimizer,max_episode,max_epoch,batch_size,decay_rate)
# Print the greedy action the trained model picks for each enumerated state.
learned_policy(model_RMS)

  1%|          | 57733/10000000 [00:16<48:26, 3420.64it/s]

100th batch loss=2.510375910639123


  1%|▏         | 125549/10000000 [00:33<39:00, 4218.75it/s]

200th batch loss=4.776357246888992


  2%|▏         | 199341/10000000 [00:50<37:41, 4333.94it/s]

300th batch loss=3.620938913939032


  3%|▎         | 264856/10000000 [01:07<41:06, 3946.83it/s]

400th batch loss=2.067684980013074


  3%|▎         | 328027/10000000 [01:24<38:44, 4161.23it/s]

500th batch loss=4.2440382037526305


  4%|▍         | 390259/10000000 [01:40<41:45, 3835.89it/s]

600th batch loss=1.7493375697027462


  4%|▍         | 449562/10000000 [01:57<42:48, 3718.57it/s]

700th batch loss=0.46509220283074776


  5%|▌         | 512260/10000000 [02:13<39:13, 4030.76it/s]

800th batch loss=0.44289942030051566


  6%|▌         | 576794/10000000 [02:30<42:10, 3723.98it/s]

900th batch loss=0.4676506380322639


  6%|▋         | 641340/10000000 [02:47<43:06, 3618.44it/s]

1000th batch loss=0.4613093871084779


  7%|▋         | 705719/10000000 [03:04<42:48, 3619.03it/s]

1100th batch loss=0.44594224193271315


  8%|▊         | 771126/10000000 [03:21<41:17, 3724.36it/s]

1200th batch loss=0.48689132283942754


  8%|▊         | 836923/10000000 [03:38<41:28, 3681.55it/s]

1300th batch loss=0.4838292411280123


  9%|▉         | 902580/10000000 [03:55<40:54, 3706.12it/s]

1400th batch loss=0.4491996910583159


 10%|▉         | 968779/10000000 [04:12<42:13, 3564.05it/s]

1500th batch loss=0.46707394869985264


 10%|█         | 1034581/10000000 [04:29<39:34, 3776.01it/s]

1600th batch loss=0.4946830107508291


 11%|█         | 1100523/10000000 [04:46<38:36, 3842.57it/s]

1700th batch loss=0.47070376627327304


 12%|█▏        | 1166971/10000000 [05:03<38:46, 3796.47it/s]

1800th batch loss=0.5112439649978338


 12%|█▏        | 1233378/10000000 [05:20<39:19, 3715.63it/s]

1900th batch loss=0.47629736428917824


 13%|█▎        | 1299126/10000000 [05:37<34:21, 4220.27it/s]

2000th batch loss=0.47458019683629316


 14%|█▎        | 1365270/10000000 [05:54<34:03, 4224.87it/s]

2100th batch loss=0.49401878556946094


 14%|█▍        | 1431799/10000000 [06:11<34:33, 4132.78it/s]

2200th batch loss=0.46864850970510785


 15%|█▍        | 1498263/10000000 [06:28<33:29, 4231.39it/s]

2300th batch loss=0.45611224895729385


 16%|█▌        | 1564871/10000000 [06:45<37:43, 3727.00it/s]

2400th batch loss=0.4789392041901258


 16%|█▋        | 1630972/10000000 [07:02<35:01, 3982.57it/s]

2500th batch loss=0.48028735307399284


 17%|█▋        | 1697550/10000000 [07:19<33:16, 4159.17it/s]

2600th batch loss=0.4609086746754215


 18%|█▊        | 1763965/10000000 [07:37<33:36, 4083.49it/s]

2700th batch loss=0.4433511357447123


 18%|█▊        | 1830469/10000000 [07:54<36:20, 3747.03it/s]

2800th batch loss=0.4439812594532306


 19%|█▉        | 1897593/10000000 [08:11<27:50, 4850.21it/s]

2900th batch loss=5.981544755394546


 20%|█▉        | 1993208/10000000 [08:30<30:46, 4336.58it/s]

3000th batch loss=3.195518915818352


 21%|██        | 2062935/10000000 [08:48<38:03, 3475.31it/s]

3100th batch loss=0.8151421145615729


 21%|██▏       | 2125089/10000000 [09:05<34:38, 3788.38it/s]

3200th batch loss=0.49101201451860943


 22%|██▏       | 2193525/10000000 [09:22<34:18, 3792.16it/s]

3300th batch loss=0.5043070504506483


 23%|██▎       | 2262698/10000000 [09:39<30:32, 4222.08it/s]

3400th batch loss=0.4904544701923842


 23%|██▎       | 2330794/10000000 [09:56<30:04, 4250.81it/s]

3500th batch loss=0.4844829560882642


 24%|██▍       | 2398039/10000000 [10:13<29:45, 4256.79it/s]

3600th batch loss=0.5069705489231188


 25%|██▍       | 2465251/10000000 [10:30<29:53, 4201.44it/s]

3700th batch loss=0.47661838061677


 25%|██▌       | 2533080/10000000 [10:47<30:00, 4148.02it/s]

3800th batch loss=0.49788061322617544


 26%|██▌       | 2600845/10000000 [11:05<32:29, 3795.90it/s]

3900th batch loss=0.5098731643554635


 27%|██▋       | 2668113/10000000 [11:22<29:08, 4192.96it/s]

4000th batch loss=0.49586984086299746


 27%|██▋       | 2736281/10000000 [11:39<32:02, 3779.00it/s]

4100th batch loss=0.48328540214219773


 28%|██▊       | 2804151/10000000 [11:56<31:15, 3836.24it/s]

4200th batch loss=0.5213032662448958


 29%|██▊       | 2871822/10000000 [12:13<28:18, 4196.38it/s]

4300th batch loss=0.46970465721369103


 29%|██▉       | 2939959/10000000 [12:30<31:02, 3790.28it/s]

4400th batch loss=0.5275594550081159


 30%|███       | 3008317/10000000 [12:47<27:48, 4190.66it/s]

4500th batch loss=0.4745664816154166


 31%|███       | 3076403/10000000 [13:05<27:31, 4192.04it/s]

4600th batch loss=0.4827558540040162


 31%|███▏      | 3145279/10000000 [13:22<30:50, 3705.16it/s]

4700th batch loss=0.4839263095049928


 32%|███▏      | 3212852/10000000 [13:40<30:01, 3767.33it/s]

4800th batch loss=0.4735253016002877


 33%|███▎      | 3280913/10000000 [13:57<29:31, 3793.57it/s]

4900th batch loss=0.48621127818013266


 33%|███▎      | 3349203/10000000 [14:14<26:24, 4196.40it/s]

5000th batch loss=0.48866041954157236


 34%|███▍      | 3417137/10000000 [14:31<26:14, 4181.44it/s]

5100th batch loss=0.48948714140113936


 35%|███▍      | 3485288/10000000 [14:48<30:27, 3564.08it/s]

5200th batch loss=0.4565126990030861


 36%|███▌      | 3553064/10000000 [15:06<26:04, 4120.10it/s]

5300th batch loss=0.47160550036795024


 36%|███▌      | 3621457/10000000 [15:23<25:25, 4181.39it/s]

5400th batch loss=0.4820497655913132


 37%|███▋      | 3689632/10000000 [15:40<28:19, 3713.32it/s]

5500th batch loss=0.45059279020235987


 38%|███▊      | 3757411/10000000 [15:57<24:38, 4221.80it/s]

5600th batch loss=0.4835144818342295


 38%|███▊      | 3825285/10000000 [16:15<27:01, 3807.93it/s]

5700th batch loss=0.5170588398650992


 39%|███▉      | 3892707/10000000 [16:32<24:18, 4186.87it/s]

5800th batch loss=0.4593124051538127


 40%|███▉      | 3960325/10000000 [16:49<25:19, 3973.93it/s]

5900th batch loss=0.45311659810977745


 40%|████      | 4028247/10000000 [17:06<23:36, 4214.47it/s]

6000th batch loss=0.48403413372686965


 41%|████      | 4095627/10000000 [17:23<24:39, 3991.03it/s]

6100th batch loss=0.44127766244291916


 42%|████▏     | 4163705/10000000 [17:40<25:43, 3782.36it/s]

6200th batch loss=0.4657308965823707


 42%|████▏     | 4231356/10000000 [17:57<22:55, 4192.96it/s]

6300th batch loss=0.5166873727949002


 43%|████▎     | 4299487/10000000 [18:15<24:57, 3807.08it/s]

6400th batch loss=0.5082714239206623


 44%|████▎     | 4366846/10000000 [18:32<22:29, 4174.99it/s]

6500th batch loss=0.4608565541995934


 44%|████▍     | 4434365/10000000 [18:49<22:35, 4104.54it/s]

6600th batch loss=0.45606596802244936


 45%|████▌     | 4501881/10000000 [19:06<24:24, 3753.49it/s]

6700th batch loss=0.49796963101819375


 46%|████▌     | 4569333/10000000 [19:23<24:28, 3698.68it/s]

6800th batch loss=0.41698308469871026


 46%|████▋     | 4637347/10000000 [19:40<23:55, 3735.41it/s]

6900th batch loss=0.46296249707898274


 47%|████▋     | 4704347/10000000 [19:58<21:10, 4168.99it/s]

7000th batch loss=0.4728377133372133


 48%|████▊     | 4772142/10000000 [20:15<22:59, 3789.57it/s]

7100th batch loss=0.5190445404858879


 48%|████▊     | 4839097/10000000 [20:32<20:43, 4150.85it/s]

7200th batch loss=0.45633872294170147


 49%|████▉     | 4906575/10000000 [20:49<20:13, 4195.98it/s]

7300th batch loss=0.4576897568741356


 50%|████▉     | 4974078/10000000 [21:06<21:45, 3851.06it/s]

7400th batch loss=0.47697025221192935


 50%|█████     | 5041012/10000000 [21:23<19:44, 4185.62it/s]

7500th batch loss=0.48738150738713715


 51%|█████     | 5108214/10000000 [21:40<21:37, 3771.16it/s]

7600th batch loss=0.47516822583043683


 52%|█████▏    | 5175346/10000000 [21:57<21:47, 3688.93it/s]

7700th batch loss=0.4480824602247106


 52%|█████▏    | 5242230/10000000 [22:14<19:59, 3967.37it/s]

7800th batch loss=0.4555477815382195


 53%|█████▎    | 5309662/10000000 [22:31<20:53, 3742.88it/s]

7900th batch loss=0.489320231735348


 54%|█████▍    | 5376132/10000000 [22:49<21:31, 3580.89it/s]

8000th batch loss=0.435973796013324


 54%|█████▍    | 5443212/10000000 [23:06<18:12, 4169.18it/s]

8100th batch loss=0.4781331880922275


 55%|█████▌    | 5510211/10000000 [23:23<17:53, 4183.43it/s]

8200th batch loss=0.5316445046277598


 56%|█████▌    | 5577126/10000000 [23:40<19:59, 3686.84it/s]

8300th batch loss=0.4356108336741504


 56%|█████▋    | 5644195/10000000 [23:57<17:17, 4197.28it/s]

8400th batch loss=0.4847094135728763


 57%|█████▋    | 5711173/10000000 [24:15<19:41, 3628.63it/s]

8500th batch loss=0.48216147346603183


 58%|█████▊    | 5778459/10000000 [24:32<16:35, 4240.54it/s]

8600th batch loss=0.49504052023503536


 58%|█████▊    | 5846287/10000000 [24:49<16:42, 4143.36it/s]

8700th batch loss=0.46894510419636926


 59%|█████▉    | 5913634/10000000 [25:06<15:56, 4270.27it/s]

8800th batch loss=0.47489279030748055


 60%|█████▉    | 5980565/10000000 [25:23<17:40, 3789.81it/s]

8900th batch loss=0.4973178639962794


 60%|██████    | 6047738/10000000 [25:40<15:30, 4248.92it/s]

9000th batch loss=0.4787946466341501


 61%|██████    | 6115106/10000000 [25:57<16:54, 3829.95it/s]

9100th batch loss=0.4971758619699247


 62%|██████▏   | 6181889/10000000 [26:14<17:22, 3662.30it/s]

9200th batch loss=0.46151894263293275


 62%|██████▏   | 6248884/10000000 [26:31<15:26, 4049.20it/s]

9300th batch loss=0.4871551510369289


 63%|██████▎   | 6315815/10000000 [26:48<15:12, 4036.31it/s]

9400th batch loss=0.42256465702495233


 64%|██████▍   | 6383062/10000000 [27:06<16:18, 3695.23it/s]

9500th batch loss=0.4660297591187421


 65%|██████▍   | 6450224/10000000 [27:23<15:45, 3755.09it/s]

9600th batch loss=0.4947885122830829


 65%|██████▌   | 6516899/10000000 [27:40<15:20, 3785.70it/s]

9700th batch loss=0.483561837667551


 66%|██████▌   | 6583956/10000000 [27:57<15:17, 3724.79it/s]

9800th batch loss=0.5005462123831683


 67%|██████▋   | 6650912/10000000 [28:14<15:01, 3715.23it/s]

9900th batch loss=0.47502078838444306


 67%|██████▋   | 6717596/10000000 [28:31<13:18, 4112.02it/s]

10000th batch loss=0.45960025292686973


 68%|██████▊   | 6784519/10000000 [28:48<15:06, 3548.65it/s]

10100th batch loss=0.43875779402297665


 69%|██████▊   | 6851925/10000000 [29:05<14:03, 3733.57it/s]

10200th batch loss=0.4719412368040519


 69%|██████▉   | 6919144/10000000 [29:23<14:01, 3660.74it/s]

10300th batch loss=0.4771375258926625


 70%|██████▉   | 6986019/10000000 [29:40<11:59, 4187.08it/s]

10400th batch loss=0.447034254077027


 71%|███████   | 7053728/10000000 [29:57<11:47, 4161.79it/s]

10500th batch loss=0.45027347124400036


 71%|███████   | 7120616/10000000 [30:14<11:35, 4141.38it/s]

10600th batch loss=0.46172614513573984


 72%|███████▏  | 7187877/10000000 [30:31<11:07, 4213.29it/s]

10700th batch loss=0.5003732785657229


 73%|███████▎  | 7254937/10000000 [30:48<12:07, 3775.39it/s]

10800th batch loss=0.5059685802080525


 73%|███████▎  | 7322244/10000000 [31:05<11:21, 3927.91it/s]

10900th batch loss=0.504357668555518


 74%|███████▍  | 7389822/10000000 [31:23<10:49, 4018.91it/s]

11000th batch loss=0.48752284101364124


 75%|███████▍  | 7457206/10000000 [31:40<11:06, 3817.32it/s]

11100th batch loss=0.49867996197578857


 75%|███████▌  | 7524605/10000000 [31:57<10:47, 3823.95it/s]

11200th batch loss=0.49443851347813367


 76%|███████▌  | 7591742/10000000 [32:14<09:54, 4052.16it/s]

11300th batch loss=0.4735372485337699


 77%|███████▋  | 7659400/10000000 [32:31<09:24, 4149.92it/s]

11400th batch loss=0.46251108005147323


 77%|███████▋  | 7726695/10000000 [32:48<10:29, 3610.65it/s]

11500th batch loss=0.49801526285515874


 78%|███████▊  | 7793539/10000000 [33:05<10:19, 3564.27it/s]

11600th batch loss=0.4728197808120655


 79%|███████▊  | 7860702/10000000 [33:23<08:36, 4143.36it/s]

11700th batch loss=0.4771058519179036


 79%|███████▉  | 7928001/10000000 [33:40<09:22, 3681.89it/s]

11800th batch loss=0.46820517785382043


 80%|███████▉  | 7994739/10000000 [33:57<07:56, 4206.29it/s]

11900th batch loss=0.4617602981735328


 81%|████████  | 8061663/10000000 [34:14<07:48, 4134.36it/s]

12000th batch loss=0.4819263560379183


 81%|████████▏ | 8129079/10000000 [34:31<08:12, 3795.72it/s]

12100th batch loss=0.4742867913423465


 82%|████████▏ | 8196024/10000000 [34:48<07:16, 4133.84it/s]

12200th batch loss=0.4593941809048747


 83%|████████▎ | 8262577/10000000 [35:05<08:00, 3616.52it/s]

12300th batch loss=0.4441700354611981


 83%|████████▎ | 8329478/10000000 [35:23<06:45, 4124.72it/s]

12400th batch loss=0.48358094310635114


 84%|████████▍ | 8396373/10000000 [35:40<06:21, 4200.26it/s]

12500th batch loss=0.4593128758880999


 85%|████████▍ | 8463428/10000000 [35:57<06:19, 4051.85it/s]

12600th batch loss=0.4555721708281115


 85%|████████▌ | 8530234/10000000 [36:14<06:36, 3711.10it/s]

12700th batch loss=0.4586284949467643


 86%|████████▌ | 8597268/10000000 [36:31<06:17, 3713.16it/s]

12800th batch loss=0.48430533856144803


 87%|████████▋ | 8664135/10000000 [36:48<05:59, 3713.02it/s]

12900th batch loss=0.4620972421461713


 87%|████████▋ | 8730755/10000000 [37:05<05:45, 3675.99it/s]

13000th batch loss=0.44940046465872985


 88%|████████▊ | 8798223/10000000 [37:23<05:22, 3727.68it/s]

13100th batch loss=0.48535065966526025


 89%|████████▊ | 8865466/10000000 [37:40<05:00, 3771.40it/s]

13200th batch loss=0.4808501549569778


 89%|████████▉ | 8932083/10000000 [37:57<04:17, 4143.16it/s]

13300th batch loss=0.48049576782756126


 90%|████████▉ | 8999463/10000000 [38:14<04:26, 3752.89it/s]

13400th batch loss=0.44072698853759273


 91%|█████████ | 9066046/10000000 [38:31<03:41, 4214.99it/s]

13500th batch loss=0.44064723518226356


 91%|█████████▏| 9132207/10000000 [38:48<03:45, 3850.52it/s]

13600th batch loss=0.4583858357580559


 92%|█████████▏| 9199411/10000000 [39:05<03:37, 3688.77it/s]

13700th batch loss=0.4761968349495568


 93%|█████████▎| 9266323/10000000 [39:22<02:52, 4252.07it/s]

13800th batch loss=0.503273687126472


 93%|█████████▎| 9333250/10000000 [39:39<02:39, 4188.51it/s]

13900th batch loss=0.46684083907801055


 94%|█████████▍| 9400644/10000000 [39:56<02:42, 3692.83it/s]

14000th batch loss=0.48065960704436406


 95%|█████████▍| 9466909/10000000 [40:13<02:26, 3648.04it/s]

14100th batch loss=0.48280154178729506


 95%|█████████▌| 9534296/10000000 [40:31<02:05, 3718.01it/s]

14200th batch loss=0.4547945113985629


 96%|█████████▌| 9601170/10000000 [40:48<01:47, 3714.61it/s]

14300th batch loss=0.4519583506545181


 97%|█████████▋| 9668412/10000000 [41:05<01:26, 3824.84it/s]

14400th batch loss=0.48212540729627384


 97%|█████████▋| 9735660/10000000 [41:22<01:12, 3641.05it/s]

14500th batch loss=0.47796761886497907


 98%|█████████▊| 9803182/10000000 [41:39<00:56, 3465.03it/s]

14600th batch loss=0.46998941959885243


 99%|█████████▊| 9870025/10000000 [41:56<00:31, 4187.37it/s]

14700th batch loss=0.49290815297744073


 99%|█████████▉| 9936958/10000000 [42:13<00:17, 3698.93it/s]

14800th batch loss=0.45822251609291065


100%|██████████| 10000000/10000000 [42:30<00:00, 3921.11it/s]

usable ace= 0
Card sum= 12
best action for state:[12, 0, 0]:1
best action for state:[12, 1, 0]:1
best action for state:[12, 2, 0]:1
best action for state:[12, 3, 0]:1
best action for state:[12, 4, 0]:1
best action for state:[12, 5, 0]:1
best action for state:[12, 6, 0]:1
best action for state:[12, 7, 0]:1
best action for state:[12, 8, 0]:1
best action for state:[12, 9, 0]:1


Card sum= 13
best action for state:[13, 0, 0]:1
best action for state:[13, 1, 0]:1
best action for state:[13, 2, 0]:1
best action for state:[13, 3, 0]:1
best action for state:[13, 4, 0]:1
best action for state:[13, 5, 0]:1
best action for state:[13, 6, 0]:1
best action for state:[13, 7, 0]:1
best action for state:[13, 8, 0]:1
best action for state:[13, 9, 0]:1


Card sum= 14
best action for state:[14, 0, 0]:1
best action for state:[14, 1, 0]:1
best action for state:[14, 2, 0]:0
best action for state:[14, 3, 0]:0
best action for state:[14, 4, 0]:0
best action for state:[14, 5, 0]:0
best action for state:[14, 6, 0]:




In [None]:
# Run `nvidia-smi` through the IPython shell escape and capture its output lines.
gpu_info=!nvidia-smi
# Join the captured lines into one string so it can be searched and printed.
gpu_info='\n'.join(gpu_info)
# The word 'failed' in the output is treated as "no GPU attached".
if gpu_info.find('failed')>=0:
  print('Not connected to GPU')
else:
  print(gpu_info)


Thu Feb 27 01:39:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                