This is a notebook deticated to try an find out the best strategy in a casino (note that there most definitely are better ways to accomplish this than the one displayed here)

In [1581]:
import requests
import random 
import pandas as pd
import numpy as np
import random
import bisect
import collections
import json
from tqdm.notebook import tqdm

In [1582]:
def zero_one_policy(policy):
    """Turns given policy into a "only pick one slot machine" type of policy 

    Parameters
    ----------
    policy: list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.

    Returns
    -------
    list(float)
        Returns an array of equal length as the policy with zeros in all indexes except one where is is "1", which
        represent the "best" slot machine according to the given policy.
        
    """
    idx = np.argmax(policy)
    new_pol = np.zeros(len(policy))
    new_pol[idx] = 1.0
    return new_pol

def get_prob(policy):
    """Turns given policy into a array of probabilities 

    Parameters
    ----------
    policy: list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.

    Returns
    -------
    list(float)
        Returns an array of equal length as the policy with each number representing the probability of 
        choosing that index (i.e. slot machine) if using the given policy.
        
    """
    cdf_pol = cdf(policy)
    n = len(policy)
    probs = np.zeros(n)
    probs[0] = cdf_pol[0]
    for i in range(1,n):
        probs[i] = cdf_pol[i] - cdf_pol[i-1]
    return probs

def cdf(policy):
    """Turns given policy into an array as a cumulative distribution function. 

    Parameters
    ----------
    policy: list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.

    Returns
    -------
    list(float)
        Returns an array of equal length as the policy with each number representing the cumulative probability of 
        choosing and index equal to or less than the given index (i.e. slot machine) if using the given policy.
        
    """
    total = sum(policy)
    result = []
    cumsum = 0
    for w in policy:
        cumsum += w
        result.append(cumsum / total)
    return result

def choice(policy):
    """Choose an index (i.e. slot machine) according to the policy.

    Parameters
    ----------
    policy: list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.

    Returns
    -------
    Integer
        the "random"-integer/slot-machine chosen by following the given policy.
        
    """
    cdf_vals = cdf(policy)
    x = random.random()
    idx = bisect.bisect(cdf_vals, x)
    return idx

In [1583]:
def play_test(player_id, game_id, policy, iterations, penalty = 0.1, reward = 0.025, isTraining = True, easy=True):
    """Plays in the test-casino.

    Parameters
    ----------
    player_id: Integer
        The id of the player
        
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
        
    policy:list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.
    
    iterations: Integer
        Number of times the game is played.
        
    penalty: float
        A number that changes the policy by subtracting from the particular policy index (i.e. slot machine) played
        if for that iteration it led to a loss (does nothing if isTraining is False)
    
    reward: float
        A number that changes the policy by adding to the particular policy-index (i.e. slot machine) played 
        if for that iteration it led to a win (does nothing if isTraining is False)
    
    isTraining: Boolean
        If true the function changes the policy according to the training. IF false does not change the policy.
    
    easy: Boolean
        Is only used for game_id = 0, if True it will be easier to differentiate between the slot machines than if False
        

    Returns
    -------
    list(float):
        The policy, new policy if isTraining is True otherwise the same policy as supplied.
    pandas.DataFrame:
        With information on each play/iteration, if the algorithm won/lost that round and what slot it bet on
        ({"winnings":list(Integers), "slotid":list(Integer)})
        
    """
    data = []
    
    for i in range(iterations):
        slot_id = choice(policy)

        dct = {"slotid":slot_id}
        result = test_casino(game_id, slot_id, player_id,easy)

        if(result == "Error"):
            print("Handle")

        if(isTraining):
            if(result["winnings"] == 1):
                if(policy[slot_id] + reward <= 2):
                    policy[slot_id] += reward
            elif (policy[slot_id] - penalty > 0):
                policy[slot_id] -= penalty
            

        dct.update(result)
        data.append(dct)
        
    df = pd.DataFrame(data) 
    return policy, df

In [1584]:
def train(game_id, player_id, penalty = 0.1, reward = 0.025, easy=True, iterations = 100):
    """Trains a new policy in the test-casino.

    Parameters
    ----------
    player_id: Integer
        The id of the player
        
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
    
    iterations: Integer
        Number of times the game is played /trained on.
        
    penalty: float
        A number that changes the policy by subtracting from the particular policy index (i.e. slot machine) played
        if for that iteration it led to a loss (does nothing if isTraining is False)
    
    reward: float
        A number that changes the policy by adding to the particular policy-index (i.e. slot machine) played 
        if for that iteration it led to a win (does nothing if isTraining is False)
    
    easy: Boolean
        Is only used for game_id = 0, if True it will be easier to differentiate between the slot machines than if False
        

    Returns
    -------
    list(float):
        The policy, new policy if isTraining is True otherwise the same policy as supplied.
    pandas.DataFrame:
        With information on each play/iteration, if the algorithm won/lost that round and what slot it bet on
        ({"winnings":list(Integers), "slotid":list(Integer)})
        
    """
    policy = []
    if game_id == 0:
        policy = np.ones(2)
    elif game_id == 1 or game_id == 2:
        policy = np.ones(3)
    elif game_id == 3:
        policy = np.ones(4)
    elif game_id == 4:
        policy = np.ones(6)
    return play(
        player_id, 
        game_id, 
        policy, 
        iterations, 
        penalty=penalty,
        reward=reward,
        training=True,
    easy=easy)

In [1585]:
def test_casino(game_id, slot_id, player_id, easy = True):
    """A test casino trying to mirror the casino in the project.

    Parameters
    ----------
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
        
    slot_id: Integer
        The id of the slot to be bet on.
    
    player_id: Integer
        The id of the player
    
    easy: Boolean
        Is only used for game_id = 0, if True it will be easier to differentiate between the slot machines than if False
        

    Returns
    -------
    Dictionary:
       {"winnings": Integer (0 or 1), "slotwon": Integer}
        
    """
    slot_probs = []
    if game_id == 0:
        if easy:
            slot_probs = [0.3, 0.7]
        else:
            slot_probs = [0.45, 0.55]
    elif game_id == 1:
        slot_probs = [0.1, 0.1, 0.8]
    elif game_id == 2:
        slot_probs = [0.05, 0.4, 0.55]
    elif game_id == 3:
        slot_probs = [0.1, 0.2, 0.3, 0.4]
    elif game_id == 4:
        slot_probs = [0.01, 0.1, 0.29, 0.1, 0.2, 0.3]

    slot_won = choice(slot_probs)
    draw = random.random()
    winner = {"winnings":int(slot_id == slot_won), "slotwon":slot_won}
    return winner

In [1586]:
def reinf_hyperparameters(game_id = 0):
    """A Grid search to try and figure out which hyperparameters work the best for each slot-machine scenario.

    Parameters
    ----------
    game_id: Integer
        The id of the game to be played / tested (i.e. 0,1,2,3,4)
        
    Returns
    -------
    Dictionary:
       {"reward": list(float), "penalty": list(float), "iterations":list(Integer), "accuracy": list(float)}
        
    """
    player_id = 1
    iterations = 50
    
    if game_id == 0:
        best_slot_idx = 1
        grid = {
            "reward": [0.001, 0.002, 0.004, 0.008, 0.016],
            "penalty": [0.0005, 0.001, 0.002, 0.004, 0.008],
            "iterations": [100, 200]
        }
    elif game_id == 1 or game_id == 2:
        best_slot_idx = 2
        grid = {
            "reward": [0.001, 0.002, 0.004, 0.008, 0.016],
            "penalty": [0.0005, 0.001, 0.002, 0.004, 0.008],
            "iterations": [100, 200]
        }
    elif game_id == 3:
        best_slot_idx = 3
        grid = {
            "reward": [0.001, 0.002, 0.004, 0.008, 0.016],
            "penalty": [0.0001, 0.0005, 0.001, 0.002],
            "iterations": [100, 200]
        }
    elif game_id == 4:
        best_slot_idx = 5
        grid = {
            "reward": [0.001, 0.002, 0.004, 0.008, 0.016],
            "penalty": [0.0001, 0.0005, 0.001, 0.002],
            "iterations": [100, 200, 400]
        }
    
    results = {
        "reward":[],
        "penalty":[],
        "iterations":[],
        "accuracy":[],
    }
    for rew in grid["reward"]:
        for pen in grid["penalty"]:
            for it in grid["iterations"]:
                correct = 0
                for i in range(iterations):
                    new_policy, df = train(
                        game_id, 
                        player_id,
                        penalty = pen, 
                        reward = rew, 
                        easy = False,
                        iterations = it
                    )
                    idx = np.argmax(new_policy)
                    if(idx == best_slot_idx):
                        correct += 1

                accuracy = float(correct / iterations)
                results["reward"].append(rew)
                results["penalty"].append(pen)
                results["accuracy"].append(accuracy)
                results["iterations"].append(it)
    return results

In [1587]:
res_dict = reinf_hyperparameters(game_id = 0)
df = pd.DataFrame(res_dict)
print(df.loc[df["accuracy"].idxmax()])
print(df.loc[df["accuracy"] > 0.9])

reward          0.0020
penalty         0.0005
iterations    200.0000
accuracy        0.9400
Name: 11, dtype: float64
    reward  penalty  iterations  accuracy
3    0.001   0.0010         200      0.92
7    0.001   0.0040         200      0.92
11   0.002   0.0005         200      0.94
13   0.002   0.0010         200      0.94
27   0.004   0.0040         200      0.92
29   0.004   0.0080         200      0.94
31   0.008   0.0005         200      0.92
35   0.008   0.0020         200      0.94
37   0.008   0.0040         200      0.94
39   0.008   0.0080         200      0.94
43   0.016   0.0010         200      0.92


In [1588]:
res_dict = reinf_hyperparameters(game_id = 1)
df = pd.DataFrame(res_dict)
print(df.loc[df["accuracy"].idxmax()])
print(df.loc[df["accuracy"] > 0.9])

reward          0.0010
penalty         0.0005
iterations    100.0000
accuracy        1.0000
Name: 0, dtype: float64
    reward  penalty  iterations  accuracy
0    0.001   0.0005         100       1.0
1    0.001   0.0005         200       1.0
2    0.001   0.0010         100       1.0
3    0.001   0.0010         200       1.0
4    0.001   0.0020         100       1.0
5    0.001   0.0020         200       1.0
6    0.001   0.0040         100       1.0
7    0.001   0.0040         200       1.0
8    0.001   0.0080         100       1.0
9    0.001   0.0080         200       1.0
10   0.002   0.0005         100       1.0
11   0.002   0.0005         200       1.0
12   0.002   0.0010         100       1.0
13   0.002   0.0010         200       1.0
14   0.002   0.0020         100       1.0
15   0.002   0.0020         200       1.0
16   0.002   0.0040         100       1.0
17   0.002   0.0040         200       1.0
18   0.002   0.0080         100       1.0
19   0.002   0.0080         200       1.0
20

In [1589]:
res_dict = reinf_hyperparameters(game_id = 2)
df = pd.DataFrame(res_dict)
print(df.loc[df["accuracy"].idxmax()])
print(df.loc[df["accuracy"] > 0.9])

reward          0.002
penalty         0.001
iterations    200.000
accuracy        1.000
Name: 13, dtype: float64
    reward  penalty  iterations  accuracy
1    0.001   0.0005         200      0.98
3    0.001   0.0010         200      0.96
5    0.001   0.0020         200      0.92
7    0.001   0.0040         200      0.92
9    0.001   0.0080         200      0.94
11   0.002   0.0005         200      0.92
13   0.002   0.0010         200      1.00
15   0.002   0.0020         200      1.00
17   0.002   0.0040         200      0.98
19   0.002   0.0080         200      0.94
21   0.004   0.0005         200      0.96
23   0.004   0.0010         200      0.92
25   0.004   0.0020         200      0.98
27   0.004   0.0040         200      0.98
29   0.004   0.0080         200      0.98
31   0.008   0.0005         200      0.92
35   0.008   0.0020         200      0.98
37   0.008   0.0040         200      0.96
39   0.008   0.0080         200      0.96
40   0.016   0.0005         100      0.92
43   

In [1590]:
res_dict = reinf_hyperparameters(game_id = 3)
df = pd.DataFrame(res_dict)
print(df.loc[df["accuracy"].idxmax()])
print(df.loc[df["accuracy"] > 0.9])

reward          0.0160
penalty         0.0001
iterations    200.0000
accuracy        0.9000
Name: 33, dtype: float64
Empty DataFrame
Columns: [reward, penalty, iterations, accuracy]
Index: []


In [1591]:
res_dict = reinf_hyperparameters(game_id = 4)
df = pd.DataFrame(res_dict)
print(df.loc[df["accuracy"].idxmax()])
print(df.loc[df["accuracy"] > 0.9])

reward          0.001
penalty         0.002
iterations    400.000
accuracy        0.700
Name: 11, dtype: float64
Empty DataFrame
Columns: [reward, penalty, iterations, accuracy]
Index: []


## Play !!!

In [1592]:
def save_to_file(game_id,df):
    """Saves the Dataframe supplied to the end of the file ./data/real/real_game_{game_id}.csv.

    Parameters
    ----------
    game_id: Integer
        The id of the game to be played / tested (i.e. 0,1,2,3,4)
    
    df: pandas.DataFrame
        the dataframe to save to the end of the file.
        
    Returns
    -------
    None
        
    """
    new_query = False
    
    try:
        old_df = pd.read_csv(f"./data/real/real_game_{game_id}.csv", index_col=0)
        print("Old Query :)")
    except FileNotFoundError:
        new_query = True
        print("New Query boys!")
    
    if(new_query):
        df.to_csv(f"./data/real/real_game_{game_id}.csv")
    else:
        new_df = old_df.append(df,ignore_index=True,sort=True)
        new_df.to_csv(f"./data/real/real_game_{game_id}.csv")

In [1633]:
def play_game(policy, reward = 0.04, penalty=0.02 ,game_id = 0, iterations = 10, isTraining = True,verbose=True):
    """ Place game at game_id according to the policy and saves the history to a csv file. 

    Parameters
    ----------
    policy:list(float) 
        List of non-negative numbers where the highest number indicates the "best" slot machine.
        0 indicates never choose the slot machine represented by that index.
    
    reward: float
        A number that changes the policy by adding to the particular policy-index (i.e. slot machine) played 
        if for that iteration it led to a win (does nothing if isTraining is False)
        
    penalty: float
        A number that changes the policy by subtracting from the particular policy index (i.e. slot machine) played
        if for that iteration it led to a loss (does nothing if isTraining is False)
    
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
        
    iterations: Integer
        Number of times the game is played.
    
    isTraining: Boolean
        If true the function changes the policy according to the training. IF false does not change the policy.
        
    Returns
    -------
    list(float)
        The ending policy.
        
    """
    player_id = "e196"
    
    data = {
        "winnings": [],
        "slotid": []
    }
    
    for it in tqdm(range(iterations)):
        try:
            slot_id = choice(policy)
            PARAMS = {'gameid':game_id,'slotid':slot_id,'playerid':player_id}
            r = requests.get('http://188.166.85.114:80/play', params = PARAMS)
            r.raise_for_status()
            if(r.content == b'You cannot play more turns.'):
                break
            result = json.loads(r.content.decode("utf-8"))
            data["winnings"].append(int(result["winnings"]))
            data["slotid"].append(slot_id)
            
            if(isTraining):
                if(result["winnings"] == 1):
                    if(policy[slot_id] + reward <= 2):
                        policy[slot_id] += reward
                elif (policy[slot_id] - penalty > 0):
                    policy[slot_id] -= penalty
            
        except requests.exceptions.RequestException as e:  # This is the correct syntax
            #print("Error", e)
            break
        except JSONDecodeError as e:
            break
    if(len(data["winnings"]) == 0):
        return False
    df = pd.DataFrame(data)
    save_to_file(game_id, df, name="real_game")
    return policy

## Play Game 0. 2 Slots

In [1634]:
def get_old_policy(n=2,game_id=0, reward=0.04, penalty=0.02, verbose=True):
    """ Calculates the policy one would get given the reward, penalty and the history of games in the file 
    corresponding to the game_id given.

    Parameters
    ----------
    n: Integer
        Number of slot machines in this game_id
    
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
        
    reward: float
        A number that changes the policy by adding to the particular policy-index (i.e. slot machine) played 
        if for that iteration it led to a win
        
    penalty: float
        A number that changes the policy by subtracting from the particular policy index (i.e. slot machine) played
        if for that iteration it led to a loss
    
    Returns
    -------
    list(float)
        The calculated policy.
        
    """
    policy = np.ones(n)
    try:
        df = pd.read_csv(f"./data/real/real_game_{game_id}.csv", index_col=0)
        if(verbose):
            print("Old Query :)")
        for index, row in df.iterrows():
            if(row["winnings"] == 1):
                if(policy[int(row['slotid'])] + reward <= 2):
                    policy[int(row['slotid'])] += reward
            elif (policy[int(row['slotid'])] - penalty > 0):
                policy[int(row['slotid'])] -= penalty
                
            
    except FileNotFoundError:
        print("No Data Found")
        return np.ones(n)
    return policy

In [1635]:
def get_stats(n, game_id=0):
    """ Calculates the stats from the history of place for this game_id. Only stats calculated are probabilities of
        given slot machine (winnings / number of times this machine was picked), counts number of wins overall and 
        number of plays

    Parameters
    ----------
    n: Integer
        Number of slot machines in this game_id
    
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)

    Returns
    -------
    list(float)
        Probabilities
    
    Integer
        Overall wins
    Integer
        Overall Plays
        
    """
    try:
        df = pd.read_csv(f"./data/real/real_game_{game_id}.csv", index_col=0)
        print("Old Query :)")
        counts = np.zeros(n)
        iters = np.zeros(n)
        for index, row in df.iterrows():
            counts[int(row['slotid'])] += row["winnings"]
            iters[int(row['slotid'])] += 1
        probabilities = counts / iters
        return probabilities, sum(counts), sum(iters)
                
            
    except FileNotFoundError:
        print("No Data Found")

In [1636]:
def play_game_with(n = 2, game_id = 0, train_iterations=10, plays = 100, reward=0.1, penalty=0.1, verbose = True):
    """ Place all 10.000 games accordin to the specified parameters.

    Parameters
    ----------
    n: Integer
        Number of slot machines in this game
    
    game_id: Integer
        The id of the game (i.e. 0,1,2,3,4)
    
    train_iterations: Integer
        Number of times to iterate the "training" before deciding which slot machine is best
    
    plays: Integer
        Number of plays per training iteration.
    
    reward: float
        A number that changes the policy by adding to the particular policy-index (i.e. slot machine) played 
        if for that iteration it led to a win (does nothing if isTraining is False)
        
    penalty: float
        A number that changes the policy by subtracting from the particular policy index (i.e. slot machine) played
        if for that iteration it led to a loss (does nothing if isTraining is False)
    Returns
    -------
    None
        
    """
    policy = np.ones(n)
    if(verbose):
        print(get_stats(n, game_id=game_id))
    for i in range(train_iterations):
        policy = get_old_policy(n=n, game_id=game_id, penalty=0.001, reward=0.001,verbose=verbose)
        super_threshold_indices = policy < np.max(policy)-0.01
        policy[super_threshold_indices] = 0
        if(verbose):
            print(policy)
        play_game(policy,
          reward = 0.001, penalty=0.001 ,game_id = game_id, 
          iterations = plays, isTraining = True, verbose=verbose)
    get_stats(n, game_id=game_id)
    if(verbose):
        print(get_stats(n, game_id=game_id))
    policy = get_old_policy(n=n, game_id=game_id, penalty=0.001, reward=0.001,verbose=verbose)
    super_threshold_indices = policy < np.max(policy)-0.01
    policy[super_threshold_indices] = 0
    print(policy)
    
    play_game(policy,
          reward = 0.001, penalty=0.001 ,game_id = 4, 
          iterations = 10000 - plays*iterations, isTraining = True, verbose=verbose)
    print(get_stats(n, game_id=game_id))

In [1637]:
play_game_with(n=2, game_id = 0, train_iterations = 6, plays = 75, reward = 0.04, penalty=0.02, verbose=False)

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))




HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))


Old Query :)
[1.013 0.   ]


HBox(children=(IntProgress(value=0, max=8125), HTML(value='')))

Old Query :)
(array([0.50066551, 0.13533835]), 4908.0, 9900.0)


## Play Game 1. 3 Slots

In [1638]:
play_game_with(n=3, game_id = 1, train_iterations = 3, plays = 100, reward = 0.0005, penalty=0.001)

Old Query :)
(array([0.304     , 0.46979866, 0.69504421]), 6868.0, 10000.0)
Old Query :)
[0. 0. 2.]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0. 0. 2.]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0. 0. 2.]


HBox(children=(IntProgress(value=0), HTML(value='')))



Old Query :)
Old Query :)
(array([0.304     , 0.46979866, 0.69504421]), 6868.0, 10000.0)
Old Query :)
[0. 0. 2.]


HBox(children=(IntProgress(value=0, max=7500), HTML(value='')))

Old Query :)
(array([0.304     , 0.46979866, 0.69504421]), 6868.0, 10000.0)


### GAmeID = 2

In [1639]:
play_game_with(n=3, game_id = 2, train_iterations = 5, plays = 100, reward = 0.0005, penalty=0.001)

Old Query :)
(array([0.26315789, 0.52857143, 0.60386914]), 6014.0, 10000.0)
Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0), HTML(value='')))



Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
Old Query :)
(array([0.26315789, 0.52857143, 0.60386914]), 6014.0, 10000.0)
Old Query :)
[0.    0.    1.997]


HBox(children=(IntProgress(value=0, max=7500), HTML(value='')))

Old Query :)
(array([0.26315789, 0.52857143, 0.60386914]), 6014.0, 10000.0)


### Game 3:

In [1640]:
play_game_with(n=4, game_id = 3, train_iterations = 10, plays = 100, reward = 0.001, penalty=0.002)

Old Query :)
(array([0.38613861, 0.61034263, 0.45535714, 0.57840617]), 6051.0, 10000.0)
Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]





HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))


Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
Old Query :)
(array([0.38613861, 0.61034263, 0.45535714, 0.57840617]), 6051.0, 10000.0)
Old Query :)
[0.    1.994 0.    0.   ]


HBox(children=(IntProgress(value=0, max=7500), HTML(value='')))

Old Query :)
(array([0.38613861, 0.61034263, 0.45535714, 0.57840617]), 6051.0, 10000.0)


### Game 4:

In [1641]:
play_game_with(n=6, game_id = 4, train_iterations = 10, plays = 100, reward = 0.001, penalty=0.001)

Old Query :)
(array([0.5483871 , 0.60550459, 0.6039604 , 0.53      , 0.55172414,
       0.64778404]), 6435.0, 10000.0)
Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]





HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))




Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0), HTML(value='')))

Old Query :)
Old Query :)
(array([0.5483871 , 0.60550459, 0.6039604 , 0.53      , 0.55172414,
       0.64778404]), 6435.0, 10000.0)
Old Query :)
[0.    0.    0.    0.    0.    1.999]


HBox(children=(IntProgress(value=0, max=7500), HTML(value='')))



Old Query :)
(array([0.5483871 , 0.60550459, 0.6039604 , 0.53      , 0.55172414,
       0.64778404]), 6435.0, 10000.0)


### Game 5:

In [1642]:
def final_shot():
    policy = np.ones(2) 
    for i in range(14):
        print(get_stats(2, game_id=5))
        if(i % 2 == 0):
            policy = np.ones(2)
            new_policy = play_game(policy,
              reward = 0.008, penalty=0.008 ,game_id = 5, 
              iterations = 50, isTraining = True)
            super_threshold_indices = new_policy < np.max(new_policy)
            policy[super_threshold_indices] = 0
            policy[super_threshold_indices == False] = 1
        else:
            print(policy)
            play_game(policy,
          reward = 0.008, penalty=0.008 ,game_id = 5, 
          iterations = 950, isTraining = True)
            

In [1643]:
final_shot()

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]




HBox(children=(IntProgress(value=0, max=950), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))


Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))




Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Old Query :)
(array([0.54727579, 0.58526204]), 5123.0, 9100.0)
[1. 1.]


HBox(children=(IntProgress(value=0, max=950), HTML(value='')))