In [6]:
import pandas as pd
# Read the unit table, using the CSV's first column as the row index.
df = pd.read_csv(
    "../wargame-data/data/510064564/final_data.csv",
    index_col=0,
)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,AirplaneMinimalAltitude,Amphibious,ArmorFront,ArmorFrontSplashResistant,ArmorRear,ArmorRearSplashResistant,ArmorSides,ArmorSidesSplashResistant,ArmorTop,ArmorTopSplashResistant,...,Weapon9RangeShipMinimum,Weapon9RayonPinned,Weapon9ShotsPerSalvo,Weapon9SupplyCost,Weapon9Tags,Weapon9TimeBetweenSalvos,Weapon9TimeBetweenShots,Weapon9Type,Transporters,Decks
0,,False,2,False,1,False,1,False,1,False,...,,,,,,,,,18064.0,Motorized|Support
1,,False,1,False,1,False,1,False,1,False,...,,,,,,,,,,Mechanized|Armored|Support
2,,False,2,False,1,False,1,False,1,False,...,,,,,,,,,,Mechanized|Armored|Support
3,,False,2,False,1,False,1,False,1,False,...,,,,,,,,,,Mechanized|Armored|Support
4,,True,2,False,1,False,1,False,1,False,...,,,,,,,,,,Mechanized|Motorized|Armored


The base `DataFrame` contains a lot of information, much of which is specific to a particular unit class. It makes the most sense to examine the attributes and sub-attributes of specific unit classes one at a time. Tanks are the simplest unit conceptually, so we'll study them first.

Selecting the attributes of the unit card that are relevant to the tank type is easy. Weapons are a bit harder. Obviously the weapon is just as important as the armor when it comes to deciding how to price a tank. But there are a lot of them, all having different types. For example, napalm bombs are very different from cluster bombs, which are very different from a tank cannon. And so on.

Instead of trying to build this complexity into our model by modeling things like weapon category, range, ammo count, and so on, we'll take a shortcut: just consider the weapons as static elements. A unit either has a weapon or it doesn't, and if it does have that weapon the "price" of that weapon gets tacked on to the price of the unit overall.

A difficulty with this approach is that weapons can appear in any position on a unit card, technically. For example, the same machine gun might be mounted in card position three on an ATGM tank, and in card position two on a non-ATGM tank. If we naively dummy encode weapon names, we will get duplicate columns. You can check where and when this occurs with the following code:

```python

pd.concat([pd.get_dummies(tanks[c]) for c in ['Weapon1Name', 'Weapon2Name', 'Weapon3Name']], 
           axis='columns', 
           verify_integrity=True
          )
```

So we have to do a little bit of work to treat those weapons differently (running a row-wise aggregator instead of a column-wise dummy ufunc). That, implemented as `merge_weapon_list`, follows below.

In [38]:
# Keep only the rows of `df` whose `Tab` column equals 'TNK'.
tanks = df.query("Tab == 'TNK'")

In [119]:
# Re-select the 'TNK' rows from `df`: `tanks` is reassigned at the end of this cell,
# so starting from `df` again keeps the cell safe to re-run.
tanks = df.query("Tab == 'TNK'")

def merge_weapon_list(subframe, weapons):
    """Build one 0/1 indicator column per weapon, regardless of which column holds it.

    Parameters
    ----------
    subframe : pandas.DataFrame
        Frame whose cells hold weapon names (one column per card position).
    weapons : iterable of str
        Weapon names to encode.

    Returns
    -------
    pandas.DataFrame
        Indexed like `subframe`, with one integer column per entry of `weapons`
        (in the given order). A cell is 1 when the weapon name appears in any
        column of that row and 0 otherwise; missing values never match.
    """
    weapons = list(weapons)
    if not weapons:
        # pd.concat refuses an empty list; return a frame with no columns instead.
        return pd.DataFrame(index=subframe.index)

    # Compare the whole frame against each name at once instead of calling a
    # Python lambda on every row.
    indicators = [
        subframe.eq(name).any(axis='columns').astype(int).rename(name)
        for name in weapons
    ]
    return pd.concat(indicators, axis='columns')

# The three weapon slots on a unit card, in card order.
weapon_name_cols = ['Weapon1Name', 'Weapon2Name', 'Weapon3Name']

# A weapon that occurs in more than one slot would produce duplicate dummy columns.
# Derive that set from the data (instead of hard-coding it) so it cannot drift:
# count, for every weapon name, how many slot columns it shows up in.
slots_per_weapon = pd.concat(
    [pd.Series(tanks[c].dropna().unique()) for c in weapon_name_cols]
).value_counts()
multiple_position_weapons = sorted(slots_per_weapon[slots_per_weapon > 1].index)

weapons = (
    # Per-slot dummy encoding covers every weapon that lives in a single slot...
    pd.concat([pd.get_dummies(tanks[c]) for c in weapon_name_cols], axis='columns')
    .pipe(lambda dummies: dummies.loc[:, [c for c in dummies if c not in multiple_position_weapons]])
    # ...and multi-slot weapons get a single merged indicator column each.
    .join(merge_weapon_list(tanks[weapon_name_cols], multiple_position_weapons))
    # Uniform integer 0/1 columns regardless of the dummy dtype pandas picks.
    .astype(int)
)
assert weapons.columns.is_unique, "every weapon must map to exactly one indicator column"

tanks_cols = [
    'Amphibious', 'ArmorFront', 'ArmorRear', 'ArmorSides', 'ArmorTop', 'Autonomy', 'MaxSpeed', 'Price', 'Year'
]
# Final layout: unit attributes, then weapon indicators, then the weapon count last.
tanks = tanks.loc[:, tanks_cols].join(weapons).assign(NumWeapons=weapons.sum(axis=1))

In [120]:
# Preview the first rows of the assembled feature table.
tanks.head()

Unnamed: 0,Amphibious,ArmorFront,ArmorRear,ArmorSides,ArmorTop,Autonomy,MaxSpeed,Price,Year,2A20,...,Strela-2M,DShK,HS.820,M2 Browning,MAG 7,Mk19,NF-1,NSVT,T54,NumWeapons
30,False,3,1,2,1,400.0,60.0,35.0,1975.0,0,...,0,0,0,0,1,0,0,0,0,2
37,False,4,1,3,1,900.0,85.0,40.0,1991.0,0,...,0,0,0,0,0,0,1,0,0,2
40,False,3,1,2,1,400.0,60.0,25.0,1974.0,0,...,0,0,0,0,0,0,1,0,0,3
41,False,6,2,3,1,600.0,60.0,30.0,1967.0,0,...,0,0,0,0,0,0,1,0,0,2
42,False,7,2,4,2,600.0,70.0,40.0,1982.0,0,...,0,0,0,0,0,0,1,0,0,3


In [122]:
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate kernel bandwidths: 20 log-spaced values from 10**-1 to 10**1.
params = {'bandwidth': np.logspace(-1, 1, 20)}

# Cross-validated grid search over the bandwidth, fitted on the tank feature table.
grid = GridSearchCV(KernelDensity(), params)
grid.fit(tanks)

# Keep the best estimator around for sampling in later cells.
model = grid.best_estimator_
print("best bandwidth: {0}".format(model.bandwidth))

best bandwidth: 1.8329807108324356


In [123]:
# Echo the selected estimator so its parameters show up in the cell output.
model

KernelDensity(algorithm='auto', atol=0, bandwidth=1.8329807108324356,
       breadth_first=True, kernel='gaussian', leaf_size=40,
       metric='euclidean', metric_params=None, rtol=0)

In [203]:
def draw_sample(model, n_nonweapon_vars, columns=None, max_weapons=3):
    """Draw one synthetic unit from a fitted density model, retrying until it is valid.

    The sampled vector is expected to be laid out as: `n_nonweapon_vars` unit
    attributes, then one score per weapon, then the weapon count in the last
    position. The attributes must include 'Amphibious', 'ArmorFront',
    'ArmorRear', 'ArmorSides', 'ArmorTop', 'Autonomy', 'MaxSpeed', 'Price'
    and 'Year', because those are cleaned up by name.

    Parameters
    ----------
    model : object
        Anything with a `sample(n)` method returning a 2-D array; only the
        first row is used.
    n_nonweapon_vars : int
        Number of leading columns that are unit attributes rather than weapons.
    columns : sequence of str, optional
        Labels for the sampled vector. Defaults to the notebook-level
        `tanks.columns`, which is what the model was fitted on.
    max_weapons : int, optional
        Largest weapon count accepted for a draw (default 3).

    Returns
    -------
    pandas.Series
        The cleaned attributes followed by 'Weapon1Name' .. 'WeaponKName',
        where K is the rounded weapon count and the K highest-scoring weapons
        are listed in descending score order.

    Raises
    ------
    ValueError
        If the column layout leaves no weapon columns between the attributes
        and the trailing count, since no draw could ever be accepted.
    """
    if columns is None:
        # Fall back to the training frame so existing two-argument calls keep working.
        columns = tanks.columns
    if len(columns) - 1 <= n_nonweapon_vars:
        raise ValueError("no weapon columns between the attributes and the weapon count")

    while True:
        draw = pd.Series(model.sample(1)[0], columns)

        # Continuous draws scatter around integer counts, so round to the nearest
        # one (truncating would bias the count downward) and reject invalid counts
        # before doing any further work.
        n_weapons = int(np.round(draw.iloc[-1]))
        if not 1 <= n_weapons <= max_weapons:
            continue

        # Weapon scores start right after the attributes and stop before the count.
        weapon_scores = draw.iloc[n_nonweapon_vars:-1]
        chosen = weapon_scores.sort_values(ascending=False).head(n_weapons).index

        result = pd.concat([
            draw.iloc[:n_nonweapon_vars],
            pd.Series({'Weapon{0}Name'.format(i + 1): w for i, w in enumerate(chosen)}),
        ])

        # Snap the continuous draw back onto the value grid used by unit cards.
        result['Amphibious'] = result['Amphibious'] > 0.5
        for c in ['ArmorFront', 'ArmorRear', 'ArmorSides', 'ArmorTop']:
            result[c] = int(np.round(result[c]))
        for c in ['Autonomy', 'MaxSpeed', 'Price']:
            result[c] = np.round(result[c], -1)
        result['Year'] = np.round(result['Year'], 0)

        return result

In [226]:
# The leading attribute columns are exactly `tanks_cols`, so derive the count
# from that list instead of hard-coding it.
draw_sample(model, len(tanks_cols))

Amphibious     False
ArmorFront         8
ArmorRear         -1
ArmorSides         5
ArmorTop           1
Autonomy         500
MaxSpeed          60
Price             40
Year            1973
Weapon1Name    XM-35
dtype: object

Hmm. Unfortunately there's no way to tell the kernel density estimator that we need between 1 and 3 weapons (inclusive), and that the weapon types have to be X, Y, Z (main cannon, perhaps a missile, and an MG), so an approach based on more specific and focused feature parameterization is the way to go.