In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer

# Read Data

In [2]:
df = pd.read_csv('../data/bgg_data.csv', index_col=0)

print(df.shape)
df.head()

(18740, 4)


Unnamed: 0,name,description,boardgamecategorys,boardgamemechanics
174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Retrieval', 'Campaign / Battle Card D..."
161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma..."
167791,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...","['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma..."
224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,"['Economic', 'Industry / Manufacturing', 'Tran...","['Connections', 'Hand Management', 'Income', '..."
182028,Through the Ages: A New Story of Civilization,Through the Ages: A New Story of Civilization ...,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:..."


In [3]:
df['game_id'] = df.index
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,name,description,boardgamecategorys,boardgamemechanics,game_id
0,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Retrieval', 'Campaign / Battle Card D...",174430
1,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma...",161936
2,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...","['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",167791
3,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,"['Economic', 'Industry / Manufacturing', 'Tran...","['Connections', 'Hand Management', 'Income', '...",224517
4,Through the Ages: A New Story of Civilization,Through the Ages: A New Story of Civilization ...,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:...",182028


# Handle Dups

In [4]:
df.name.nunique()

18338

In [5]:
temp = df.name.value_counts()
temp[temp > 1]

Robin Hood          5
Saga                4
Gettysburg          4
Gangster            4
Cosmic Encounter    4
                   ..
Cat & Mouse         2
Xerxes              2
Labyrinth           2
Singapore           2
Antarctica          2
Name: name, Length: 315, dtype: int64

In [6]:
df[df.name == 'Cosmic Encounter']

Unnamed: 0,name,description,boardgamecategorys,boardgamemechanics,game_id
129,Cosmic Encounter,Build a galactic empire... In the depths of sp...,"['Bluffing', 'Negotiation', 'Science Fiction',...","['Alliances', 'Auction/Bidding', 'Hand Managem...",39463
1016,Cosmic Encounter,"By request of Fantasy Flight Games, Board Game...","['Bluffing', 'Negotiation', 'Science Fiction']","['Hand Management', 'Negotiation', 'Variable P...",15
1724,Cosmic Encounter,Players represent alien races that are seeking...,"['Bluffing', 'Card Game', 'Science Fiction']",['Variable Player Powers'],40529
2783,Cosmic Encounter,"In Cosmic Encounter, you play the leader of a ...","['Bluffing', 'Card Game', 'Science Fiction']",['Variable Player Powers'],40531


We will keep the duplicates as long as the decription differs

In [7]:
temp = df.groupby(['name', 'description']).name.count()
temp[temp > 1]

Series([], Name: name, dtype: int64)

# Format Data

## Dummy Encode cats & mechs

### Cats

In [8]:
mlb = MultiLabelBinarizer()
temp_cats = mlb.fit_transform(df.boardgamecategorys.apply(lambda x: (
    x[1:-1]
    .replace("'", '')
    .replace('"', '')
    .replace(', ', ',')
    .replace(' / ', '/')
    .replace(' ', '_')
    .split(',')
)))
temp_cats = pd.DataFrame(temp_cats, columns=mlb.classes_).add_prefix('CAT:')

print(temp_cats.shape)
temp_cats.head()

(18740, 84)


Unnamed: 0,CAT:,CAT:Abstract_Strategy,CAT:Action/Dexterity,CAT:Adventure,CAT:Age_of_Reason,CAT:American_Civil_War,CAT:American_Indian_Wars,CAT:American_Revolutionary_War,CAT:American_West,CAT:Ancient,...,CAT:Transportation,CAT:Travel,CAT:Trivia,CAT:Video_Game_Theme,CAT:Vietnam_War,CAT:Wargame,CAT:Word_Game,CAT:World_War_I,CAT:World_War_II,CAT:Zombies
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
cat_features = list(temp_cats.columns)
display(cat_features[:5])
print(len(cat_features))
temp_cats.sum().sort_values(ascending=False)

['CAT:',
 'CAT:Abstract_Strategy',
 'CAT:Action/Dexterity',
 'CAT:Adventure',
 'CAT:Age_of_Reason']

84


CAT:Card_Game                  5439
CAT:Wargame                    3402
CAT:Fantasy                    2268
CAT:Party_Game                 1668
CAT:Dice                       1612
                               ... 
CAT:Vietnam_War                  51
CAT:American_Indian_Wars         35
CAT:Game_System                  26
CAT:Korean_War                   20
CAT:Expansion_for_Base-game      20
Length: 84, dtype: int64

In [10]:
top_10_cats = list(temp_cats.sum().sort_values(ascending=False).index[:10])

top_10_cats

['CAT:Card_Game',
 'CAT:Wargame',
 'CAT:Fantasy',
 'CAT:Party_Game',
 'CAT:Dice',
 'CAT:Fighting',
 'CAT:Science_Fiction',
 'CAT:Childrens_Game',
 'CAT:Abstract_Strategy',
 'CAT:Economic']

In [11]:
((temp_cats.sum() / len(temp_cats)) * 100).sort_values(ascending=False).head(50)

CAT:Card_Game                 29.023479
CAT:Wargame                   18.153682
CAT:Fantasy                   12.102455
CAT:Party_Game                 8.900747
CAT:Dice                       8.601921
CAT:Fighting                   7.774813
CAT:Science_Fiction            7.716115
CAT:Childrens_Game             7.475987
CAT:Abstract_Strategy          7.203842
CAT:Economic                   7.043757
CAT:World_War_II               5.928495
CAT:Animals                    5.891142
CAT:Bluffing                   5.827108
CAT:Humor                      5.538954
CAT:Adventure                  5.213447
CAT:Action/Dexterity           5.122732
CAT:Deduction                  5.106724
CAT:Miniatures                 4.845251
CAT:Movies/TV/Radio_theme      4.823906
CAT:Medieval                   4.626467
CAT:Exploration                3.996798
CAT:Ancient                    3.505870
CAT:Real-time                  3.415155
CAT:Racing                     3.281750
CAT:Negotiation                3.169691


In [12]:
df_mod = pd.concat([df, temp_cats], axis=1)
df_mod.head()

Unnamed: 0,name,description,boardgamecategorys,boardgamemechanics,game_id,CAT:,CAT:Abstract_Strategy,CAT:Action/Dexterity,CAT:Adventure,CAT:Age_of_Reason,...,CAT:Transportation,CAT:Travel,CAT:Trivia,CAT:Video_Game_Theme,CAT:Vietnam_War,CAT:Wargame,CAT:Word_Game,CAT:World_War_I,CAT:World_War_II,CAT:Zombies
0,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Retrieval', 'Campaign / Battle Card D...",174430,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma...",161936,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...","['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",167791,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,"['Economic', 'Industry / Manufacturing', 'Tran...","['Connections', 'Hand Management', 'Income', '...",224517,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Through the Ages: A New Story of Civilization,Through the Ages: A New Story of Civilization ...,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:...",182028,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Mechs

In [13]:
mlb = MultiLabelBinarizer()
temp_mechs = mlb.fit_transform(df.boardgamemechanics.apply(lambda x: (
    x[1:-1]
    .replace("'", '')
    .replace('"', '')
    .replace(', ', ',')
    .replace(' / ', '/')
    .replace(' ', '_')
    .split(',')
)))
temp_mechs = pd.DataFrame(temp_mechs, columns=mlb.classes_).add_prefix('mech:')

print(temp_mechs.shape)
temp_mechs.head()

(18740, 177)


Unnamed: 0,mech:,mech:Acting,mech:Action/Event,mech:Action_Drafting,mech:Action_Points,mech:Action_Queue,mech:Action_Retrieval,mech:Action_Timer,mech:Advantage_Token,mech:Alliances,...,mech:Variable_Phase_Order,mech:Variable_Player_Powers,mech:Variable_Setup,mech:Victory_Points_as_a_Resource,mech:Voting,mech:Worker_Placement,mech:Worker_Placement_with_Dice_Workers,mech:You_Choose,mech:Zone_of_Control,mech:and_Pool_Building
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
mech_features = list(temp_mechs.columns)
display(mech_features[:5])
print(len(mech_features))
temp_mechs.sum().sort_values(ascending=False)

['mech:',
 'mech:Acting',
 'mech:Action/Event',
 'mech:Action_Drafting',
 'mech:Action_Points']

177


mech:Dice_Rolling                4554
mech:Hand_Management             3799
mech:Set_Collection              2488
mech:Variable_Player_Powers      2278
mech:Hexagon_Grid                2060
                                 ... 
mech:Relative_Movement              2
mech:Auction:_Fixed_Placement       2
mech:Pattern_Movement               2
mech:Auction:_English               1
mech:Passed_Action_Token            1
Length: 177, dtype: int64

In [15]:
((temp_mechs.sum() / len(temp_mechs)) * 100).sort_values(ascending=False).head(20)

mech:Dice_Rolling                     24.300961
mech:Hand_Management                  20.272145
mech:Set_Collection                   13.276414
mech:Variable_Player_Powers           12.155816
mech:Hexagon_Grid                     10.992529
mech:                                  8.740662
mech:Card_Drafting                     8.409819
mech:Tile_Placement                    8.228388
mech:Modular_Board                     7.844184
mech:Area_Majority/Influence           7.321238
mech:Roll/Spin_and_Move                6.643543
mech:Simulation                        6.611526
mech:Cooperative_Game                  6.264674
mech:Simultaneous_Action_Selection     5.490928
mech:Auction/Bidding                   5.394877
mech:Action_Points                     5.256137
mech:Area_Movement                     5.186766
mech:Team-Based_Game                   4.471718
mech:Grid_Movement                     4.466382
mech:Take_That                         4.434365
dtype: float64

In [16]:
df_mod = pd.concat([df_mod, temp_mechs], axis=1)
df_mod.head()

Unnamed: 0,name,description,boardgamecategorys,boardgamemechanics,game_id,CAT:,CAT:Abstract_Strategy,CAT:Action/Dexterity,CAT:Adventure,CAT:Age_of_Reason,...,mech:Variable_Phase_Order,mech:Variable_Player_Powers,mech:Variable_Setup,mech:Victory_Points_as_a_Resource,mech:Voting,mech:Worker_Placement,mech:Worker_Placement_with_Dice_Workers,mech:You_Choose,mech:Zone_of_Control,mech:and_Pool_Building
0,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Retrieval', 'Campaign / Battle Card D...",174430,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma...",161936,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...","['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",167791,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,"['Economic', 'Industry / Manufacturing', 'Tran...","['Connections', 'Hand Management', 'Income', '...",224517,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Through the Ages: A New Story of Civilization,Through the Ages: A New Story of Civilization ...,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:...",182028,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_mod.description.values[0]

'Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story fueled by the decisions they make.<br/><br/>This is a game with a persistent and changing world that is ideally played over many game sessions. After a scenario, players will make decisions on what to do, which will determine how the story continues, kind of like a &ldquo;Choose Your Own Adventure&rdquo; book. Playing through a scenario is a cooperative affair where players will fight against automated monsters using an innovative card system to determine the order of play and wh

### Clean html text

In [18]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    try:
        s = MLStripper()
        s.feed(html)
        return s.get_data()
    except:
        print(html)

In [19]:
df_mod['description'] = df_mod.description.fillna('').apply(lambda x: strip_tags(x))
df_mod.description.values[0]

'Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the role of a wandering adventurer with their own special set of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story fueled by the decisions they make.This is a game with a persistent and changing world that is ideally played over many game sessions. After a scenario, players will make decisions on what to do, which will determine how the story continues, kind of like a “Choose Your Own Adventure” book. Playing through a scenario is a cooperative affair where players will fight against automated monsters using an innovative card system to determine the order of play and what a player does on th

## Save Data

In [20]:
df_mod.to_csv('../data/bgg_data_mod.csv')

# BERT Multi-label Classification

Ref: https://github.com/kaushaltrivedi/fast-bert

In [21]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [22]:
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [23]:
DATA_PATH=Path('../data/cats')
DATA_PATH.mkdir(exist_ok=True)
LABEL_PATH = Path('../data/cats/labels/')

PATH=Path('../data/cats/tmp')
PATH.mkdir(exist_ok=True)

MODEL_PATH=Path('../models/')
LOG_PATH=Path('../logs/')
MODEL_PATH.mkdir(exist_ok=True)

model_state_dict = None

# BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/cased_L-12_H-768_A-12/')
# BERT_PRETRAINED_PATH = Path('../../bert_models/pre_trained/uncased_L-12_H-768_A-12/')
# BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-24_H-1024_A-16/')
BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-2_H-128_A-2/')


# FINETUNED_PATH = Path('../models/finetuned_model.bin')
FINETUNED_PATH = None
# model_state_dict = torch.load(FINETUNED_PATH)

LOG_PATH.mkdir(exist_ok=True)

OUTPUT_PATH = MODEL_PATH/'output'
OUTPUT_PATH.mkdir(exist_ok=True)

In [24]:
torch.cuda.empty_cache()

In [25]:
args = Box({
    "run_text": "multilabel bgg comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "bgg_cat_classification_lib",
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 6,
    "eval_batch_size": 6,
    "learning_rate": 5e-5,
    "num_train_epochs": 5,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [26]:
import logging

run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

logger.info(args)

03/23/2020 22:15:06 - INFO - root -   {'run_text': 'multilabel bgg comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': WindowsPath('../logs'), 'full_data_dir': WindowsPath('../data/cats'), 'data_dir': WindowsPath('../data/cats'), 'task_name': 'intent', 'no_cuda': False, 'bert_model': WindowsPath('../../bert_fastai/pretrained-weights/uncased_L-2_H-128_A-2'), 'output_dir': WindowsPath('../models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 6, 'eval_batch_size': 6, 'learning_rate': 5e-05, 'num_train_epochs': 5, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_na

In [27]:
device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

## Data Prep

In [28]:
top_10_cats_clean = [x[4:] for x in top_10_cats]
top_10_cats_clean

['Card_Game',
 'Wargame',
 'Fantasy',
 'Party_Game',
 'Dice',
 'Fighting',
 'Science_Fiction',
 'Childrens_Game',
 'Abstract_Strategy',
 'Economic']

In [29]:
cat_df_mod = df_mod[['game_id', 'description'] + top_10_cats].copy()

# keep only popular cats
cat_df_mod = cat_df_mod[cat_df_mod[top_10_cats].sum(axis=1) > 0]

# fix CAT names
cat_df_mod.columns = ['game_id', 'description'] + top_10_cats_clean

print(cat_df_mod.shape)
cat_df_mod.head()

(15371, 12)


Unnamed: 0,game_id,description,Card_Game,Wargame,Fantasy,Party_Game,Dice,Fighting,Science_Fiction,Childrens_Game,Abstract_Strategy,Economic
0,174430,Gloomhaven is a game of Euro-inspired tactica...,0,0,1,0,0,1,0,0,0,0
2,167791,"In the 2400s, mankind begins to terraform the ...",0,0,0,0,0,0,1,0,0,1
3,224517,Brass: Birmingham is an economic strategy game...,0,0,0,0,0,0,0,0,0,1
4,182028,Through the Ages: A New Story of Civilization ...,1,0,0,0,0,0,0,0,0,1
5,233078,Twilight Imperium (Fourth Edition) is a game o...,0,1,0,0,0,0,1,0,0,1


In [30]:
cat_df_mod[top_10_cats_clean].sum()

Card_Game            5439
Wargame              3402
Fantasy              2268
Party_Game           1668
Dice                 1612
Fighting             1457
Science_Fiction      1446
Childrens_Game       1401
Abstract_Strategy    1350
Economic             1320
dtype: int64

In [31]:
cat_df_mod['rand'] = np.random.uniform(0, 1, cat_df_mod.shape[0])

train_df = cat_df_mod[cat_df_mod['rand'] < .7]
valid_df = cat_df_mod[((cat_df_mod['rand'] > .7) & (cat_df_mod['rand'] < .9))]
test_df = cat_df_mod[cat_df_mod['rand'] > .9]

print(len(train_df), len(valid_df), len(test_df), len(train_df)+ len(valid_df)+ len(test_df))

10761 3072 1538 15371


In [32]:
train_df.to_csv('../data/cats/train.csv')
valid_df.to_csv('../data/cats/valid.csv')
test_df.to_csv('../data/cats/test.csv')

In [33]:
train_df['description'] = train_df.description.apply(lambda x: strip_tags(x))
valid_df['description'] = valid_df.description.apply(lambda x: strip_tags(x))
test_df['description'] = test_df.description.apply(lambda x: strip_tags(x))

train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,game_id,description,Card_Game,Wargame,Fantasy,Party_Game,Dice,Fighting,Science_Fiction,Childrens_Game,Abstract_Strategy,Economic,rand
0,174430,Gloomhaven is a game of Euro-inspired tactica...,0,0,1,0,0,1,0,0,0,0,0.319428
2,167791,"In the 2400s, mankind begins to terraform the ...",0,0,0,0,0,0,1,0,0,1,0.074314
6,187645,From the publisher:Star Wars: Rebellion is a b...,0,1,0,0,0,1,1,0,0,0,0.599277
7,12333,"""Now the trumpet summons us again, not as a ca...",0,1,0,0,0,0,0,0,0,0,0.696492
10,169786,It is a time of unrest in 1920s Europa. The as...,0,0,0,0,0,1,1,0,0,1,0.417969


### Create Label File

In [34]:
import csv

with open('../data/cats/labels/labels.csv', 'w', newline="") as myfile:
    wr = csv.writer(myfile)
    for x in top_10_cats:
        wr.writerow([x[4:]])

## Mdl Run

The databunch object takes training, validation and test csv files and converts the data into internal representation for BERT, RoBERTa, DistilBERT or XLNet. The object also instantiates the correct data-loaders based on device profile and batch_size and max_sequence_length.

In [35]:
databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train.csv', val_file='valid.csv',
                          test_data='test.csv',
                          text_col="description", label_col=top_10_cats_clean,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

03/23/2020 22:15:07 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at C:\Users\mwald\.cache\torch\transformers\dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
03/23/2020 22:15:07 - INFO - root -   Loading features from cached file ..\data\cats\cache\cached_xlnet_train_multi_label_512_train.csv
03/23/2020 22:15:08 - INFO - root -   Loading features from cached file ..\data\cats\cache\cached_xlnet_dev_multi_label_512_valid.csv
03/23/2020 22:15:09 - INFO - root -   Loading features from cached file ..\data\cats\cache\cached_xlnet_test_multi_label_512_test


In [36]:
databunch.train_dl.dataset[0][3]

tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 1.])

In [37]:
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

In [38]:
learner = BertLearner.from_pretrained_model(databunch, args.model_name, metrics=metrics, 
                                            device=device, logger=logger, output_dir=args.output_dir, 
                                            finetuned_wgts_path=FINETUNED_PATH, warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, is_fp16=args.fp16, 
                                            multi_label=True, logging_steps=0)

03/23/2020 22:15:09 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at C:\Users\mwald\.cache\torch\transformers\c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.8df552e150a401a37ae808caf2a2c86fb6fedaa1f6963d1f21fbf3d0085c9e74
03/23/2020 22:15:09 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": null,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "do_sample": false,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_ids": null,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "ma

xlnet-base-cased
<class 'str'>


03/23/2020 22:15:09 - INFO - transformers.modeling_utils -   loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin from cache at C:\Users\mwald\.cache\torch\transformers\24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac
03/23/2020 22:15:11 - INFO - transformers.modeling_utils -   Weights of XLNetForMultiLabelSequenceClassification not initialized from pretrained model: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
03/23/2020 22:15:11 - INFO - transformers.modeling_utils -   Weights from pretrained model not used in XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']


In [39]:
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

03/23/2020 22:15:12 - INFO - root -   ***** Running training *****
03/23/2020 22:15:12 - INFO - root -     Num examples = 10753
03/23/2020 22:15:12 - INFO - root -     Num Epochs = 5
03/23/2020 22:15:12 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 6
03/23/2020 22:15:12 - INFO - root -     Gradient Accumulation steps = 1
03/23/2020 22:15:12 - INFO - root -     Total optimization steps = 8965


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic




Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0


03/23/2020 22:50:55 - INFO - root -   Running evaluation
03/23/2020 22:50:55 - INFO - root -     Num examples = 10753
03/23/2020 22:50:55 - INFO - root -     Batch size = 12


03/23/2020 23:06:16 - INFO - root -   eval_loss after epoch 1: 0.20959679483370638: 
03/23/2020 23:06:16 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.9235469698905945: 
03/23/2020 23:06:16 - INFO - root -   eval_roc_auc after epoch 1: 0.9362991626458933: 
03/23/2020 23:06:16 - INFO - root -   eval_fbeta after epoch 1: 0.7266228795051575: 
03/23/2020 23:06:16 - INFO - root -   lr after epoch 1: 4.7176401272713025e-05
03/23/2020 23:06:16 - INFO - root -   train_loss after epoch 1: 0.34617121147830193
03/23/2020 23:06:16 - INFO - root -   

03/23/2020 23:42:10 - INFO - root -   Running evaluation
03/23/2020 23:42:10 - INFO - root -     Num examples = 10753
03/23/2020 23:42:10 - INFO - root -     Batch size = 12


03/23/2020 23:57:32 - INFO - root -   eval_loss after epoch 2: 0.1680510353292842: 
03/23/2020 23:57:32 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.9393565058708191: 
03/23/2020 23:57:32 - INFO - root -   eval_roc_auc after epoch 2: 0.9564444258381268: 
03/23/2020 23:57:32 - INFO - root -   eval_fbeta after epoch 2: 0.7954102754592896: 
03/23/2020 23:57:32 - INFO - root -   lr after epoch 2: 3.5319348342881726e-05
03/23/2020 23:57:32 - INFO - root -   train_loss after epoch 2: 0.2002632713025333
03/23/2020 23:57:32 - INFO - root -   

03/24/2020 00:32:59 - INFO - root -   Running evaluation
03/24/2020 00:32:59 - INFO - root -     Num examples = 10753
03/24/2020 00:32:59 - INFO - root -     Batch size = 12


03/24/2020 00:48:20 - INFO - root -   eval_loss after epoch 3: 0.1542115117752366: 
03/24/2020 00:48:20 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.9434576630592346: 
03/24/2020 00:48:20 - INFO - root -   eval_roc_auc after epoch 3: 0.9631579523844859: 
03/24/2020 00:48:20 - INFO - root -   eval_fbeta after epoch 3: 0.8127401471138: 
03/24/2020 00:48:20 - INFO - root -   lr after epoch 3: 1.905904182045629e-05
03/24/2020 00:48:20 - INFO - root -   train_loss after epoch 3: 0.1731901742763115
03/24/2020 00:48:20 - INFO - root -   



Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


03/24/2020 01:23:49 - INFO - root -   Running evaluation
03/24/2020 01:23:49 - INFO - root -     Num examples = 10753
03/24/2020 01:23:49 - INFO - root -     Batch size = 12


03/24/2020 01:39:10 - INFO - root -   eval_loss after epoch 4: 0.1486950461257461: 
03/24/2020 01:39:10 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.9458290934562683: 
03/24/2020 01:39:10 - INFO - root -   eval_roc_auc after epoch 4: 0.9656132106478386: 
03/24/2020 01:39:10 - INFO - root -   eval_fbeta after epoch 4: 0.8236656785011292: 
03/24/2020 01:39:10 - INFO - root -   lr after epoch 4: 5.333735045914379e-06
03/24/2020 01:39:10 - INFO - root -   train_loss after epoch 4: 0.16239377306083427
03/24/2020 01:39:10 - INFO - root -   

03/24/2020 02:14:34 - INFO - root -   Running evaluation
03/24/2020 02:14:34 - INFO - root -     Num examples = 10753
03/24/2020 02:14:34 - INFO - root -     Batch size = 12


03/24/2020 02:29:55 - INFO - root -   eval_loss after epoch 5: 0.14776265039757608: 
03/24/2020 02:29:55 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.9461266994476318: 
03/24/2020 02:29:55 - INFO - root -   eval_roc_auc after epoch 5: 0.9661402275749296: 
03/24/2020 02:29:55 - INFO - root -   eval_fbeta after epoch 5: 0.8254989981651306: 
03/24/2020 02:29:55 - INFO - root -   lr after epoch 5: 0.0
03/24/2020 02:29:55 - INFO - root -   train_loss after epoch 5: 0.15801390515597785
03/24/2020 02:29:55 - INFO - root -   



(8965, 0.20800646705479178)

In [40]:
learner.validate()

03/24/2020 08:58:56 - INFO - root -   Running evaluation
03/24/2020 08:58:56 - INFO - root -     Num examples = 10753
03/24/2020 08:58:56 - INFO - root -     Batch size = 12


{'loss': 0.14776265039757608,
 'accuracy_thresh': 0.9461266994476318,
 'roc_auc': 0.9661402275749296,
 'fbeta': 0.8254989981651306}

In [41]:
learner.save_model()

03/24/2020 09:14:09 - INFO - transformers.configuration_utils -   Configuration saved in ..\models\output\model_out\config.json
03/24/2020 09:14:10 - INFO - transformers.modeling_utils -   Model weights saved in ..\models\output\model_out\pytorch_model.bin


In [43]:
learner.predict_batch(list(pd.read_csv('../data/cats/test.csv')['description'].values[:3]))

03/24/2020 09:14:31 - INFO - root -   Writing example 0 of 3


[[('Economic', 0.95654296875),
  ('Abstract_Strategy', 0.07476806640625),
  ('Dice', 0.0411376953125),
  ('Card_Game', 0.039794921875),
  ('Fantasy', 0.030731201171875),
  ('Wargame', 0.0282135009765625),
  ('Science_Fiction', 0.0235137939453125),
  ('Childrens_Game', 0.01812744140625),
  ('Party_Game', 0.017913818359375),
  ('Fighting', 0.0170440673828125)],
 [('Science_Fiction', 0.953125),
  ('Economic', 0.270751953125),
  ('Wargame', 0.2100830078125),
  ('Fighting', 0.07421875),
  ('Dice', 0.06756591796875),
  ('Card_Game', 0.062103271484375),
  ('Abstract_Strategy', 0.047607421875),
  ('Fantasy', 0.0216217041015625),
  ('Childrens_Game', 0.011871337890625),
  ('Party_Game', 0.00981903076171875)],
 [('Economic', 0.9521484375),
  ('Card_Game', 0.06854248046875),
  ('Abstract_Strategy', 0.047088623046875),
  ('Dice', 0.0391845703125),
  ('Science_Fiction', 0.034423828125),
  ('Fantasy', 0.0264129638671875),
  ('Wargame', 0.0235595703125),
  ('Childrens_Game', 0.0193023681640625),
  ('

In [44]:
pd.read_csv('../data/cats/test.csv', index_col=0).head(3).T

Unnamed: 0,3,5,22
game_id,224517,233078,3076
description,Brass: Birmingham is an economic strategy game...,Twilight Imperium (Fourth Edition) is a game o...,In Puerto Rico players assume the roles of col...
Card_Game,0,0,0
Wargame,0,1,0
Fantasy,0,0,0
Party_Game,0,0,0
Dice,0,0,0
Fighting,0,0,0
Science_Fiction,0,1,0
Childrens_Game,0,0,0


In [None]:
pd.read_csv('../data/cats/test.csv', index_col=0).description.values[2]