<a href="https://www.kaggle.com/srgrace/bert-multiclass-txt-classification?scriptVersionId=89247627" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
%%bash
pip install pytorch-pretrained-bert

In [None]:
import csv
import pandas as pd
from pathlib import Path
import matplotlib.cm as cm
from fastai import *
from fastai.text import *
from fastai.callbacks import *
from fastai.metrics import *
import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

In [None]:
# using default hyperparameters

class Config(dict):
    """Dictionary of settings whose entries are also readable as attributes.

    Every keyword passed to the constructor, and every pair passed to
    ``set``, is stored both as a dict item and as an instance attribute.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each item onto the instance so `cfg.name` works like `cfg["name"]`.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store *val* under *key* as both a dict item and an attribute."""
        self[key] = val
        setattr(self, key, val)


# Hyperparameters shared by the notebook. Later cells read bert_model_name,
# bs and max_seq_len from this object (tokenizer, model and data bunch setup).
config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",  # passed to the from_pretrained calls
    max_lr=3e-5,
    epochs=4,  # NOTE(review): the training cell passes its own epoch count (3) - confirm which is intended
    use_fp16=True,  # NOTE(review): not referenced anywhere in the visible code
    bs=32,  # batch size handed to TextDataBunch.from_df
    discriminative=False,
    max_seq_len=256,  # token cap given to FastAiBertTokenizer
)

In [None]:
from pytorch_pretrained_bert import BertTokenizer

# Load the pretrained tokenizer for the BERT model named in the config.
bert_tok = BertTokenizer.from_pretrained(config.bert_model_name)

In [None]:
class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai.

    An instance is handed to fastai's ``Tokenizer`` as ``tok_func``; calling
    the instance returns the instance itself, and ``tokenizer`` performs the
    actual tokenisation.

    Args:
        tokenizer: The pretrained BERT tokenizer to delegate to.
        max_seq_len: Maximum number of tokens produced per text, counting the
            ``[CLS]`` and ``[SEP]`` markers.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        """Return this instance regardless of the arguments given."""
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenise *t*, wrap it in ``[CLS]``/``[SEP]`` and limit its length."""
        # Two slots are reserved for the [CLS]/[SEP] markers. Clamp at zero so
        # a max_seq_len below 2 keeps no word pieces instead of turning into a
        # negative slice bound (which would trim from the end, not cap length).
        room = max(self.max_seq_len - 2, 0)
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:room] + ["[SEP]"]

In [None]:
# fastai Tokenizer that delegates to the BERT wrapper, with empty pre/post rule lists.
fastai_tokenizer = Tokenizer(
    tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len),
    pre_rules=[],
    post_rules=[],
)


In [None]:
# Build the fastai vocabulary from the BERT tokenizer's vocab keys, in their stored order.
fastai_bert_vocab = Vocab(list(bert_tok.vocab))

In [None]:
# Load the labelled training CSV and preview its first rows.
train_df1 = pd.read_csv('../input/dataset/train_file.csv')
train_df1.head()

In [None]:
# Show the summary produced by DataFrame.info() for the raw training frame.
train_df1.info()

In [None]:
# Keep only the permit identifier, the input text and the target label.
# .copy() makes train_df an independent frame, so the column assignments in
# later cells do not operate on a slice of train_df1.
train_df = train_df1[['Application/Permit Number', 'Description', 'Category']].copy()
train_df.head()

In [None]:
# Map each category name to an integer label, numbered from 1 in the order listed.
category = {
    name: code
    for code, name in enumerate(
        (
            'SINGLE FAMILY / DUPLEX',
            'COMMERCIAL',
            'MULTIFAMILY',
            'INSTITUTIONAL',
            'INDUSTRIAL',
        ),
        start=1,
    )
}
# Replace the textual category by its integer label (raises KeyError on an unknown name).
train_df['Category'] = train_df['Category'].apply(lambda name: category[name])
train_df.head()

In [None]:
# Replace missing descriptions with a placeholder. The result is assigned back
# to the column: calling fillna(inplace=True) on the selected column is chained
# assignment and does not update train_df under pandas Copy-on-Write.
train_df['Description'] = train_df['Description'].fillna('default text')
# Number of descriptions still missing.
train_df['Description'].isna().sum()

In [None]:
# Load the unlabelled test CSV and preview its first rows.
test_df1 = pd.read_csv('../input/dataset/test_file.csv')
test_df1.head()

In [None]:
# Keep only the permit identifier and the input text.
# .copy() makes test_df an independent frame, so the column assignments in
# later cells do not operate on a slice of test_df1.
test_df = test_df1[['Application/Permit Number', 'Description']].copy()
test_df.head()

In [None]:
# Replace missing descriptions with a placeholder. The result is assigned back
# to the column: calling fillna(inplace=True) on the selected column is chained
# assignment and does not update test_df under pandas Copy-on-Write.
test_df['Description'] = test_df['Description'].fillna('default text')
# Remaining missing values per column.
test_df.isna().sum()

In [None]:
# Split the labelled data into training and validation frames.
# NOTE(review): no test_size, random_state or stratify argument is passed, so
# the split relies on scikit-learn's defaults - confirm that is acceptable.
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_df)

In [None]:
# Build the fastai data bunch from the train/validation frames, using the BERT
# tokenizer wrapper and BERT vocabulary instead of fastai's defaults.
databunch = TextDataBunch.from_df(".", train, val, 
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  include_bos=False,  # the tokenizer wrapper already adds [CLS]
                  include_eos=False,  # the tokenizer wrapper already adds [SEP]
                  text_cols="Description",
                  label_cols="Category",
                  bs=config.bs,
                  # NOTE(review): presumably pads at the end of each sequence with
                  # index 0; confirm that index 0 is the padding token in the BERT vocab.
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
             )

In [None]:
# Display a sample batch to eyeball the tokenised text and labels.
databunch.show_batch()

In [None]:
# Show the label classes held by the data bunch.
databunch.classes

In [None]:

from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
# Pretrained BERT sequence classifier with one output per category (five in total).
bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=5)

In [None]:
# Wrap the data bunch and the BERT model in a fastai Learner that reports accuracy.
# No loss_func is passed; the commented-out lines are a disabled
# BCEWithLogitsLoss alternative.
# loss_func = torch.nn.BCEWithLogitsLoss()
learner = Learner(
    databunch, bert_model,
    metrics=[accuracy],
#     loss_func = loss_func
)
# NOTE(review): ShowGraph presumably plots the losses while training - confirm.
learner.callbacks.append(ShowGraph(learner))

In [None]:
# Run the learning-rate finder, then plot what the recorder captured with
# suggestion=True enabled.
learner.lr_find()
learner.recorder.plot(suggestion=True)

In [None]:
# Train for 3 cycles/epochs with fit_one_cycle. The maximum learning rate is
# read from the shared config rather than repeating the literal value here.
learner.fit_one_cycle(3, max_lr=config.max_lr)

In [None]:
# Predict each test description one at a time. Element 2 of learner.predict's
# result is converted to an array; its arg-max position plus one is the
# 1-based integer label.
pred = [
    np.argmax(np.array(learner.predict(text)[2])) + 1
    for text in test_df['Description']
]
    

In [None]:
# Sanity check: the number of predictions should equal the number of test rows.
print(len(pred))
test_df.shape

In [None]:
# Store the predicted integer labels as a new Category column and preview.
test_df['Category'] = pred
test_df.head()

In [None]:
# Inverse lookup: integer label (numbered from 1) back to the category name.
# This rebinds the name `category` used earlier for the name-to-label mapping.
category = dict(
    enumerate(
        (
            'SINGLE FAMILY / DUPLEX',
            'COMMERCIAL',
            'MULTIFAMILY',
            'INSTITUTIONAL',
            'INDUSTRIAL',
        ),
        start=1,
    )
)
# Translate every predicted label into its category name.
predict = test_df['Category'].apply(lambda label: category[label])

In [None]:
# Assemble the submission: one row per test permit with its predicted category name.
permit_no = test_df['Application/Permit Number']

submission = pd.DataFrame(
    {'Application/Permit Number': permit_no,
     'Category': predict
    })
# Write the submission without the index column.
submission.to_csv('submission.csv',index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
## To download the submission file without committing the kernel.

from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "bert_submission_3.2_epoch.csv"):
    """Return an IPython HTML link that downloads *df* as a CSV file.

    The frame is serialised without its index, base64-encoded and embedded in
    a ``data:`` URI, so the file content travels inside the link itself.

    Args:
        df: Object exposing ``to_csv(index=False)`` (e.g. a pandas DataFrame).
        title: Visible link text.
        filename: Name placed in the anchor's ``download`` attribute.

    Returns:
        An ``HTML`` display object wrapping the anchor tag.
    """
    # Local import: caller-supplied text is escaped before it is placed in
    # markup, so characters such as <, & or " cannot break the tag.
    from html import escape

    # Named csv_text rather than csv so the csv module name is not shadowed.
    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode()).decode()
    link = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link.format(payload=payload, title=escape(title), filename=escape(filename)))

# Render the download link for the submission frame as the cell output.
create_download_link(submission)