In [1]:
import os
import random

from fastai.vision.all import *
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertTokenizerFast, AdamW, BertConfig, get_linear_schedule_with_warmup

In [2]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [3]:
DATA_PATH = '../data'
IMAGES_PATH = '../data/images'
AC_DATA_PATH = 'animal_crossing'
DOOM_DATA_PATH = 'doom'

In [4]:
ac_images_files = os.listdir(os.path.join(IMAGES_PATH, AC_DATA_PATH))
doom_images_files = os.listdir(os.path.join(IMAGES_PATH, DOOM_DATA_PATH))
ac_text_data = pd.read_csv(os.path.join(DATA_PATH, 'animal_crossing_dataset.csv'))
doom_text_data = pd.read_csv(os.path.join(DATA_PATH, 'doom_dataset.csv'))

In [5]:
paths = (
    [os.path.join(AC_DATA_PATH, p) for p in ac_images_files] +
    [os.path.join(DOOM_DATA_PATH, p) for p in doom_images_files] 
)
df = pd.DataFrame({
    'filename': ac_images_files + doom_images_files,
    'path': paths,
    'ac': [True] * len(ac_images_files) + ([False] * len(doom_images_files)),
    'doom': [False] * len(ac_images_files) + ([True] * len(doom_images_files)),
})
df = df.join(pd.concat([ac_text_data, doom_text_data])[['filename', 'title']].set_index('filename'), on='filename')
# df['tokeniked'] = df.apply(lambda r: tokenizer(str(r['title']), padding=True, truncation=True), axis=1)

In [6]:
def get_sets(N=10):
    df_train, df_test = train_test_split(df, train_size=N)

    
    return df_train[['path', 'ac', 'doom']], df_test[['path', 'ac', 'doom']], df_train[['title', 'ac', 'doom']], df_test[['title', 'ac', 'doom']]

In [7]:
images_train, images_test, text_train, text_test = get_sets(20)

In [8]:
text_train

Unnamed: 0,title,ac,doom
1528,Mick Gordon simp exposed,False,True
759,Petition to re-rip the Classic Doom Marine’s pants in Doom Eternal,False,True
391,Scary movie night! Poor Stitches :(,True,False
616,Stayed up till 4 AM finishing my magical study!,True,False
219,Thank you Lobo but Isabelle doesn't think the same way.. 😔😔😔,True,False
404,"Met Redd today, seems like a pretty nice guy.",True,False
466,Hope you guys like my AC grad cap!,True,False
131,I drew Lily!,True,False
805,Doom Slayer/Samuel Hayden chemistry in a nutshell,False,True
938,I dunno if something like this was posted before,False,True


# LEARN AC IMAGES

In [9]:
# data_image_ac = ImageDataLoaders.from_df(df_train, fn_col='path', label_col='ac', item_tfms=Resize(224), bs=32, folder=IMAGES_PATH)
# learn_image_ac = cnn_learner(data_image_ac, resnet34, metrics=error_rate)
# learn_image_ac.fine_tune(5)
# learn_image_ac.export('../models/learn_image_ac_500.pkl')
learn_image_ac = load_learner('../models/learn_image_ac_500.pkl')

# LEARN DOOM IMAGES

In [10]:
# data_image_doom = ImageDataLoaders.from_df(df_train, fn_col='path', label_col='doom', item_tfms=Resize(224), bs=32, folder=IMAGES_PATH)
# learn_image_doom = cnn_learner(data_image_doom, resnet34, metrics=error_rate)
# learn_image_doom.fine_tune(5)
# learn_image_doom.export('../models/learn_image_doom_500.pkl')
learn_image_doom = load_learner('../models/learn_image_doom_500.pkl')

In [14]:
class TextDataset(Dataset):
    def __init__(self, data, label_column):
        self.data_ = data[['title', label_column]].copy().reset_index()
        self.label_column = label_column
            
    def __len__(self):
        return self.data_.shape[0]
    
    def __getitem__(self, idx):
        inputs = tokenizer.encode_plus(
            str(self.data_.loc[idx]['title']),
            None,
            add_special_tokens=True,
            max_length=256,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long), 
            'labels': torch.tensor(self.data_.loc[idx][self.label_column], dtype=torch.long)
        }

# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

# images_train, images_test, text_train, text_test = get_sets(10)

# model_bert_ac = BertForSequenceClassification.from_pretrained(
#     'bert-base-uncased', 
#     num_labels=2
# )

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=2,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=64,
#     warmup_steps=500,
#     weight_decay=0.01,
#     evaluate_during_training=True,
#     logging_dir='./logs'
# )

# trainer = Trainer(
#     model=model_bert_ac,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=TextDataset(text_train, 'ac'),
#     eval_dataset=TextDataset(text_test, 'ac')
# )

# trainer.train()

In [None]:
trainer.predict([tokenizer('Mullet Slayer dont got shit on Buscemi Marauder')])

In [44]:
images_train, images_test, text_train, text_test = get_sets(750)
text_train['ac'].sum()

332

In [21]:
device = torch.device("cpu")

def train_madel(text_train):
    batch_size = 16
    seed_val = 42
    epochs = 4

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    train_dataloader = DataLoader(
        TextDataset(text_train, 'ac'),
        batch_size = batch_size 
    )

    total_steps = len(train_dataloader) * epochs

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", 
        num_labels = 2
    )

    optimizer = AdamW(
        model.parameters(),
        lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
    )

    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            model.zero_grad()        
            loss, logits, *_ = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=None,
                labels=batch['labels'].to(device)
            )
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)            

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        model.eval()

    print("")
    print("Training complete!")
    
    return model

In [11]:
for i, batch in enumerate(validation_dataloader):
    with torch.no_grad():        
        out_ = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=None
        )
        acc = (out_[0].numpy().argmax(axis=1) == batch['labels'].numpy()).sum() / len(batch['labels'])
        print(f'{i} = {acc}')

NameError: name 'validation_dataloader' is not defined

In [15]:
images_train, images_test, text_train, text_test = get_sets(10)
model_2 = train_madel(text_train)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Training...

  Average training loss: 0.70

Training...

  Average training loss: 0.67

Training...

  Average training loss: 0.52

Training...

  Average training loss: 0.57

Training complete!


In [22]:
test_dataloader = DataLoader(
    TextDataset(text_test, 'ac'),
    batch_size = 32 
)
for i, batch in enumerate(test_dataloader):
    with torch.no_grad():        
        out_ = model_2(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=None
        )
        acc = (out_[0].numpy().argmax(axis=1) == batch['labels'].numpy()).sum() / len(batch['labels'])
        print(f'{i} = {acc}')
    if i == 10: break

0 = 0.40625
1 = 0.375
2 = 0.5625
3 = 0.5
4 = 0.53125
5 = 0.59375
6 = 0.46875
7 = 0.3125
8 = 0.53125
9 = 0.59375
10 = 0.59375
