In [1]:
! kaggle competitions download -c nlp-getting-started
! unzip nlp-getting-started.zip
! mkdir -p data && mv *.csv data/
! rm nlp-getting-started.zip

Downloading nlp-getting-started.zip to /home/xmiler/projects/kaggle-distweets
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 5.93MB/s]
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 5.88MB/s]
Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [32]:
import pandas as pd
import numpy as np
import torch
from torch import optim
from torch import LongTensor
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from transformers import BertTokenizer, BertForSequenceClassification


def tokenize(tokenizer, X, max_length=256):
    """Encode a collection of texts into id and attention-mask arrays.

    Each text is passed to ``tokenizer`` on its own, asking it to add special
    tokens, truncate, and pad to ``max_length`` so that all rows can be
    stacked into rectangular arrays.

    Args:
        tokenizer: Callable that accepts a text plus the keyword arguments
            ``max_length``, ``truncation``, ``padding`` and
            ``add_special_tokens`` and returns a mapping with the keys
            ``'input_ids'`` and ``'attention_mask'``.
        X: Iterable of texts to encode.
        max_length: Sequence length requested from the tokenizer. Defaults to
            256, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of NumPy arrays with one row
        per input text, in input order.
    """
    encodings = [
        tokenizer(text,
                  max_length=max_length,
                  truncation=True,
                  padding='max_length',
                  add_special_tokens=True)
        for text in X
    ]
    input_ids = np.array([enc['input_ids'] for enc in encodings])
    attention_mask = np.array([enc['attention_mask'] for enc in encodings])
    return input_ids, attention_mask


class LMModel(pl.LightningModule):
    """Lightning wrapper around a sequence-classification model.

    The wrapped model is called with token ids and an attention mask
    (positionally) and must return an object exposing ``.loss`` when
    ``labels`` are supplied and ``.logits`` otherwise.
    """

    def __init__(self, model):
        """Store ``model`` as the submodule that is trained and queried."""
        super().__init__()
        self._model = model

    def training_step(self, batch, batch_idx):
        """Return the wrapped model's loss for one ``(ids, mask, labels)`` batch."""
        input_ids, attention_mask, labels = batch
        return self._model(input_ids, attention_mask, labels=labels).loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=3e-6, eps=1e-8)."""
        return optim.Adam(self.parameters(), lr=3e-6, eps=1e-8)

    def predict_step(self, batch, batch_idx):
        """Return the wrapped model's raw logits for one ``(ids, mask)`` batch."""
        input_ids, attention_mask = batch
        return self._model(input_ids, attention_mask).logits

In [2]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint; it is
# passed to tokenize() when encoding the tweet texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [3]:
# Read the training CSV and keep only the tweet text and its label.
train_data = pd.read_csv('data/train.csv')[['text', 'target']]

# Labels as a NumPy array; texts are encoded into an
# (input_ids, attention_mask) pair by tokenize().
y_train = train_data['target'].to_numpy()
X_train = tokenize(tokenizer, train_data['text'].to_numpy())

# Bundle ids, masks and labels so each dataset item is one training example.
ds_train = TensorDataset(
    LongTensor(X_train[0]),  # input ids
    LongTensor(X_train[1]),  # attention mask
    LongTensor(y_train),     # targets
)

In [4]:
# Load 'bert-base-cased' weights into a sequence-classification model.
# The library warning printed below about unused / newly initialized weights
# is expected when starting from a checkpoint trained on another task.
# NOTE(review): the number of output labels is left at the library default —
# presumably 2, matching the binary `target` column; confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [33]:
# Wrap the classifier in the Lightning module that defines the training and
# prediction steps.
lmmodel = LMModel(model)
# Train for three epochs on a single GPU. `accelerator`/`devices` is the
# supported spelling of the former `gpus=1`, which newer Lightning releases
# deprecate and then remove.
trainer = pl.Trainer(max_epochs=3, accelerator='gpu', devices=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
# Shuffle the training examples every epoch; without `shuffle=True` the loader
# would replay the rows in CSV order, giving correlated mini-batches.
dl_train = DataLoader(ds_train, batch_size=16, shuffle=True, num_workers=8)
# Fine-tune the wrapped model on the training loader.
trainer.fit(lmmodel, dl_train)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/xmiler/projects/kaggle-distweets/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type                          | Params
---------------------------------------------------------
0 | _model | BertForSequenceClassification | 108 M 
---------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [7]:
# Persist the trainer's current state to 'final.ckpt' (path is relative to the
# working directory).
trainer.save_checkpoint('final.ckpt')

In [16]:
# Read the test CSV and keep only the tweet text (no labels are read here).
test_data = pd.read_csv('data/test.csv')
test_data = test_data[['text']]

# Encode the *test* tweets. This previously read from `train_data`, so
# inference silently ran on the training set instead of the test set.
X_test = test_data['text'].to_numpy()
X_test = tokenize(tokenizer, X_test)

# Each dataset item is an (input_ids, attention_mask) pair, matching what
# LMModel.predict_step unpacks.
ds_test = TensorDataset(LongTensor(X_test[0]), LongTensor(X_test[1]))

In [29]:
# Batch the test dataset for inference. No shuffling is requested, so batches
# follow the dataset's order.
dl_test = DataLoader(ds_test, batch_size=16, num_workers=8)

In [34]:
# Run LMModel.predict_step over the test loader.
# NOTE(review): `predictions` is presumably a list with one logits tensor per
# batch (predict_step returns `output.logits`) — confirm before post-processing.
predictions = trainer.predict(lmmodel, dl_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]