In [1]:
! kaggle competitions download -c nlp-getting-started
! unzip nlp-getting-started.zip
! mkdir -p data && mv *.csv data/
! rm nlp-getting-started.zip

Downloading nlp-getting-started.zip to /home/xmiler/projects/kaggle-distweets
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 5.93MB/s]
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 5.88MB/s]
Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [1]:
import pandas as pd
import numpy as np
import torch
from torch import optim
from torch import LongTensor
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from transformers import BertTokenizer, BertForSequenceClassification


class Tokenizer:
    def __init__(self):
        self._tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        
    def __call__(self, X):
        input_ids, attention_mask = [], []
        for text in X:
            token = self._tokenizer(text, 
                                    max_length=256, 
                                    truncation=True, 
                                    padding='max_length',
                                    add_special_tokens=True)
            input_ids.append(token['input_ids'])
            attention_mask.append(token['attention_mask'])
        return np.array(input_ids), np.array(attention_mask)


class LModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self._model = BertForSequenceClassification.from_pretrained('bert-base-cased')
    
    def training_step(self, batch, batch_idx):
        x_ids, x_attention, y = batch
        output = self._model(x_ids, x_attention, labels=y)
        return output.loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=3e-6, eps = 1e-8)
        return optimizer
    
    def predict_step(self, batch, batch_idx):
        x_ids, x_attention = batch
        output = self._model(x_ids, x_attention)
        return output.logits

In [2]:
tokenizer = Tokenizer()
lmodel = LModel()
trainer = pl.Trainer(max_epochs=3, gpus=1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [3]:
train_data = pd.read_csv('data/train.csv')
train_data = train_data[['text', 'target']]

X_train, y_train = train_data['text'].to_numpy(), train_data['target'].to_numpy()
X_train = tokenizer(X_train)

ds_train = TensorDataset(LongTensor(X_train[0]), LongTensor(X_train[1]), LongTensor(y_train))
dl_train = DataLoader(ds_train, batch_size=16, num_workers=8)

In [5]:
trainer.fit(lmodel, dl_train)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type                          | Params
---------------------------------------------------------
0 | _model | BertForSequenceClassification | 108 M 
---------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [7]:
trainer.save_checkpoint('data/final.ckpt')

In [9]:
lmodel.load_from_checkpoint('data/final.ckpt')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

LModel(
  (_model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
   

In [52]:
test_data = pd.read_csv('data/test.csv')
test_data = test_data[['text']]

X_test = test_data['text'].to_numpy()
X_test = tokenizer(X_test)

ds_test = TensorDataset(LongTensor(X_test[0]), LongTensor(X_test[1]))
dl_test = DataLoader(ds_test, shuffle=False, batch_size=16, num_workers=8)

In [53]:
predictions = trainer.predict(lmodel, dl_test)
predictions = torch.concat(predictions)
predictions = predictions.argmax(dim=1)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 476it [00:00, ?it/s]

In [59]:
submission = pd.read_csv('data/sample_submission.csv')
submission['target'] = predictions
submission.to_csv('data/submission.csv', index=False, header=True)