In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
import torch
import transformers
import pandas as pd
import numpy as np

from sklearn import model_selection, metrics

In [2]:
"""
* Dataset class
* Model
* Trainer - training arguments
"""

'\n* Dataset class\n* Model\n* Trainer - training arguments\n'

In [3]:
config = {
    "max_length": 360,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",

    "output_dir": "./my-model",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 3,

    "debug": True,
}

In [24]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])
class TextDataset:

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        enc = enc = tokenizer(
            row["text"],
            add_special_tokens=True,
            max_length=config["max_length"],
            padding="max_length",
            truncation=True
        )

        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            "label": torch.tensor(row["label"]),
        }



In [25]:
df = pd.read_csv("/kaggle/input/imdb-data/IMDB Dataset 3.csv").rename(columns={"review": "text"})

id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

if config["debug"]:
    print("DEBUG MODE!")
    df = df.sample(10_000, random_state=123)

print(df.shape)
df.head()

DEBUG MODE!
(10000, 3)


Unnamed: 0,text,sentiment,label
11872,"This movie was beyond awful, it was a pimple o...",negative,0
40828,As of this writing John Carpenter's 'Halloween...,positive,1
36400,I must admit a slight disappointment with this...,positive,1
5166,Oh dear! The BBC is not about to be knocked of...,negative,0
30273,its a totally average film with a few semi-alr...,negative,0


In [26]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])



In [27]:
train, valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=23,
    shuffle=True,
    stratify=df["label"]
)

In [28]:
train_ds = TextDataset(train)
valid_ds = TextDataset(valid)

In [29]:
valid_ds[0]

{'input_ids': tensor([  101, 18224,  4735,  5760,  2011,  5529,  8040, 10222, 11705,  1010,
          2040,  2003,  3161,  1999,  5365,  1010,  2066,  3680,  2061, 10421,
         16089, 15992,  1012,  2175, 26327,  2577,  1998,  7658,  5267, 22770,
         13542,  2063,  2024, 11065,  1996, 21027,  1005,  1055,  2197,  3521,
          1997,  1055, 11231, 10177,  4757,  1012,  2027,  2024,  2205,  2583,
          1010,  2129,  2064,  1045,  2131,  5475,  2046,  1037,  6336,  2279,
          2051,  1029,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [30]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(config["model_path"])


pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def compute_metrics(eval_data):
   
    preds = eval_data.predictions.argmax(-1)
    labels = eval_data.label_ids 
    print(eval_data)
    print(preds)
    print(labels)

    return {
        'accuracy': metrics.accuracy_score(labels, preds),
        'precision': metrics.precision_score(labels, preds),
        'recall': metrics.recall_score(labels, preds),
        'classification_report': metrics.classification_report(labels, preds, target_names=list(id2label.values()), output_dict=True)

    }

training_args = transformers.TrainingArguments(
     output_dir="./results",                      # Directory for storing results
    evaluation_strategy="steps",                 # Evaluate every few steps
    per_device_train_batch_size=config['train_batch_size'],              # Batch size per device during training
    per_device_eval_batch_size=config['train_batch_size'],               # Batch size per device during evaluation
    num_train_epochs=config['epochs'],                          # Total number of training epochs
    warmup_steps=500,                            # Number of warmup steps for learning rate scheduler
    save_total_limit=2,
    logging_dir=None,                            # Disable logging directory
    logging_strategy="no",
    report_to=[]# Limit the total amount of checkpoints`

)



In [32]:
trainer = transformers.Trainer(
    model=model,                                 # The model to be trained
    args=training_args,                          # The training arguments, defined above
    train_dataset=train_ds,                 # The training dataset
    eval_dataset=valid_ds,                   # The evaluation dataset
    tokenizer=tokenizer,                         # The tokenizer
    compute_metrics=compute_metrics, 
    
)

In [33]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss


TrainOutput(global_step=189, training_loss=0.565313389692357, metrics={'train_runtime': 114.4895, 'train_samples_per_second': 209.626, 'train_steps_per_second': 1.651, 'total_flos': 249110795520000.0, 'train_loss': 0.565313389692357, 'epoch': 3.0})

In [34]:
trainer.save_state()

In [35]:
trainer.save_model()