In [6]:
!pip install pymorphy2

In [7]:
import numpy as np
import pandas as pd
import re
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_ru = stopwords.words("russian")
from nltk.stem import WordNetLemmatizer
import pymorphy2
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
#import transformers
#import torch
#import tensorflow as tf
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertTokenizer, BertModel, BertConfig

import transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertModel, DistilBertConfig
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import json
import gc

In [8]:
# Load the training split of the competition data and preview three random rows.
train = pd.read_csv('../input/scan-classification-challange/df_train.csv')
train.sample(3)

In [9]:
# Drop rows whose text repeats an earlier row (the first occurrence is kept).
# `subset` is passed as a list because pandas documents it as a column label or
# a sequence of labels; the result is reassigned instead of mutating in place.

train = train.drop_duplicates(subset=['text'])
train.shape

In [10]:
# Encode the category names as integer codes and keep them in a separate column.

train['encoded_cat'] = pd.Categorical(train['class']).codes
train.sample(5)

In [11]:
# Load the competition test split (first CSV column becomes the index) and preview three random rows.
test = pd.read_csv('../input/scan-classification-challange/df_test.csv', index_col=0)
test.sample(3)

In [12]:
# Inspect the stock NLTK Russian stop-word list.
# NOTE(review): the imports and the download repeat what the import cell at the
# top of the notebook already does.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_ru = stopwords.words("russian")
print(stopwords.words("russian"))

In [13]:
# Collect every whitespace-separated token of at most 3 characters from the
# training texts and add them to the NLTK Russian stop-word list.
# Four-character tokens were tried as well, but that would also drop words such
# as 'газа', 'прав', 'дела', 'Рост', which look useful for the score.
#
# Tokens are lower-cased before being stored: clean() lower-cases the text
# before the stop-word lookup, so an entry kept in its original case
# (e.g. a capitalised word) could never match.

mas_stop = {
    word.lower()
    for text in train['text']
    for word in text.split()
    if len(word) <= 3
}
stopworlds_new = set(stopwords_ru).union(mas_stop)

In [14]:
# Text-cleaning helper: lower-cases the text, replaces Latin letters, digits and
# punctuation with spaces, drops stop words and lemmatizes the remaining tokens.

morph = pymorphy2.MorphAnalyzer()
# Raw string, so `\[`, `\]` and `\-` reach the regex engine exactly as written
# instead of relying on Python passing unknown string escapes through.
# The character set is the same as before; `[` is now escaped inside the class.
patterns = r"[A-Za-z0-9|!#$%&'()*+,./:“″;”<=>?@\[\]^_`{}~—\"\-•–«»]+"
_PATTERNS_RE = re.compile(patterns)


def clean(text):
    """Return `text` normalised for the bag-of-words / transformer pipelines.

    Steps: lower-case, replace every run of characters matched by `patterns`
    with a space, split on whitespace, skip tokens found in `stopworlds_new`,
    and replace each remaining token with its first pymorphy2 normal form.

    Parameters:
        text: the raw document as a string.

    Returns:
        The surviving lemmas joined with single spaces (possibly an empty string).
    """
    text = _PATTERNS_RE.sub(' ', text.lower())
    lemmas = []
    for token in text.split():
        # str.split() never yields empty or padded tokens, so no extra strip is needed.
        if token not in stopworlds_new:
            lemmas.append(morph.normal_forms(token)[0])  # lemmatization
    return ' '.join(lemmas)

In [15]:
# Run the cleaning function over both the train and the test texts.

train['clean'] = train['text'].map(clean)
test['clean'] = test['text'].map(clean)
train[['clean', 'text']]

## FastText

In [None]:
# Hold out 20% of the training frame for evaluating fastText; the split is
# stratified on 'class' and seeded with random_state=42.
ft_train, ft_test = train_test_split(train, random_state=42, test_size=0.2, stratify = train['class'])

In [None]:
def to_ft_label(s):
    """Build the fastText label for a class name.

    Returns '__label__' followed by `s` with every comma, space and hyphen
    replaced by an underscore.
    """
    underscored = s.translate(str.maketrans({',': '_', ' ': '_', '-': '_'}))
    return '__label__' + underscored

# Reverse lookup: fastText label -> original class name.
labels_dict = {to_ft_label(g): g for g in train['class']}

In [None]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
# Tokenize with gensim's simple_preprocess and re-join the tokens with single
# spaces, for column 0 of `train` and column 1 of `test` (both chosen by position).
# NOTE(review): positional selection means the rewritten column depends on the
# current column order of each frame -- confirm these are the text columns.
# NOTE(review): ft_train / ft_test were created by train_test_split before this
# cell, so this rewrite presumably does not reach them -- confirm.
# NOTE(review): remove_stopwords is imported but not used in this cell.
train.iloc[:, 0] = train.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
test.iloc[:, 1] = test.iloc[:, 1].apply(lambda x: ' '.join(simple_preprocess(x)))

In [None]:
col = ['class', 'text']

# Each frame is an explicit copy of the selected columns, so writing the
# fastText labels never targets a slice of ft_train / ft_test
# (the original assignment triggered pandas' SettingWithCopyWarning).

# train
train_for_ft = ft_train[col].copy()
train_for_ft['class'] = train_for_ft['class'].map(to_ft_label)

# test
test_for_ft = ft_test[col].copy()
test_for_ft['class'] = test_for_ft['class'].map(to_ft_label)

In [None]:
# Write both frames as space-separated text files without header or index
# (label first, then the text), using a space as the escape character.
train_for_ft.to_csv('train_for_ft.csv', index=False, sep=' ', header=False, escapechar=" ")
test_for_ft.to_csv('test_for_ft.csv', index=False, sep=' ', header=False, escapechar=" ")

In [None]:
# Train a supervised fastText classifier on the exported training file (learning rate 0.9).
import fasttext
model = fasttext.train_supervised('train_for_ft.csv', lr = 0.9)

In [None]:
# Evaluate the fastText model on the exported hold-out file.
model.test('test_for_ft.csv')

In [None]:
def predict(test):
    """Return the original class name that fastText predicts for one row.

    Parameters:
        test: a single row (the function is applied with axis=1); its 'text'
            field is classified.

    Returns:
        The class name looked up in `labels_dict` for the top-1 fastText label.
    """
    # fastText's predict() handles one line at a time and raises on text that
    # contains a newline, so line breaks are flattened to spaces first.
    text = test['text'].replace('\n', ' ')
    top_label = model.predict(text, k=1)[0][0]
    return labels_dict[top_label]


test['predictions'] = test.apply(predict, axis=1)

test.head()

In [None]:
# Assemble the submission: sequential ids starting at 0 plus the predicted class names.
submission = pd.DataFrame(
    {'id': range(len(test)), 'class': test['predictions'].to_numpy()},
    columns=['id', 'class'],
)
submission.to_csv('submission1.csv', index=False)
submission.head()

In [None]:
#sub_predict = model.predict([text_sub_sequences, X_sub])
#sample_submission['class'] = sub_predict_nn2[:,0]
#sample_submission.to_csv('submission.csv', index=False)

## Model sklearn
Logistic Regression

In [16]:
# Split off the target and the features: y holds the encoded class, X keeps
# every remaining column except the raw text.

y = train['encoded_cat'].to_numpy()
X = train.drop(columns=['encoded_cat', 'text'])


In [17]:
## Split the data into train and test parts, 80/20, stratified by class.
## random_state is fixed (the same seed as the fastText split) so that the split,
## and every score computed from it, is reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

Необходимо преобразовать текст в токены; сделаем это несколькими способами и сравним результаты.

In [18]:
## CountVectorizer turns text into a matrix of token counts.

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# Fit the vocabulary on the training part only, then reuse it for the hold-out
# part and for the submission texts.
X_train_review_bow = vect.fit_transform(X_train['clean'])
X_test_review_bow, X_sub_rev_bow = map(vect.transform, (X_test['clean'], test['clean']))

print('X_train_review_bow shape: ', X_train_review_bow.shape)
print('X_test_review_bow shape: ', X_test_review_bow.shape)
print('X_sub_review_bow shape: ', X_sub_rev_bow.shape)

In [19]:
## TfidfVectorizer turns text into a matrix of TF-IDF features
## (term frequency weighted by inverse document frequency).

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit on the training part only, then transform the hold-out and submission texts.
X_train_review_tfidf = vectorizer.fit_transform(X_train['clean'])
X_test_review_tfidf, X_sub_review_tfidf = map(vectorizer.transform, (X_test['clean'], test['clean']))

print('X_train_review_tfidf shape: ', X_train_review_tfidf.shape)
print('X_test_review_tfidf shape: ', X_test_review_tfidf.shape)
print('X_sub_review_tfidf shape: ', X_sub_review_tfidf.shape)

In [20]:
# Logistic regression on the TF-IDF features; accuracy and weighted F1 are
# reported on the hold-out part.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    random_state=42,
).fit(X_train_review_tfidf, y_train)

y_pred = clf.predict(X_test_review_tfidf)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

In [21]:
# Same logistic regression, trained on the token-count (bag-of-words) features.
clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    random_state=42,
).fit(X_train_review_bow, y_train)

y_pred = clf.predict(X_test_review_bow)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

Попробуем другие модели sklearn

In [22]:
# Support vector classifier (C-SVC) with a linear kernel on the bag-of-words features.

from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train_review_bow, y_train)
y_pred_svc = svclassifier.predict(X_test_review_bow)
print('Test Accuracy: ', accuracy_score(y_test, y_pred_svc))
# F1 is scored on the SVC predictions; the original line reused `y_pred`, which
# still held the predictions of the logistic regression from the previous cell.
print('Test F1: ', f1_score(y_test, y_pred_svc, average='weighted'))

In [23]:
# k-nearest-neighbours voting classifier (k = 10) on the bag-of-words features.

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train_review_bow, y_train)
y_pred = knn.predict(X_test_review_bow)

print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

Гиперпараметры

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# The grid search below is wrapped in a string literal, so none of it is
# executed; it is kept only as a record of the searched solvers and C values.
"""model = LogisticRegression(multi_class='multinomial', penalty='l2', random_state=42)
#penalty = ['l1','l2']
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
grid = dict(alpha=alpha)
c_values = [100, 10, 1.0]
#param_grid = {'C':[1, 10]}
param_grid = dict(C=c_values, solver=solver)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, verbose=10, cv=cv, scoring='accuracy', error_score=0)
grid_search.fit(X_train_review_bow, y_train)
#print("Best: %f using %s" % (grid_search.best_params_))"""

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the newton-cg solver and C = 10 on the bag-of-words features.
clf = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    penalty='l2',
    C=10,
    random_state=42,
).fit(X_train_review_bow, y_train)

y_pred = clf.predict(X_test_review_bow)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

In [25]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the saga solver and C = 100 on the bag-of-words features.
clf = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    C=100,
    random_state=42,
).fit(X_train_review_bow, y_train)

y_pred = clf.predict(X_test_review_bow)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

Стекинг

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Stack a logistic regression and a linear SVC; a second logistic regression
# combines their outputs.
estimators = [('lr', LogisticRegression()), ('svc', SVC(kernel='linear'))]
modelClf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

modelClf.fit(X_train_review_bow, y_train)
# Predict with the stacked model itself; the original line called `clf`, i.e.
# the logistic regression left over from an earlier cell, so the printed scores
# never reflected the stacking ensemble.
y_pred = modelClf.predict(X_test_review_bow)
print('Test Accuracy: ', accuracy_score(y_test, y_pred))
print('Test F1: ', f1_score(y_test, y_pred, average='weighted'))

## BERT TensorFlow

In [26]:
!pip install transformers
# PyTorch is published on PyPI as `torch`; the `pytorch` project name is only a
# placeholder and its installation fails.
!pip install torch
!pip install pytorch-transformers

In [27]:
import transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertModel, DistilBertConfig
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import json
import gc

In [None]:
# Tokenize the texts.
# NOTE(review): 'distilbert-base-uncased' is presumably an English checkpoint,
# while the texts here are cleaned with Russian stop words and pymorphy2 --
# confirm that a multilingual checkpoint is not what was intended.

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
X_train_encodings = tokenizer(X_train['clean'].to_list(), truncation=True, padding=True)
# The submission texts go through the same cleaned column as the training
# texts; the original line tokenized the raw 'text' column, so the model would
# have been applied to differently prepared input than it was trained on.
test_encodings = tokenizer(test['clean'].to_list(), truncation=True, padding=True)

X_test_encodings = tokenizer(X_test['clean'].to_list(), truncation=True, padding=True)

In [None]:
# Wrap the tokenized data into tf.data datasets.

train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train_encodings), y_train))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test_encodings), list(y_test)))

# The submission set gets an all-zero array in the label position.
val_class = np.zeros(len(test))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), val_class))

In [None]:
class F1_Score(tf.keras.metrics.Metric):
    """F1 metric built from Keras' streaming Precision and Recall (threshold 0.5).

    The original version referred to undefined names `Precision` / `Recall`,
    created fresh metric objects on every update (so only the latest batch
    counted) and rebound `self.f1` instead of assigning to the weight. Here the
    two sub-metrics are created once and accumulate over all batches seen since
    the last reset.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision(thresholds=0.5)
        self.recall = tf.keras.metrics.Recall(thresholds=0.5)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch into the precision and recall sub-metrics."""
        self.precision.update_state(y_true, y_pred, sample_weight=sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight=sample_weight)

    def result(self):
        """Return 2 * p * r / (p + r + 1e-6) for the accumulated precision and recall."""
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + 1e-6))

    def reset_state(self):
        """Clear both sub-metrics."""
        for metric in (self.precision, self.recall):
            # Older Keras releases only provide the plural spelling.
            reset = getattr(metric, 'reset_state', None) or metric.reset_states
            reset()

    # Kept for callers (and Keras versions) that use the older method name.
    reset_states = reset_state

In [None]:
from keras import backend as K


def _rounded_positive_sum(values):
    """Sum of `values` after clipping to [0, 1] and rounding."""
    return K.sum(K.round(K.clip(values, 0, 1)))


def recall_m(y_train, y_test):
    """Recall: positives shared by both tensors divided by the positives of `y_train`.

    NOTE(review): the parameter names suggest data splits, but the arithmetic
    treats the first argument as reference labels and the second as predictions.
    """
    hits = _rounded_positive_sum(y_train * y_test)
    return hits / (_rounded_positive_sum(y_train) + K.epsilon())


def precision_m(y_train, y_test):
    """Precision: positives shared by both tensors divided by the positives of `y_test`."""
    hits = _rounded_positive_sum(y_train * y_test)
    return hits / (_rounded_positive_sum(y_test) + K.epsilon())


def f1_m(y_train, y_test):
    """F1 computed from `precision_m` and `recall_m`, with epsilon in the denominator."""
    p = precision_m(y_train, y_test)
    r = recall_m(y_train, y_test)
    return 2*((p*r)/(p+r+K.epsilon()))

In [None]:
# Load DistilBERT with a 50-way classification head and compile it with
# sparse categorical cross-entropy on logits, Adam (lr 2e-5) and accuracy.
# NOTE(review): num_labels=50 is hard-coded -- confirm it equals the number of
# distinct classes in the training data.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=50)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5) # 2e-5
model.compile(optimizer=optimizer, loss=losss, metrics=['accuracy']) # loss=model.compute_loss #model.hf_compute_loss()

In [None]:
# List the model's top-level layers. The freezing statement stays commented
# out, so this cell only prints and does not change trainability.
#model.base_model.transformer.layer
for layer in model.layers:
    #layer.trainable = False
    print(layer)

In [None]:
#model.compile(optimizer=optimizer, loss=losss, metrics=['accuracy'])

In [None]:
# Train for one epoch in batches of 16 and validate on the hold-out dataset.
model.fit(train_dataset.shuffle(1000).batch(16), 
          epochs=1,
          validation_data=test_dataset.shuffle(1000).batch(16))

## DistilBERT fine-tuning - Hugging Face and PyTorch

In [None]:
!pip install datasets
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Map string ids ('0', '1', ...) to class names in order of first appearance,
# and build the inverse mapping.
id2label = dict((str(idx), name) for idx, name in enumerate(train["class"].unique().tolist()))
label2id = dict(zip(id2label.values(), id2label.keys()))
print(label2id)

In [None]:
# The DistilBERT pipeline below expects the training data to carry a `labels`
# column, so one is added with the mapped label ids.

train = train.assign(labels=lambda frame: frame["class"].map(label2id))
train.head()

In [None]:
# Build a Hugging Face Dataset from the frame and split it 80/20 (seed 123).

from datasets import Dataset
from sklearn.model_selection import train_test_split

dataset = Dataset.from_pandas(train).train_test_split(train_size=0.8, seed=123)
print(dataset)

In [None]:
# Encode the "labels" column as a class-label feature.
# NOTE(review): the column holds string ids ('0', '1', '10', ...); if the
# encoder orders class names as strings, the resulting integers would differ
# from int(label) and from id2label -- confirm before decoding predictions.
dataset = dataset.class_encode_column("labels")

In [None]:
# Load the AutoTokenizer for the DistilBERT checkpoint.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Columns to drop from the dataset during tokenization: everything except "labels".

cols_to_remove = list(filter(lambda name: name != "labels", dataset["train"].column_names))
print(cols_to_remove)

In [None]:
# Tokenize the dataset, then switch its format to "torch" so it can be fed to the model.

def tokenize(batch):
    """Tokenize the 'text' field of a batch and attach the labels as plain ints.

    The texts are padded, and truncated to at most 320 tokens.
    """
    encoded = tokenizer(batch['text'], padding=True, truncation=True, max_length=320)
    encoded['labels'] = list(map(int, batch["labels"]))
    return encoded


dataset_enc = dataset.map(tokenize, batched=True, remove_columns=cols_to_remove, num_proc=4)
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print(dataset_enc["train"].column_names)

In [None]:
# Create one DataLoader per dataset split (batch size 8); the collator pads
# each batch, and only the training loader shuffles.

from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    dataset_enc["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    dataset_enc["test"], batch_size=8, collate_fn=data_collator
)

In [None]:
# Load the model.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

# Load model from checkpoint
# NOTE(review): num_labels=50 is hard-coded -- confirm it matches the number of classes.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=50)

In [None]:
# Hyperparameters, optimizer and learning-rate schedule.

import torch
from transformers import get_scheduler
from transformers import get_linear_schedule_with_warmup

# Single source of truth for the learning rate. The original cell declared 2e-5
# but built the optimizer with a hard-coded 1e-5; the value that was actually
# used is kept.
learning_rate = 1e-5
num_epochs = 5

# Biases and layer-norm weights are excluded from weight decay.
# 'LayerNorm.weight' alone only matches the embedding norm; DistilBERT's
# transformer blocks name their norms `sa_layer_norm` / `output_layer_norm`,
# hence the extra lower-case pattern.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Linear decay to zero over all training steps, without warm-up.
num_training_batches = len(train_dataloader)
num_training_steps = num_epochs * num_training_batches
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Move the model to the GPU when CUDA is available, otherwise stay on the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model.to(device)

In [None]:
# Run the training.

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Train the model with PyTorch training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # One optimizer step and one scheduler step per batch, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluate the model.

from datasets import load_metric

# Load metric
metric = load_metric("f1")
# Iteratively evaluate the model and accumulate predictions batch by batch
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Weighted F1 over everything accumulated above. compute() is called without
# predictions/references: passing them here would add the final batch a second
# time before the score is calculated.
metric.compute(average="weighted")

## DistilBert Freeze

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-row Jaccard overlap between true and predicted label sets.

    For every row the indices of the non-zero entries of `y_true` and `y_pred`
    are compared: the row scores |intersection| / |union|, or 1 when both sets
    are empty. `normalize` and `sample_weight` are accepted but not used.

    Returns:
        The mean of the row scores (via `np.mean`).
    """
    def _row_score(true_row, pred_row):
        true_idx = set(np.nonzero(true_row)[0])
        pred_idx = set(np.nonzero(pred_row)[0])
        if not true_idx and not pred_idx:
            return 1
        return len(true_idx & pred_idx) / float(len(true_idx | pred_idx))

    row_scores = [_row_score(y_true[i], y_pred[i]) for i in range(y_true.shape[0])]
    return np.mean(row_scores)

In [None]:
# One-hot encode the target with get_dummies and preview three random rows.
train_class = pd.get_dummies(train, columns=['class'])
train_class.sample(3)

In [None]:
# One-hot label vector per row, built directly from the 'class' column.
# The original took `train_class.iloc[:, 2:]`, which is only the dummy columns
# when exactly two other columns precede them; any column added to `train` by
# an earlier cell shifts that offset.
train['labels'] = pd.get_dummies(train['class']).astype(int).values.tolist()

In [None]:
# Settings for the frozen-backbone experiment: maximum sequence length,
# batch sizes, number of epochs and learning rate.
MAX_LEN = 372
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 8
LEARNING_RATE = 2e-05
# NOTE(review): truncation=True is passed to from_pretrained() here rather than
# to the tokenization call -- confirm it has the intended effect.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

# Pretrained DistilBERT backbone without a classification head.
distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
from torch import nn
from transformers import BertModel

from torch.utils.data import Dataset as _TorchDataset


class MultiLabelDataset(_TorchDataset):
    """Map-style torch dataset yielding tokenized text and a float target vector.

    The base class is imported under a private alias on purpose: the bare name
    `Dataset` is imported from both `torch.utils.data` and `datasets` in this
    notebook, and whichever import ran last would otherwise become the parent.

    Parameters:
        dataframe: frame with a text column and a `labels` column.
        tokenizer: object exposing `encode_plus`.
        max_len: length every sequence is padded / truncated to.
        text_column: name of the text column. When omitted, the first of
            'text_clean', 'clean', 'text' present in `dataframe` is used.

    Raises:
        KeyError: if `text_column` is omitted and none of the candidates exists.
    """

    # Candidate text columns, in order of preference.
    _TEXT_COLUMNS = ('text_clean', 'clean', 'text')

    def __init__(self, dataframe, tokenizer, max_len, text_column=None):
        if text_column is None:
            text_column = next(
                (name for name in self._TEXT_COLUMNS if name in dataframe.columns), None
            )
            if text_column is None:
                raise KeyError(
                    f"none of the text columns {self._TEXT_COLUMNS} is present in the dataframe"
                )
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe[text_column]
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: works whether or not the frame's index was reset.
        text = " ".join(str(self.text.iloc[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacement for the deprecated pad_to_max_length=True.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Build the datasets for the neural network: 80% of the rows (sampled with
# random_state=200) for training, the remaining rows for testing; both frames
# get a fresh 0..n-1 index.

train_size = 0.8
train_data=train.sample(frac=train_size,random_state=200)
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [None]:
# DataLoader settings for the two splits; both loaders shuffle and load in the
# main process (num_workers=0).
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Freeze the backbone: no DistilBERT parameter receives gradients.

for frozen_param in distilbert.parameters():
    frozen_param.requires_grad_(False)

In [None]:
# Model class: the shared DistilBERT backbone followed by a Linear -> ReLU ->
# Dropout -> Linear head (dropout is there to limit overfitting).

class DistilBERTClass(torch.nn.Module):
    """DistilBERT backbone with a two-layer classification head (768 -> 768 -> 50)."""

    def __init__(self):
        super().__init__()
        self.distilbert = distilbert

        self.dropout = nn.Dropout(0.1)

        self.relu = nn.ReLU()
        self.pre_classifier = nn.Linear(768, 768)
        self.classifier = nn.Linear(768, 50)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the 50 logits per sample; `token_type_ids` is accepted but not used."""
        backbone_out = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first token position of every sequence.
        first_token = backbone_out[0][:, 0]
        features = self.dropout(self.relu(self.pre_classifier(first_token)))
        return self.classifier(features)


model = DistilBERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits `outputs` and 0/1 `targets`."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)
# Adam over the parameters of the `model` object that exists when this line runs.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one pass over `training_loader`, updating `model` with `optimizer`.

    `epoch` is only used in the progress message, printed every 5000th batch.
    NOTE(review): this function reuses the name `train`, which earlier cells
    bind to the training DataFrame.
    """
    model.train()
    for step, batch in tqdm(enumerate(training_loader, 0)):
        ids, mask, token_type_ids = (
            batch[key].to(device, dtype=torch.long)
            for key in ('ids', 'mask', 'token_type_ids')
        )
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
# Train for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def validation(testing_loader):
    """Run `model` over `testing_loader` without gradients.

    Returns:
        (fin_outputs, fin_targets): per-sample lists of sigmoid probabilities
        and of target vectors, in the order the loader produced them.
    """
    model.eval()
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(testing_loader, 0)):
            ids, mask, token_type_ids = (
                batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids')
            )
            targets = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [None]:
# Collect probabilities and targets for the test loader, then binarise the
# probabilities at 0.5.
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))
# Stored as `val_f1`: the original assigned the score to `f1_m`, which replaced
# the Keras metric function of the same name with a float.
val_f1 = f1_score(targets, final_outputs, average='weighted')
print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")
print(val_f1)

In [None]:
class DistilBERTClass(torch.nn.Module):
    """DistilBERT backbone with a single linear head (768 -> 50) behind dropout."""

    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.distilbert = distilbert

        self.dropout = torch.nn.Dropout(0.1)
        # NOTE(review): this GRU is created but never called in forward(), and
        # its input size (312) does not match the 768-wide features used
        # below -- confirm whether it should be wired in or removed.
        self.rnn = nn.GRU(312,
                        256,
                        num_layers=2,
                        batch_first=True,
                        dropout=0.1)
        self.classifier = torch.nn.Linear(768, 50)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the 50 logits per sample; `token_type_ids` is accepted but not used."""
        output_1 = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Hidden state at the first token position of every sequence.
        pooler = hidden_state[:, 0]
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

# Rebuild the optimizer for the new model. The previous optimizer still holds
# the parameters of the earlier `model` object, so without this the training
# loop would compute gradients for this model but never update its weights.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Train the redefined model for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)