In [120]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [142]:
neg = pd.read_csv("/kaggle/input/wb-sch-p2/train_neg.csv")
pos = pd.read_csv("/kaggle/input/wb-sch-p2/train_pos.csv")

In [143]:
data = pd.concat([neg, pos])

In [144]:
data['desc'] = data.apply(lambda x: x if len(x.desk) < 350 else x.desk[:350], axis=1)

In [145]:
data = data.sample(frac=1).reset_index(drop=True)

In [146]:
data['qad'] = data["Question"].astype(str) + "/n" + data["desc"].astype(str)

In [147]:
import re
def clean_text(text):
    text = text.lower()

    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text)
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [148]:
data['qad'] = data['qad'].apply(clean_text)

In [149]:
class CustomDataset(Dataset):
  def __init__(self, questions, labels, tokenizer, max_length):
    self.questions = questions
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx):
    text = str(self.questions[idx])
    label = self.labels[idx]

    encoding = self.tokenizer.encode_plus(
               text,
               add_special_tokens=True,
               max_length=self.max_length,
               return_token_type_ids=False,
               padding='max_length',
               return_attention_mask=True,
               return_tensors='pt',
               truncation=True
           )

    return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
           }

In [150]:
def create_data_loader(df, tokenizer, max_length, batch_size):
  ds = CustomDataset(
           questions=df.qad.to_numpy(),
           labels=df.label.to_numpy(),
           tokenizer=tokenizer,
           max_length=max_length
       )
  return DataLoader(ds, batch_size=batch_size, num_workers=4)

In [151]:
model_name = "cointegrated/LaBSE-en-ru"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/LaBSE-en-ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [152]:
data.shape

(2512, 8)

In [153]:
val = data.sample(400, random_state=42)
train = data.drop(val.index)

In [154]:
BATCH_SIZE = 16
MAX_LENGTH = 512

train_data_loader = create_data_loader(train, tokenizer, MAX_LENGTH, BATCH_SIZE)
val_data_loader = create_data_loader(val, tokenizer, MAX_LENGTH, BATCH_SIZE)

training_args = TrainingArguments(
       output_dir='./results',
       num_train_epochs=4,
       per_device_train_batch_size=BATCH_SIZE,
       per_device_eval_batch_size=BATCH_SIZE,
       warmup_steps=500,
       weight_decay=0.01,
       logging_dir='./logs',
       logging_steps=10,
       eval_strategy='steps'
   )

trainer = Trainer(
       model=model,
       args=training_args,
       train_dataset=train_data_loader.dataset,
       eval_dataset=val_data_loader.dataset
   )

In [155]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
10,0.7438,0.712762
20,0.705,0.665555
30,0.6595,0.651018
40,0.6619,0.639845
50,0.6418,0.618871
60,0.6074,0.582108
70,0.5394,0.549328
80,0.4677,0.456953
90,0.3764,0.370578
100,0.3678,0.359263


TrainOutput(global_step=264, training_loss=0.3299042522681482, metrics={'train_runtime': 661.5515, 'train_samples_per_second': 12.77, 'train_steps_per_second': 0.399, 'total_flos': 2222762195681280.0, 'train_loss': 0.3299042522681482, 'epoch': 4.0})

In [156]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def tokenize_question(question, tokenizer, max_length=128):
    return tokenizer(
        [question],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_length
    )

def predict(question):
    inputs = tokenize_question(question, tokenizer)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
    return predictions
    
    
    
    

In [157]:
val_neg = pd.read_csv("/kaggle/input/wb-sch-p2/valid_250_neg.csv")
val_pos = pd.read_csv("/kaggle/input/wb-sch-p2/valid_250_pos.csv")

In [158]:
valid = pd.concat([val_neg, val_pos])

In [159]:
valid['desc'] = valid.apply(lambda x: x if len(x.desk) < 350 else x.desk[:350], axis=1)
valid['qad'] = valid["Question"].astype(str) + "/n" + valid["desc"].astype(str)
valid['qad'] = valid['qad'].apply(clean_text)

In [160]:
valid['pred'] = valid.apply(lambda x: predict(x.qad)[0] ,axis=1)

In [161]:
valid['pred']

0      0
1      0
2      0
3      0
4      1
      ..
245    1
246    1
247    1
248    1
249    1
Name: pred, Length: 500, dtype: int64

In [162]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precision_score(valid['label'], valid['pred']), recall_score(valid['label'], valid['pred'])

(0.918918918918919, 0.952)

In [163]:
hf_token = ""
hf_username = "Hvixze"
model.push_to_hub(f"{hf_username}/labse_wb_p2_4ep", token = hf_token) # Online saving
tokenizer.push_to_hub(f"{hf_username}/labse_wb_p2_4ep", token = hf_token)

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hvixze/labse_wb_p2_4ep/commit/4ceb9356f91ff04da8ad0b63790a5f362347f868', commit_message='Upload tokenizer', commit_description='', oid='4ceb9356f91ff04da8ad0b63790a5f362347f868', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Hvixze/labse_wb_p2_4ep', endpoint='https://huggingface.co', repo_type='model', repo_id='Hvixze/labse_wb_p2_4ep'), pr_revision=None, pr_num=None)