In [61]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

In [62]:
neg = pd.read_csv("/kaggle/input/wb-sch-p2/train_neg.csv")
pos = pd.read_csv("/kaggle/input/wb-sch-p2/train_pos.csv")

In [63]:
data = pd.concat([neg, pos])

In [64]:
data['desc'] = data.apply(lambda x: x if len(x.desk) < 350 else x.desk[:350], axis=1)

In [65]:
data = data.sample(frac=1).reset_index(drop=True)

In [66]:
data['qad'] = data["Question"].astype(str) + "/n" + data["desc"].astype(str)

In [67]:
import re
def clean_text(text):
    text = text.lower()

    text = re.sub(r'[^a-zA-Zа-яА-Я0-9\s]', '', text)
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [68]:
data['qad'] = data['qad'].apply(clean_text)

In [69]:
class CustomDataset(Dataset):
  def __init__(self, questions, labels, tokenizer, max_length):
    self.questions = questions
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx):
    text = str(self.questions[idx])
    label = self.labels[idx]

    encoding = self.tokenizer.encode_plus(
               text,
               add_special_tokens=True,
               max_length=self.max_length,
               return_token_type_ids=False,
               padding='max_length',
               return_attention_mask=True,
               return_tensors='pt',
               truncation=True
           )

    return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
           }

In [70]:
def create_data_loader(df, tokenizer, max_length, batch_size):
  ds = CustomDataset(
           questions=df.qad.to_numpy(),
           labels=df.label.to_numpy(),
           tokenizer=tokenizer,
           max_length=max_length
       )
  return DataLoader(ds, batch_size=batch_size, num_workers=4)

In [71]:
model_name ="intfloat/multilingual-e5-base" #"cointegrated/LaBSE-en-ru"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
data.shape

(2512, 8)

In [73]:
val = data.sample(400, random_state=42)
train = data.drop(val.index)

In [74]:
BATCH_SIZE = 16
MAX_LENGTH = 512

train_data_loader = create_data_loader(train, tokenizer, MAX_LENGTH, BATCH_SIZE)
val_data_loader = create_data_loader(val, tokenizer, MAX_LENGTH, BATCH_SIZE)

training_args = TrainingArguments(
       output_dir='./results',
       num_train_epochs=4,
       per_device_train_batch_size=BATCH_SIZE,
       per_device_eval_batch_size=BATCH_SIZE,
       warmup_steps=500,
       weight_decay=0.01,
       logging_dir='./logs',
       logging_steps=10,
       eval_strategy='steps'
   )

trainer = Trainer(
       model=model,
       args=training_args,
       train_dataset=train_data_loader.dataset,
       eval_dataset=val_data_loader.dataset
   )

In [75]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
10,0.6727,0.680199
20,0.6731,0.669578
30,0.6641,0.66545
40,0.6778,0.656854
50,0.652,0.64486
60,0.662,0.632265
70,0.6281,0.611201
80,0.5424,0.612993
90,0.6363,0.546514
100,0.5686,0.513093


TrainOutput(global_step=264, training_loss=0.39008257425192633, metrics={'train_runtime': 736.634, 'train_samples_per_second': 11.468, 'train_steps_per_second': 0.358, 'total_flos': 2222762195681280.0, 'train_loss': 0.39008257425192633, 'epoch': 4.0})

In [76]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def tokenize_question(question, tokenizer, max_length=512):
    return tokenizer(
        [question],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_length
    )

def predict(question):
    inputs = tokenize_question(question, tokenizer)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
    return predictions
    
    
    
    

In [77]:
val_neg = pd.read_csv("/kaggle/input/wb-sch-p2/valid_250_neg.csv")
val_pos = pd.read_csv("/kaggle/input/wb-sch-p2/valid_250_pos.csv")

In [78]:
valid = pd.concat([val_neg, val_pos])

In [79]:
valid['desc'] = valid.apply(lambda x: x if len(x.desk) < 350 else x.desk[:350], axis=1)
valid['qad'] = valid["Question"].astype(str) + "/n" + valid["desc"].astype(str)
valid['qad'] = valid['qad'].apply(clean_text)

In [80]:
valid['pred'] = valid.apply(lambda x: predict(x.qad)[0] ,axis=1)

In [81]:
valid['pred']

0      0
1      0
2      0
3      0
4      1
      ..
245    1
246    1
247    1
248    1
249    1
Name: pred, Length: 500, dtype: int64

In [82]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precision_score(valid['label'], valid['pred']), recall_score(valid['label'], valid['pred'])

(0.9263565891472868, 0.956)

In [83]:
hf_token = "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"
hf_username = "Hvixze"
model.push_to_hub(f"{hf_username}/e5_base_wb_p2_4ep", token = hf_token) # Online saving
tokenizer.push_to_hub(f"{hf_username}/e5_base_wb_p2_4ep", token = hf_token)

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hvixze/e5_base_wb_p2_4ep/commit/f5e5d90833e1a6c30ef828243e306faf23928dd0', commit_message='Upload tokenizer', commit_description='', oid='f5e5d90833e1a6c30ef828243e306faf23928dd0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Hvixze/e5_base_wb_p2_4ep', endpoint='https://huggingface.co', repo_type='model', repo_id='Hvixze/e5_base_wb_p2_4ep'), pr_revision=None, pr_num=None)