In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from catboost import CatBoostClassifier, Pool, metrics, cv
from tqdm import tqdm
import numpy as np
import re
import torch.optim as optim
import matplotlib.pyplot as plt
import wandb
from transformers import get_cosine_schedule_with_warmup


import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Скачиваем и обрабатываем данные
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')  
train_data = train_data[['text', 'target']] 


train_data.loc[train_data['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target'] = 1
train_data.loc[train_data['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 'target'] = 1
train_data.loc[train_data['text'] == 'To fight bioterrorism sir.', 'target'] = 1
train_data.loc[train_data['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target'] = 1
train_data.loc[train_data['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target'] = 1
train_data.loc[train_data['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target'] = 1
train_data.loc[train_data['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target'] = 1
train_data.loc[train_data['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target'] = 1
train_data.loc[train_data['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target'] = 0
train_data.loc[train_data['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target'] = 1
train_data.loc[train_data['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target'] = 0
train_data.loc[train_data['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 'target'] = 1
train_data.loc[train_data['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 'target'] = 1
train_data.loc[train_data['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target'] = 1
train_data.loc[train_data['text'] == "Caution: breathing may be hazardous to your health.", 'target'] = 0
train_data.loc[train_data['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 'target'] = 1
train_data.loc[train_data['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 'target'] = 1
train_data.loc[train_data['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 'target'] = 0


test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# **Bert + catboost**

In [3]:
from transformers import AutoTokenizer, AutoModel

class Bert(nn.Module):
    def __init__(self, trainable = False):
        super().__init__()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert = AutoModel.from_pretrained('bert-base-uncased').to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        if trainable == False:
            for m in self.bert.modules():
                for name, params in m.named_parameters():
                    params.requires_grad = False
        self.target_indx = 0
        

    def forward(self, input):

        encoding = self.tokenizer.batch_encode_plus(
            input,  # List of input texts
            padding="max_length",
            max_length=512,  # Pad to the maximum sequence length
            truncation=True,  # Truncate to the maximum sequence length if necessary
            return_tensors='pt',  # Return PyTorch tensors
            add_special_tokens=True  # Add special tokens CLS and SEP
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        out = self.bert(input_ids, attention_mask, output_hidden_states=True)
        out = out[0][:,self.target_indx,:]

        return out
    
class dataset_bilder(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
    
    
class bert_catboost(nn.Module):
    def __init__(self, params):
        super().__init__()
        self.bert = Bert()
        self.catboost = CatBoostClassifier(**params)
        
    def train_catboost(self, train, val, batch_train_size, batch_val_size):
        bert_output = []
        bert_output_val = []
        target = []
        target_val = []
        
        for X_train, y_train in tqdm(train, desc="Train", colour="CYAN"):
            if y_train.shape[0] == batch_train_size:
                bert_output.append(self.bert(X_train).tolist())
                target.append(y_train.tolist())
            
        for X_val, y_val in tqdm(val, desc="Val", colour="CYAN"):
            if y_val.shape[0] == batch_val_size:
                bert_output_val.append(self.bert(X_val).tolist())
                target_val.append(y_val.tolist())
        
        bert_output = np.array(bert_output)
        bert_output = bert_output.reshape(-1, bert_output.shape[-1])
        bert_output_val = np.array(bert_output_val)
        bert_output_val = bert_output_val.reshape(-1, bert_output_val.shape[-1])
        
        target = np.array(target)
        target = target.reshape(-1)
        target_val = np.array(target_val)
        target_val = target_val.reshape(-1)
        
        self.catboost.fit(
            bert_output, target,
            eval_set=(bert_output_val, target_val),
            #logging_level='Verbose',  # you can uncomment this for text output
            plot=True
        );
        
        
    def forward(self, input):
        output = self.bert(input).tolist()
        output = self.catboost.predict(output)
        
        return output

    
class model_testment(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        
    def take_accuracy(self,test_data):
        first_iter = True
        for X, y in tqdm(test_data, desc="Testment", colour="CYAN"):
            if first_iter == True:
                model_prediction = self.model(X)
                target = y.numpy() 
                first_iter = False
            else:
                model_prediction = np.concatenate((model_prediction, self.model(X)), axis = 0)
                target = np.concatenate((target, y.numpy()), axis = 0)
        
        return len(model_prediction[model_prediction == target])/len(model_prediction)
    
    def create_result(self, pd_test):
        texts = pd_test['text'].values.tolist()   
        model_output = []
        for text in texts:
            model_output.append(self.model([text])[0])
        model_output = np.array(model_output)
        
        submission = pd.DataFrame({'id': pd_test['id'], 'target': model_output})
        
        return submission
    
    def forward(self, input):
        return self.model(input)

In [4]:
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
#Train-test split

X_train, X_val, y_train, y_val = train_test_split(train_data.text.values.tolist(), train_data.target.values.tolist(), test_size=0.01)

train_dataset = dataset_bilder(X_train, y_train)
val_dataset = dataset_bilder(X_val, y_val)

batch_train_size = 64
batch_val_size = 64

train = DataLoader(
        train_dataset,
        batch_size=batch_train_size,
        num_workers=4,
        shuffle=True,
        collate_fn=None,
    )

val = DataLoader(
        val_dataset,
        batch_size=batch_val_size,
        num_workers=4,
        shuffle=False,
        collate_fn=None,
    )

In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type' : 'GPU'
}

model = bert_catboost(params)
model.train_catboost(train, val, batch_train_size, batch_val_size)

acc = model_testment(model).take_accuracy(val)
print('accuracy: ' + str(acc))

submission = model_testment(model).create_result(test_data)
submission.to_csv('submission.csv', index=False)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train: 100%|[36m██████████[0m| 118/118 [02:01<00:00,  1.03s/it]
Val: 100%|[36m██████████[0m| 2/2 [00:01<00:00,  1.72it/s]


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Testment: 100%|[36m██████████[0m| 2/2 [00:01<00:00,  1.38it/s]


accuracy: 0.8051948051948052


In [30]:
acc = model_testment(model).take_accuracy(val)
print('accuracy: ' + str(acc))

Testment: 100%|[36m██████████[0m| 24/24 [00:26<00:00,  1.12s/it]

accuracy: 0.8073089700996677





In [6]:
submission = model_testment(model).create_result(test_data)
submission.to_csv('submission.csv', index=False)

#  **BertForSequenceClassification**

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = True
    
    return model


class Bert_classification(nn.Module): # создаем класс с бертом и токенайзером
    def __init__(self, dowload_model_checkpoint = False):
        super().__init__()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        if dowload_model_checkpoint == False:
            self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2 ).to(self.device)
        else:
            self.bert = load_checkpoint('/kaggle/input/bert_class/pytorch/default/1/checkpoint.pth')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, input):
        encoding = self.tokenizer.batch_encode_plus(
            input,  # List of input texts
            padding="max_length",
            max_length=128,  # Pad to the maximum sequence length
            truncation=True,  # Truncate to the maximum sequence length if necessary
            return_tensors='pt',  # Return PyTorch tensors
            #add_special_tokens=True  # Add special tokens CLS and SEP
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        out = self.bert(input_ids, attention_mask)

        return out.logits
    
class dataset_bilder(Dataset): # для создания датасета
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    
class model_usage(): # создем класс с кодом для обучения, тестирования и сохранению результата
    def __init__(self, model):
        self.model = model 
    
    def train(self, dataloader, optimizer, loss_func, scheduler, epochs):
        self.model.bert.train()
        for _ in range(epochs):
            for texts, labels in tqdm(dataloader, desc="Epoch", colour="GREEN"):
                labels = labels.to(device)
                optimizer.zero_grad()
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                wandb.log({"loss_val": loss})
                loss.backward()
                optimizer.step()
                scheduler.step()
                
            checkpoint = {'model': self.model,
              'state_dict': self.model.state_dict()}

            torch.save(checkpoint, 'checkpoint_'+ str(_) +'.pth')

    def test(self, dataloader, loss_func):
        self.model.eval()
        testloss, correct = 0, 0
        num_batches = len(dataloader)
        size = len(dataloader.dataset)
        
        with torch.no_grad():
            for texts, labels in tqdm(dataloader, desc="Eval", colour="CYAN"):
                labels = labels.to(device)
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                testloss += loss.item()
                preds = torch.argmax(output, dim=1)
                correct += (preds == labels).type(torch.float).sum().item()

        correct /= size
        testloss /= num_batches

        print(f"Testment: \nAccuracy: {(100*correct):>0.1f}%, Avg loss: {testloss:>8f} \n")
        
    def create_result(self, pd_test):
        texts = pd_test['text'].values.tolist()   
        model_output = []
        for text in tqdm(texts, desc="Creating", colour="CYAN"):
            model_output.append(torch.argmax(self.model([text])).item())
        model_output = np.array(model_output)
        
        submission = pd.DataFrame({'id': pd_test['id'], 'target': model_output})
        
        return submission

In [28]:
## Сплитаем дату
from sklearn.model_selection import train_test_split  # To split data into training and testing sets

X_train, X_val, y_train, y_val = train_test_split(train_data.text.values.tolist(), train_data.target.values.tolist(), test_size=0.01)

#Создаем даталоадер для обучения
train_dataset = dataset_bilder(X_train, y_train)
val_dataset = dataset_bilder(X_val, y_val)

batch_train_size = 128
batch_val_size = 128

train = DataLoader(
        train_dataset,
        batch_size=batch_train_size,
        num_workers=4,
        shuffle=True,
        collate_fn=None,
    )

val = DataLoader(
        val_dataset,
        batch_size=batch_val_size,
        num_workers=4,
        shuffle=False,
        collate_fn=None,
    )
# Инициализируем модель
model = Bert_classification(dowload_model_checkpoint = False)

# Создаем вещи для обучения
epochs = 10
LR = 1e-6
optimizer = optim.AdamW(model.bert.parameters(), lr = LR)
loss_func = nn.CrossEntropyLoss()
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=120, num_training_steps= 59 * epochs)  # 226 - число батчей в датасете


wandb.login()
wandb.init(
            project="BertForSequenceClassification",
            config={
                "config": 'Tuning',
                "epoch": str(epochs),
                "lr": str(LR),
            })

# Обучаем
model_f = model_usage(model)
model_f.train(train, optimizer, loss_func, scheduler, epochs)                            
Bert_model = model_f.model

# Тестируем
model_f.test(val, loss_func)

# Сохраняем результат на тест дате
submission = model_f.create_result(test_data)
submission.to_csv('submission.csv', index=False)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

KeyboardInterrupt



In [None]:
checkpoint = {'model': Bert_model.bert,
              'state_dict': Bert_model.bert.state_dict()}

torch.save(checkpoint, 'checkpoint.pth')

# **RobertaForSequenceClassification**

In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Roberta_classification(nn.Module): # создаем класс с бертом и токенайзером
    def __init__(self):
        super().__init__()
        self.model_name = "roberta-base"
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = RobertaForSequenceClassification.from_pretrained(self.model_name, num_labels=2).to(self.device)
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)

    def forward(self, input):
        encoding = self.tokenizer.batch_encode_plus(
            input,  # List of input texts
            padding="max_length",
            max_length=128,  # Pad to the maximum sequence length
            truncation=True,  # Truncate to the maximum sequence length if necessary
            return_tensors='pt',  # Return PyTorch tensors
            add_special_tokens=True  # Add special tokens CLS and SEP
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        out = self.model(input_ids, attention_mask)

        return out.logits
    
class dataset_bilder(Dataset): # для создания датасета
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    
class model_usage(): # создем класс с кодом для обучения, тестирования и сохранению результата
    def __init__(self, model):
        self.model = model 
    
    def train(self, dataloader, optimizer, loss_func, epochs, val_data):
        self.model.model.train()
        for _ in range(epochs):
            for texts, labels in tqdm(dataloader, desc="Epoch", colour="GREEN"):
                labels = labels.to(device)
                optimizer.zero_grad()
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                wandb.log({"loss_val": loss})
                loss.backward()
                optimizer.step()
            
            self.test(val_data, loss_func)
            
            checkpoint = {'model': self.model,
              'state_dict': self.model.state_dict()}

            torch.save(checkpoint, 'checkpoint_'+ str(_) +'.pth')

    def test(self, dataloader, loss_func):
        self.model.eval()
        testloss, correct = 0, 0
        num_batches = len(dataloader)
        size = len(dataloader.dataset)
        
        with torch.no_grad():
            for texts, labels in tqdm(dataloader, desc="Eval", colour="CYAN"):
                labels = labels.to(device)
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                testloss += loss.item()
                preds = torch.argmax(output, dim=1)
                correct += (preds == labels).type(torch.float).sum().item()

        correct /= size
        testloss /= num_batches

        print(f"Testment: \nAccuracy: {(100*correct):>0.1f}%, Avg loss: {testloss:>8f} \n")
        
    def create_result(self, pd_test):
        texts = pd_test['text'].values.tolist()   
        model_output = []
        for text in tqdm(texts, desc="Creating", colour="CYAN"):
            model_output.append(torch.argmax(self.model([text])).item())
        model_output = np.array(model_output)
        
        submission = pd.DataFrame({'id': pd_test['id'], 'target': model_output})
        
        return submission

NameError: name 'torch' is not defined

In [None]:
# Сплитаем дату
from sklearn.model_selection import train_test_split  # To split data into training and testing sets

X_train, X_val, y_train, y_val = train_test_split(train_data.text.values.tolist(), train_data.target.values.tolist(), test_size=0.2)

#Создаем даталоадер для обучения
train_dataset = dataset_bilder(X_train, y_train)
val_dataset = dataset_bilder(X_val, y_val)

batch_train_size = 128
batch_val_size = 128

train = DataLoader(
        train_dataset,
        batch_size=batch_train_size,
        num_workers=4,
        shuffle=True,
        collate_fn=None,
    )

val = DataLoader(
        val_dataset,
        batch_size=batch_val_size,
        num_workers=4,
        shuffle=False,
        collate_fn=None,
    )
# Инициализируем модель
model = Roberta_classification()

# Создаем вещи для обучения
epochs = 10
LR = 1e-6
optimizer = optim.AdamW(model.model.parameters(), lr = LR)
loss_func = nn.CrossEntropyLoss()

wandb.login()
wandb.init(
            project="RobertaForSequenceClassification",
            config={
                "config": 'Tuning',
                "epoch": str(epochs),
                "lr": str(LR),
            })

# Обучаем и тестируем на каждой эпохе
model_f = model_usage(model)
model_f.train(train, optimizer, loss_func, epochs, val)                            
Lamma_model = model_f.model


# Сохраняем результат на тест дате
submission = model_f.create_result(test_data)
submission.to_csv('submission.csv', index=False)

# **Roberta + CatBoost**

In [3]:
from transformers import RobertaModel, RobertaTokenizer

class Bert(nn.Module):
    def __init__(self, trainable = False):
        super().__init__()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert = RobertaModel.from_pretrained("roberta-base").to(self.device)
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        if trainable == False:
            for m in self.bert.modules():
                for name, params in m.named_parameters():
                    params.requires_grad = False
        self.target_indx = 0
        

    def forward(self, input):

        encoding = self.tokenizer.batch_encode_plus(
            input,  # List of input texts
            padding="max_length",
            max_length=512,  # Pad to the maximum sequence length
            truncation=True,  # Truncate to the maximum sequence length if necessary
            return_tensors='pt',  # Return PyTorch tensors
            add_special_tokens=True  # Add special tokens CLS and SEP
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        out = self.bert(input_ids, attention_mask, output_hidden_states=True)
        out = out[0][:,self.target_indx,:]

        return out
    
class dataset_bilder(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
    
    
class bert_catboost(nn.Module):
    def __init__(self, params):
        super().__init__()
        self.bert = Bert()
        self.catboost = CatBoostClassifier(**params)
        
    def train_catboost(self, train, val, batch_train_size, batch_val_size):
        bert_output = []
        bert_output_val = []
        target = []
        target_val = []
        
        for X_train, y_train in tqdm(train, desc="Train", colour="CYAN"):
            if y_train.shape[0] == batch_train_size:
                bert_output.append(self.bert(X_train).tolist())
                target.append(y_train.tolist())
            
        for X_val, y_val in tqdm(val, desc="Val", colour="CYAN"):
            if y_val.shape[0] == batch_val_size:
                bert_output_val.append(self.bert(X_val).tolist())
                target_val.append(y_val.tolist())
        
        bert_output = np.array(bert_output)
        bert_output = bert_output.reshape(-1, bert_output.shape[-1])
        bert_output_val = np.array(bert_output_val)
        bert_output_val = bert_output_val.reshape(-1, bert_output_val.shape[-1])
        
        target = np.array(target)
        target = target.reshape(-1)
        target_val = np.array(target_val)
        target_val = target_val.reshape(-1)
        
        self.catboost.fit(
            bert_output, target,
            eval_set=(bert_output_val, target_val),
            #logging_level='Verbose',  # you can uncomment this for text output
            plot=True
        );
        
        
    def forward(self, input):
        output = self.bert(input).tolist()
        output = self.catboost.predict(output)
        
        return output

    
class model_testment(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        
    def take_accuracy(self,test_data):
        first_iter = True
        for X, y in tqdm(test_data, desc="Testment", colour="CYAN"):
            if first_iter == True:
                model_prediction = self.model(X)
                target = y.numpy() 
                first_iter = False
            else:
                model_prediction = np.concatenate((model_prediction, self.model(X)), axis = 0)
                target = np.concatenate((target, y.numpy()), axis = 0)
        
        return len(model_prediction[model_prediction == target])/len(model_prediction)
    
    def create_result(self, pd_test):
        texts = pd_test['text'].values.tolist()   
        model_output = []
        for text in texts:
            model_output.append(self.model([text])[0])
        model_output = np.array(model_output)
        
        submission = pd.DataFrame({'id': pd_test['id'], 'target': model_output})
        
        return submission
    
    def forward(self, input):
        return self.model(input)

In [4]:
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
#Train-test split

X_train, X_val, y_train, y_val = train_test_split(train_data.text.values.tolist(), train_data.target.values.tolist(), test_size=0.01)

train_dataset = dataset_bilder(X_train, y_train)
val_dataset = dataset_bilder(X_val, y_val)

batch_train_size = 64
batch_val_size = 64

train = DataLoader(
        train_dataset,
        batch_size=batch_train_size,
        num_workers=4,
        shuffle=True,
        collate_fn=None,
    )

val = DataLoader(
        val_dataset,
        batch_size=batch_val_size,
        num_workers=4,
        shuffle=False,
        collate_fn=None,
    )


import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'task_type' : 'GPU'
}

model = bert_catboost(params)
model.train_catboost(train, val, batch_train_size, batch_val_size)

acc = model_testment(model).take_accuracy(val)
print('accuracy: ' + str(acc))

submission = model_testment(model).create_result(test_data)
submission.to_csv('submission.csv', index=False)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Train: 100%|[36m██████████[0m| 118/118 [02:15<00:00,  1.15s/it]
Val: 100%|[36m██████████[0m| 2/2 [00:01<00:00,  1.56it/s]


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Testment: 100%|[36m██████████[0m| 2/2 [00:01<00:00,  1.26it/s]


accuracy: 0.8571428571428571


In [5]:
submission = model_testment(model).create_result(test_data)
submission.to_csv('submission.csv', index=False)

# **ElectraForSequenceClassification**

In [9]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Electra_classification(nn.Module): # создаем класс с бертом и токенайзером
    def __init__(self):
        super().__init__()
        self.model_name = "google/electra-small-discriminator"
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = ElectraForSequenceClassification.from_pretrained(self.model_name, num_labels=2).to(self.device)
        self.tokenizer = ElectraTokenizer.from_pretrained(self.model_name)

    def forward(self, input):
        encoding = self.tokenizer.batch_encode_plus(
            input,  # List of input texts
            padding="max_length",
            max_length=128,  # Pad to the maximum sequence length
            truncation=True,  # Truncate to the maximum sequence length if necessary
            return_tensors='pt',  # Return PyTorch tensors
            add_special_tokens=True  # Add special tokens CLS and SEP
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        out = self.model(input_ids, attention_mask)

        return out.logits
    
class dataset_bilder(Dataset): # для создания датасета
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    
class model_usage(): # создем класс с кодом для обучения, тестирования и сохранению результата
    def __init__(self, model):
        self.model = model 
    
    def train(self, dataloader, optimizer, loss_func, epochs, val_data):
        self.model.model.train()
        for _ in range(epochs):
            for texts, labels in tqdm(dataloader, desc="Epoch", colour="GREEN"):
                labels = labels.to(device)
                optimizer.zero_grad()
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                wandb.log({"loss_val": loss})
                loss.backward()
                optimizer.step()
            
            self.test(val_data, loss_func)
            
            checkpoint = {'model': self.model,
              'state_dict': self.model.state_dict()}

            torch.save(checkpoint, 'checkpoint_'+ str(_) +'.pth')

    def test(self, dataloader, loss_func):
        self.model.eval()
        testloss, correct = 0, 0
        num_batches = len(dataloader)
        size = len(dataloader.dataset)
        
        with torch.no_grad():
            for texts, labels in tqdm(dataloader, desc="Eval", colour="CYAN"):
                labels = labels.to(device)
                output = self.model(texts)
                labels = labels.long()
                loss = loss_func(output, labels)
                testloss += loss.item()
                preds = torch.argmax(output, dim=1)
                correct += (preds == labels).type(torch.float).sum().item()

        correct /= size
        testloss /= num_batches

        print(f"Testment: \nAccuracy: {(100*correct):>0.1f}%, Avg loss: {testloss:>8f} \n")
        
    def create_result(self, pd_test):
        texts = pd_test['text'].values.tolist()   
        model_output = []
        for text in tqdm(texts, desc="Creating", colour="CYAN"):
            model_output.append(torch.argmax(self.model([text])).item())
        model_output = np.array(model_output)
        
        submission = pd.DataFrame({'id': pd_test['id'], 'target': model_output})
        
        return submission

In [12]:
# Сплитаем дату
from sklearn.model_selection import train_test_split  # To split data into training and testing sets

X_train, X_val, y_train, y_val = train_test_split(train_data.text.values.tolist(), train_data.target.values.tolist(), test_size=0.2)

#Создаем даталоадер для обучения
train_dataset = dataset_bilder(X_train, y_train)
val_dataset = dataset_bilder(X_val, y_val)

batch_train_size = 128
batch_val_size = 128

train = DataLoader(
        train_dataset,
        batch_size=batch_train_size,
        num_workers=4,
        shuffle=True,
        collate_fn=None,
    )

val = DataLoader(
        val_dataset,
        batch_size=batch_val_size,
        num_workers=4,
        shuffle=False,
        collate_fn=None,
    )
# Инициализируем модель
model = Electra_classification()

# Создаем вещи для обучения
epochs = 8
LR = 1e-5
optimizer = optim.AdamW(model.model.parameters(), lr = LR)
loss_func = nn.CrossEntropyLoss()

wandb.login()
wandb.init(
            project="ElectraForSequenceClassification",
            config={
                "config": 'Tuning',
                "epoch": str(epochs),
                "lr": str(LR),
            })

# Обучаем и тестируем на каждой эпохе
model_f = model_usage(model)
model_f.train(train, optimizer, loss_func, epochs, val)                            
Lamma_model = model_f.model


# Сохраняем результат на тест дате
submission = model_f.create_result(test_data)
submission.to_csv('submission.csv', index=False)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss_val,█████▇▆▇▅▆▅▅▄▅▄▄▅▃▄▃▃▄▃▃▄▃▃▃▃▂▃▂▃▂▃▂▁▃▃▂

0,1
loss_val,0.19628


Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.91it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  4.05it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.609466 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.97it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  7.18it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.481980 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.98it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  6.94it/s]


Testment: 
Accuracy: 62.5%, Avg loss: 0.568511 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.97it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  6.75it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.403331 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.96it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  7.09it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.419730 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.98it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  6.78it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.441447 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.97it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  6.63it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.393552 



Epoch: 100%|[32m██████████[0m| 60/60 [00:20<00:00,  2.98it/s]
Eval: 100%|[36m██████████[0m| 1/1 [00:00<00:00,  6.22it/s]


Testment: 
Accuracy: 87.5%, Avg loss: 0.395433 



Creating: 100%|[36m██████████[0m| 3263/3263 [00:41<00:00, 78.81it/s]


In [7]:
true_targets = pd.read_csv('/kaggle/input/leaked-data-test/submission_leaked_data.csv')

true_targets['target'].values

array([1, 1, 1, ..., 1, 1, 1])

In [16]:
submission['target'].values

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [8]:
comparison = true_targets.compare(submission)

# Печать различий
print(comparison)

     target      
       self other
0       1.0   0.0
2       1.0   0.0
5       1.0   0.0
17      0.0   1.0
27      0.0   1.0
...     ...   ...
3234    1.0   0.0
3241    0.0   1.0
3242    1.0   0.0
3246    1.0   0.0
3262    1.0   0.0

[823 rows x 2 columns]
