In [1]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd
import os
import csv
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

2025-07-14 19:49:02.732921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752504542.753530    7660 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752504542.759689    7660 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752504542.775286    7660 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752504542.775308    7660 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752504542.775310    7660 computation_placer.cc:177] computation placer alr

In [2]:
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [3]:
def read_line_csv(spamreader):
    for row in spamreader:
        yield row

In [4]:
train_df = pd.DataFrame(columns = ["Text_1", "Text_2", "Target"])
csvfile = open('/home/egikor/ML/Practic Project/data/train.csv', newline='') 
spamreader = csv.reader(csvfile, delimiter=',')
csv_generator = read_line_csv(spamreader)
next(csv_generator)

for dir_id in sorted(os.listdir("/home/egikor/ML/Practic Project/data/train")):
    result_row = []
    for file_name in sorted(os.listdir(os.path.join("/home/egikor/ML/Practic Project/data/train", dir_id))):
        file_path = os.path.join("/home/egikor/ML/Practic Project/data/train", dir_id, file_name)
        with open(file_path, "r") as f:
            file_content = f.readlines()
            result_row.append(" ".join(file_content))
    train_df.loc[len(train_df)] = [*result_row, int(next(csv_generator)[1])]

train_df['Target'] = train_df['Target'].astype(int)
train_df['Target'] = train_df['Target'] -1
display(train_df)

Unnamed: 0,Text_1,Text_2,Target
0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,0
1,China\n The goal of this project involves achi...,The project aims to achieve an accuracy level ...,1
2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,0
3,China\n The study suggests that multiple star ...,The importance for understanding how stars evo...,1
4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,1
...,...,...,...
90,A main focus of modern cosmology is to underst...,A key focus of modern cosmology is to understa...,1
91,"APEX, as its name suggests, serves as a guide ...","APEX, as its name suggests, serves as a guide ...",0
92,FORS1 and FORS2 are early instruments of the V...,FORS1 and FORS2 are early instruments of the V...,1
93,The observations of the Pluto-Charon system an...,The observations of the Pluto-Charon binary an...,1


In [5]:
class TextPairDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text1 = str(self.df.iloc[idx]['Text_1'])
        text2 = str(self.df.iloc[idx]['Text_2'])
        target = int(self.df.iloc[idx]['Target'])

        encoding1 = self.tokenizer(
            text1, 
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        encoding2 = self.tokenizer(
            text2,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids_0': encoding1['input_ids'].squeeze(0),
            'attention_mask_0': encoding1['attention_mask'].squeeze(0),
            'input_ids_1': encoding2['input_ids'].squeeze(0),
            'attention_mask_1': encoding2['attention_mask'].squeeze(0),
            'target': torch.tensor(target, dtype=torch.float)
        }

In [6]:
class BertPairClassifier(nn.Module):
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.classifier = nn.Sequential(
            nn.Linear(768*4, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1)
        )
        
    def forward(self, input_ids_0, attention_mask_0, input_ids_1, attention_mask_1):
        out_0 = self.bert(input_ids=input_ids_0, attention_mask=attention_mask_0)
        out_1 = self.bert(input_ids=input_ids_1, attention_mask=attention_mask_1)
        
        emb_0 = out_0.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        emb_1 = out_1.last_hidden_state[:, 0, :]
        
        features = torch.cat([
            emb_0,
            emb_1,
            torch.abs(emb_0 - emb_1),
            emb_0 * emb_1
        ], dim=1)
        
        return self.classifier(features).squeeze(1)  # Возвращаем logits

In [7]:
# Инициализация токенизатора
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Создание Dataset и DataLoader
train_dataset = TextPairDataset(train_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 4. Инициализация модели и оптимизатора
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertPairClassifier().to(device)

# Балансировка классов для loss function
pos_weight = torch.tensor([len(train_df)/sum(train_df['Target'])-1]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [8]:
num_epochs = 5 # Количество эпох
best_val_acc = 0 # Переменная для хранения лучшего accuracy 

for epoch in range(num_epochs): # Итерация по эпохам

    model.train() # Перевод модели в режим обучения

    train_loss, train_acc = 0, 0 # Переменная для хранения текущего loss и accuracy 
    

    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'): # Итерация по датасету 
        
        optimizer.zero_grad()
        
        # Получаем токены и маски с батча
        inputs = {
            'input_ids_0': batch['input_ids_0'].to(device),
            'attention_mask_0': batch['attention_mask_0'].to(device),
            'input_ids_1': batch['input_ids_1'].to(device),
            'attention_mask_1': batch['attention_mask_1'].to(device)
        }


        targets = batch['target'].to(device)
        
        # Прямой проход
        logits = model(**inputs)
        
        # Вычисляем loss
        loss = criterion(logits, targets)
        
        # Обратный проход
        loss.backward()

        # Ограничение нормы градиаентов
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Обновляем параметры модели
        optimizer.step()
        
        # Вычисление accuracy
        with torch.no_grad():
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()
            acc = (preds == targets).float().mean()
        
        train_loss += loss.item()
        train_acc += acc.item()
    
    # Валидация
    model.eval()
    
    # Вывод статистики
    avg_train_loss = train_loss / len(train_loader)
    avg_train_acc = train_acc / len(train_loader)
    
    print(f"\nEpoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {avg_train_loss:.4f} | Acc: {avg_train_acc:.4f}")

Epoch 1: 100%|██████████| 12/12 [00:06<00:00,  1.74it/s]



Epoch 1/5:
Train Loss: 0.6459 | Acc: 0.6548


Epoch 2: 100%|██████████| 12/12 [00:06<00:00,  1.89it/s]



Epoch 2/5:
Train Loss: 0.5408 | Acc: 0.8631


Epoch 3: 100%|██████████| 12/12 [00:06<00:00,  1.96it/s]



Epoch 3/5:
Train Loss: 0.4052 | Acc: 0.9896


Epoch 4: 100%|██████████| 12/12 [00:06<00:00,  1.98it/s]



Epoch 4/5:
Train Loss: 0.2788 | Acc: 0.9896


Epoch 5: 100%|██████████| 12/12 [00:06<00:00,  1.88it/s]


Epoch 5/5:
Train Loss: 0.1852 | Acc: 1.0000





In [None]:
class TextPairDatasetTest(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text1 = str(self.df.iloc[idx]['Text_1'])
        text2 = str(self.df.iloc[idx]['Text_2'])

        encoding1 = self.tokenizer(
            text1, 
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        encoding2 = self.tokenizer(
            text2,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids_0': encoding1['input_ids'].squeeze(0),
            'attention_mask_0': encoding1['attention_mask'].squeeze(0),
            'input_ids_1': encoding2['input_ids'].squeeze(0),
            'attention_mask_1': encoding2['attention_mask'].squeeze(0),
            'text_1' : text1,
            'text_2' : text2,
        }

In [15]:
test_df = pd.DataFrame(columns = ["Text_1", "Text_2"])
for dir_id in sorted(os.listdir("/home/egikor/ML/Practic Project/data/test")):
    result_row = []
    for file_name in sorted(os.listdir(os.path.join("/home/egikor/ML/Practic Project/data/test", dir_id))):
        file_path = os.path.join("/home/egikor/ML/Practic Project/data/test", dir_id, file_name)
        with open(file_path, "r") as f:
            file_content = f.readlines()
            result_row.append(" ".join(file_content))
    test_df.loc[len(test_df)] = [*result_row]

display(test_df)

Unnamed: 0,Text_1,Text_2
0,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,This research aimed to understand how star sha...,ChromeDriver music player\n This study focused...
3,Using OmegaCAM's wide field capabilities spann...,"greek translation :\n vazhi (megaCAM), territo..."
4,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...
...,...,...
1063,Alongside the detailed studies mentioned earli...,Alongside the detailed studies mentioned earli...
1064,"At this meeting, we gained a new outlook on th...","At this meeting, we gained a new outlook on th..."
1065,ESO Reflex is designed to handle essential tas...,ESO Reflex is designed to supply essential com...
1066,Even greater angular resolution is possible wi...,Higher angular resolution can also be achieved...


In [29]:
# Создание Dataset и DataLoader
test_dataset = TextPairDatasetTest(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids_0 = batch["input_ids_0"].to(device)
        attention_mask_0 = batch["attention_mask_0"].to(device)
        input_ids_1 = batch["input_ids_1"].to(device)
        attention_mask_1 = batch["attention_mask_1"].to(device)
        outputs = model(
            input_ids_0,
            attention_mask_0,
            input_ids_1,
            attention_mask_1
        )
        preds = torch.sigmoid(outputs).squeeze()
        preds = (preds > 0.5).type(torch.int16)




--------------------- батч ---------------------


['" music " music music music music music music music music music the two telescopes using " music " have been incredibly busy since their launch! they\'re incredibly popular for research on earthly objects like star clusters or celestial bodies like planets or even galaxies far away! they\'ve produced many scientific publications within just a few years - so many so that they dominate by far when it comes to peer reviewed articles published from those telescopes! these musical journey has produced over a hundred articles published through various outlets like journals such as\'nature \'. some notable achievements include discovering near space objects found between star systems as well as identifying cosmic events such as how',
 "underground exploration on sn's birth has provided valuable insights into its structure : early detection : the space telescope data revealed warm underground material around sn's blast zone ( like finding buried treasure ). subsequent studies used different 

--------------------- батч ---------------------


["mlloader's research faced various challenges throughout its journey towards capturing deep space images due its complex nature.. one unexpected issue arose early on : despite initial expectations for significant contrast reduction due to its design principle ( s ), their observations revealed progressively worsening performance over time.. this decline defied typical explanations related external factors impacting their imaging process.. the researchers suspected an internal culprit - a small leak allowing some kind or gasair particles inside their cryostat chamber could potentially freeze onto parts responsible for filtering light... fortunately they had reason enough at hand ; they estimated theoretically how much ice buildup",
 'dinosaur rex was an enormous theropod dinosaur from about 77 million years ago during what we call " the late cretaceous ". they were carnivorous dinosaurs known for their large size relative to other theropod dinosaurs like velociraptors or deinonychuses.

--------------------- батч ---------------------


['the " naomi " project underwent two phases for testing its capabilities at both eso\'s paranal observatory ( the first ) and later at garching ( the second ). these phases involved fourteen missions using various telescopes within an instrument suite called " vlti ". during these tests : functionality checks verified that " naomi " could work correctly as part of other telescopes like gravity pionier under specific configurations like those offered by matisse. performance assessments measured throughput efficiency, potential signal loss due to dark areas or dust clouds, as well as stability through fringe tracking analysis. these measurements compared favorably against results obtained without " naomi',
 'god object like red supergiants lose mass before becoming white dwarfs or exploding as supernovae due competition between gravity pulling them back inward versus other forces like shock waves created by their own movement outwards pushing out gas through radiation pressure on dust p

--------------------- батч ---------------------


['initially limited by its capabilities due to its early stages being online as a telescope system called alma, alma had restrictions on how large images could be created using it. this led researchers to choose one very well known area called " the hubble ultra deep field " as their initial target for creating deep images using alma. after many delays due to technical limitations, they finally completed their observations using alma\'s capabilities. their findings showed promising results despite some noise showing up from those initial observations. they were able identify only about sixteen distinct objects within this area after carefully looking through existing imagery from another telescope called " hubble space telescope " which provided',
 'the erosita telescope will identify clusters using data from various deep surveys like des and decals as well as other telescopes like panstarrs and vst atlas. this process uses information from extended x - ray sources combined with red li

--------------------- батч ---------------------


['scientists use various methods like analyzing fluctuations from cosmic microwave background radiation or studying supernovae explosions ( geometric tests ) to determine how our universe evolved over time – these are key pieces for understanding cosmology today. to get accurate measurements about these changes we need multiple observations combined together because no single method provides complete information about everything that affects our universe\'s evolution. the " 4most " project focuses specifically on studying how things like galaxies form within massive structures called " clusters, " providing valuable information about dark energy - an invisible force driving our universe\'s expansion - through its unique ability to observe distant objects at high redshifts',
 'egorie test has been developed using a unique five stage mirror system designed for maximum efficiency towards achieving high quality images during scientific observation activities such as photogrametry spectrosc

--------------------- батч ---------------------


['underground is there more variation between how past star formation affects the carbon to oxygen isotope ratio compared to changes caused by variations within stellar populations? to answer this question scientifically : we developed an accurate computer model called " galactic chemical evolution " which considers how isotopes are produced throughout different types ( mass ) of stars over their entire lifespan while considering how metallicity impacts element release into interstellar space at various stages during their life cycle. this allows us to understand both local disc data and galactic gradients in isotope abundances across our own galaxy! using this calibrated model combined with simulated scenarios : we created models where we simulate rapid bursts',
 "the cosmic structure survey ( crs ) will provide detailed views of how galaxies are arranged across vast cosmic distances — from tiny voids within clusters all the way out past large structures like galaxy clusters themselve

--------------------- батч ---------------------


["eso doesn't have immediate funds allocated towards building new parts or running them during ctas timeframe but brings valuable experience managing telescopes remotely that benefits ctas scientific goals when combined with their expertise from operating existing telescopes like la palma's vlts elts at their basecamp location near where they are planning to deploy the south portion ( cta south ). after discussing logistics responsibilities between partners involved it was decided that : eso joins as part owner ( 8 voting rights ) while handling operations based off cost neutrality through contributions from all partners involved eso benefits through access : scientists from member states gain access to 10 observing time across",
 "to study star formation in distant galaxies at redshifts between roughly. 3 and. 17, we used two methods : acquiring new observations with the kmos instrument on eso's very large telescope ( vlt ) as well as analyzing existing data collected by muse on anoth

--------------------- батч ---------------------


['underground scientists have had their eyes set on exploring space with large - scale projects known as " public surveys. " to make these ambitious endeavors possible, they recently issued a call to potential researchers interested in joining forces with them through " community surveys " which can take advantage of significant portions ( up to half ) of time allotted by the powerful " 4most " instrument at eso\'s vista telescope over five years. these collaborative efforts will support ongoing investigations from previous public survey presentations at an eso gathering earlier this year as well as those detailed within a special edition published last year ( march ). this approach is fundamental to modern astronomy ;',
 'scientists studied nearby red giant star l2 puppis to understand how massive stars affect their orbiting planets during later stages like when they become giants like our sun one day may become.. they found evidence for dust clouds surrounding it with various structu

KeyboardInterrupt: 