In [1]:
import zipfile

In [2]:
import zipfile
from tqdm import tqdm

def unzip_with_progress(zip_path, extract_to):
    """Extract every member of a zip archive, showing a tqdm progress bar.

    Args:
        zip_path: Path of the archive to read.
        extract_to: Directory the members are extracted into.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # One progress tick per archive member.
        for member in tqdm(archive.namelist(), desc="Extracting", unit="file"):
            archive.extract(member, extract_to)


# Unpack the competition archive into ./AI4Code (slow: the recorded run
# extracted 139263 members in roughly 15 minutes).
unzip_with_progress(zip_path='AI4Code.zip', extract_to='./AI4Code')

Extracting: 100%|██████████| 139263/139263 [14:49<00:00, 156.62file/s]


In [1]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer
from transformers import BertModel

from torch.utils.data import DataLoader


from torch.utils.data import Sampler


import os
import torch
import torch.nn as nn
import torch.optim as optim

from time import localtime, strftime
from time import time
from bisect import bisect

from tqdm import tqdm
from torch.utils.data import Dataset
import json
# from utils import prepare_folders, get_device
# from model import OrderPredictionModel
# from train import Trainer


In [3]:
class Cell:
    """Container for the tokenised form of one notebook cell.

    The three values are stored exactly as passed in; ``CellDataset`` fills
    them with the outputs of ``CellDataset.prepare_data``.
    """

    def __init__(self, input_ids, att_mask, cell_type):
        self.input_ids, self.att_mask, self.cell_type = input_ids, att_mask, cell_type

    def get(self):
        """Return ``(input_ids, att_mask, cell_type)`` as a tuple."""
        return self.input_ids, self.att_mask, self.cell_type
    
import torch




class CellDataset(Dataset):
    """Dataset of pre-tokenised notebook cells that serves ordered cell pairs.

    Every cell of every notebook listed in ``data`` is tokenised once in the
    constructor and kept in memory. Items are addressed with a
    ``(notebook_id, first_cell_id, second_cell_id)`` triple (the format
    ``CellSampler`` yields), not with an integer index.
    """

    def __init__(self, path, data, tokenizer, max_length):
        """Load and tokenise all notebooks referenced by ``data``.

        Args:
            path: Directory containing the ``<notebook_id>.json`` files.
                A trailing path separator is accepted but not required.
            data: DataFrame indexed by notebook id with a ``cell_order``
                column holding the list of cell ids in their true order.
            tokenizer: Callable invoked by ``prepare_data`` with the cell
                source and Hugging Face style keyword arguments.
            max_length: Length each cell is padded / truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.files = {}

        # A notebook with k cells contributes k - 1 pairs per epoch.
        self.n_pair = sum(len(order) - 1 for order in self.data['cell_order'])

        for filename in tqdm(self.data.index):
            cells = self.data.loc[filename, 'cell_order']
            # JSON text is UTF-8 by specification, so do not depend on the
            # platform's default encoding when reading it.
            with open(os.path.join(path, f'{filename}.json'), encoding='utf-8') as file:
                json_code = json.load(file)
            self.files[filename] = {
                cell: Cell(
                    *self.prepare_data(
                        json_code['cell_type'][cell],
                        json_code['source'][cell]
                    )
                )
                for cell in cells
            }

    def __len__(self):
        """Return the number of pairs served per epoch."""
        return self.n_pair

    def __getitem__(self, idx):
        """Return ``((first_cell, second_cell), order)`` for one pair.

        Args:
            idx: Sequence ``(notebook_id, first_cell_id, second_cell_id)``.

        Returns:
            The ``Cell.get()`` tuples of both cells and ``order``, which is
            0 when the first cell precedes the second in the notebook's true
            order and 1 otherwise.
        """
        filename = idx[0]
        first_cell_id = idx[1]
        second_cell_id = idx[2]

        # Look the true order up once and reuse it for both positions.
        cell_order = self.data.loc[filename, 'cell_order']
        first_position = cell_order.index(first_cell_id)
        second_position = cell_order.index(second_cell_id)
        order = 0 if first_position < second_position else 1

        notebook_cells = self.files[filename]
        return (
            (
                notebook_cells[first_cell_id].get(),
                notebook_cells[second_cell_id].get()
            ),
            order
        )

    def prepare_data(
        self,
        cell_type,
        cell_content
    ):
        """Tokenise one cell and encode its type.

        Args:
            cell_type: Cell kind string; ``"code"`` maps to 1, anything else
                to 0.
            cell_content: Source text of the cell.

        Returns:
            ``(input_ids, attention_mask, type_tensor)``. ``type_tensor`` is
            a long tensor of shape ``(1,)``. The first two come straight from
            the tokenizer; with ``return_tensors="pt"`` on a single string
            they are presumably ``(1, max_length)`` - TODO confirm that the
            training loop squeezes the extra dimension after batching.
        """
        tokens = self.tokenizer(
            cell_content,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

        cell_type = 1 if cell_type == "code" else 0
        type_tensor = torch.tensor([cell_type], dtype=torch.long)

        return (
            tokens["input_ids"],
            tokens["attention_mask"],
            type_tensor
        )
    


class CellSampler(Sampler):
    """Sampler yielding ``[notebook_id, cell_a, cell_b]`` index triples.

    For every notebook the true cell order is copied and shuffled, and each
    pair of neighbours in the shuffled list becomes one sample. A notebook
    with k cells therefore contributes k - 1 pairs per pass.

    Args:
        data: DataFrame indexed by notebook id with a ``cell_order`` column
            holding lists of cell ids.
        seed: When not ``None`` (``0`` is a valid seed), shuffling uses a
            NumPy ``Generator`` seeded with this value, so every pass yields
            the same pairs. When ``None``, NumPy's global RNG is used and
            the pairs differ between passes.
    """

    def __init__(self, data, seed=None):
        self.data = data
        self.seed = seed
        self.n_pair = sum(len(order) - 1 for order in self.data['cell_order'])

    def __len__(self):
        """Return the number of pairs produced by one pass."""
        return self.n_pair

    def __iter__(self):
        pairs = []
        for row_index in self.data.index:
            # Shuffle a copy so the ground-truth order in `data` stays intact.
            cells = self.data.loc[row_index, 'cell_order'].copy()
            if self.seed is not None:
                # NOTE: a fresh generator is created per notebook on purpose,
                # which keeps the pairs identical to earlier seeded runs; it
                # also means every notebook is shuffled from the same state.
                np.random.default_rng(self.seed).shuffle(cells)
            else:
                np.random.shuffle(cells)
            pairs.extend(
                [row_index, first, second]
                for first, second in zip(cells, cells[1:])
            )

        yield from pairs

def prepare_folders():
    """Create (or empty) a timestamped checkpoint directory and return it.

    The directory is ``./checkpoints/<dd.mm.YYYY-HH:MM>/`` relative to the
    current working directory. If it already exists (a second call within
    the same minute) everything inside it is deleted so the run starts from
    an empty folder.

    NOTE(review): the ``:`` in the timestamp is not a legal character in
    Windows path names; the format is kept unchanged so existing checkpoint
    paths stay compatible.

    Returns:
        The directory path as a string ending with ``/``.

    Raises:
        OSError: If the path exists but is not a directory, or a file or
            sub-directory cannot be removed.
    """
    current_time = strftime('%d.%m.%Y-%H:%M', localtime())
    savedir = f'./checkpoints/{current_time}/'

    if os.path.isdir(savedir):
        # Walk bottom-up so each directory is already empty when removed.
        for root, dirs, files in os.walk(savedir, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))
    else:
        # Creates ./checkpoints as well when missing; exist_ok avoids the
        # race between checking for the directory and creating it.
        os.makedirs(savedir, exist_ok=True)

    return savedir

def get_device():
    """Return the name of the preferred available torch device.

    Preference order is ``"cuda"``, then ``"mps"``, then ``"cpu"``.

    MPS availability is queried through ``torch.backends.mps``, the
    documented entry point; ``torch.mps.is_available`` does not exist in
    many torch releases and would raise ``AttributeError`` on machines
    without CUDA.
    """
    if torch.cuda.is_available():
        return "cuda"

    # Very old torch builds have no MPS backend module at all.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"

    return "cpu"

def count_inversions(a):
    """Count pairs ``(i, j)`` with ``i < j`` and ``a[i] > a[j]``.

    A sorted copy of the values seen so far is maintained. Each new value is
    placed by binary search (to the right of equal values); every earlier
    value that ends up after it is strictly greater and forms one inversion.
    """
    seen_sorted = []
    total = 0
    for value in a:
        slot = bisect(seen_sorted, value)
        # len(seen_sorted) values came before `value`; those from `slot`
        # onwards are larger than it.
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total

def kendall_tau(ground_truth, predictions):
    """Compute a Kendall-tau style score over a collection of orderings.

    Args:
        ground_truth: Iterable of lists, each the true order of one notebook.
        predictions: Iterable of predicted orders, aligned with
            ``ground_truth``; every predicted element must occur in the
            matching true order.

    Returns:
        ``1 - 4 * inversions / sum(n * (n - 1))`` accumulated over all
        notebooks: 1.0 for perfect orderings, -1.0 for fully reversed ones.
    """
    inversions_sum = 0
    pair_budget = 0
    for true_order, predicted_order in zip(ground_truth, predictions):
        # Position of each predicted element in the true order.
        predicted_ranks = list(map(true_order.index, predicted_order))
        inversions_sum += count_inversions(predicted_ranks)
        size = len(true_order)
        pair_budget += size * (size - 1)
    return 1 - 4 * inversions_sum / pair_budget

class OrderPredictionModel(nn.Module):
    """Pairwise cell-order classifier on top of pooled BERT embeddings.

    Each cell is represented by BERT's pooled output concatenated with an
    8-dimensional embedding of its type; the two cell representations are
    concatenated and passed through a small MLP ending in a sigmoid.

    Args:
        hidden_dim: Width of the hidden fully connected layer.
        dropout_prob: Dropout probability applied after the hidden layer.
    """

    def __init__(self, hidden_dim, dropout_prob=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.type_embedding = nn.Embedding(2, 8)
        # Input width: two pooled embeddings (768 each) plus two type embeddings (8 each).
        self.fc1 = nn.Linear(768 * 2 + 8 * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids1, att_mask1, cell_type1, input_ids2, att_mask2, cell_type2):
        """Score one batch of cell pairs.

        The concatenation along ``dim=1`` requires 2-D pooled embeddings and
        2-D type embeddings, so ``cell_type*`` must be 1-D.
        NOTE(review): ``CellDataset.prepare_data`` returns per-item tensors
        with a leading dimension of 1 - presumably the training loop squeezes
        them before calling this; confirm.

        Returns:
            1-D tensor of sigmoid outputs, one value per pair.
        """
        # BERT runs without gradient tracking, so only the layers below are
        # trained. NOTE(review): no_grad does not switch BERT to eval mode.
        with torch.no_grad():
            pooled_first = self.bert(input_ids1, attention_mask=att_mask1).pooler_output
            pooled_second = self.bert(input_ids2, attention_mask=att_mask2).pooler_output

        features = torch.cat(
            [
                pooled_first,
                self.type_embedding(cell_type1),
                pooled_second,
                self.type_embedding(cell_type2),
            ],
            dim=1,
        )

        hidden = self.fc1(features)
        hidden = self.bn1(hidden)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)

        probabilities = torch.sigmoid(self.fc2(hidden))
        return probabilities.squeeze(1)

In [None]:
# Seed NumPy's global RNG so the notebook-level shuffle below is reproducible.
np.random.seed(42)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
path = 'AI4Code'
print("*" * 80)
print("Reading data")

# One row per notebook id; `cell_order` is turned into a list of cell ids.
info = pd.read_csv(f'{path}/train_orders.csv', index_col='id')
info['cell_order'] = info['cell_order'].apply(lambda order: order.split())
indeces = list(info.index)
np.random.shuffle(indeces)

# Split fractions over the shuffled notebook ids (test takes the remainder).
train_size, valid_size, test_size = 0.7, 0.2, 0.1

train_border = int(train_size * len(indeces))
valid_border = int((train_size + valid_size) * len(indeces))

train_data, valid_data, test_data = (
    info.loc[indeces[:train_border]],
    info.loc[indeces[train_border:valid_border]],
    info.loc[indeces[valid_border:]],
)

# Only the first 50000 / 5000 notebooks of each split are used below.
train_data_short = train_data.head(50000)
valid_data_short = valid_data.head(5000)

# Tokenises every cell up front (slow); the unseeded sampler reshuffles on each pass.
train_dataset = CellDataset(f'{path}/train/', train_data_short, tokenizer, max_length=128)
train_sampler = CellSampler(train_data_short)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    sampler=train_sampler,
    drop_last=True,
)


# model = OrderPredictionModel(128)
# savedir = prepare_folders()
# device = get_device()


********************************************************************************
Reading data


 87%|████████▋ | 43262/50000 [53:01<07:18, 15.38it/s]  

In [None]:
import pickle
from pathlib import Path

def save_tokenized_data(dataset, filename):
    """Pickle ``dataset`` to ``filename``, replacing any existing file.

    Args:
        dataset: Object to serialise (in this notebook a tokenised
            ``CellDataset``).
        filename: Destination path; opened in binary write mode.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(dataset, sink)

# Persist the tokenised training set (the validation set is saved in a later cell).
save_tokenized_data(dataset=train_dataset, filename='train_tokenized_50k.pkl')


In [None]:
# Validation notebooks come from the same train_orders.csv split, so their
# JSON files live under train/ too. The sampler is seeded so every pass
# yields the same pairs.
valid_dataset = CellDataset(f'{path}/train/', valid_data_short, tokenizer, max_length=128)
valid_sampler = CellSampler(valid_data_short, seed=42)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=64,
    sampler=valid_sampler,
    drop_last=True,
)


In [None]:
# Persist the tokenised validation set.
save_tokenized_data(dataset=valid_dataset, filename='valid_tokenized_5k.pkl')