In [4]:
import glob
import os
from typing import List
# import jieba

import numpy as np
import pandas as pd
# import tensorflow as tf
import transformers
from tqdm.notebook import tqdm
import warnings
import xgboost as xgb
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
def read_notebook(path: str) -> pd.DataFrame:
    """Load one notebook JSON file into a DataFrame of its cells.

    Args:
        path: Path to a notebook ``.json`` file readable by ``pd.read_json``.

    Returns:
        A DataFrame whose index is named ``cell_id``, with the columns read
        from the file (``cell_type`` as category, ``source`` as str) plus an
        ``id`` column holding the file's base name up to its first dot.
    """
    cells = pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
    # The notebook id is the file name without anything after the first ".".
    file_name = os.path.basename(path)
    cells["id"] = file_name.split(".")[0]
    return cells.rename_axis("cell_id")



def get_ranks(base: "pd.Series | List[str]", derived: List[str]) -> List[int]:
    """Return the position in ``base`` of every element of ``derived``.

    Args:
        base: Reference ordering of cell ids. May be a list or a
            ``pd.Series``; a Series is read by its values, not its index.
        derived: Cell ids to look up.

    Returns:
        One zero-based position per element of ``derived``, in the same
        order. If an id occurs more than once in ``base``, the position of
        its first occurrence is used (same as ``list.index``).

    Raises:
        ValueError: If an element of ``derived`` is not present in ``base``.
    """
    # Build the lookup once instead of scanning ``base`` for every query.
    first_position = {}
    for position, item in enumerate(base):
        # setdefault keeps the first occurrence, matching list.index().
        first_position.setdefault(item, position)

    ranks = []
    for item in derived:
        if item not in first_position:
            raise ValueError(f"{item!r} is not in base")
        ranks.append(first_position[item])
    return ranks



# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [5]:
# Gather every training notebook file and fail early with a clear message
# if none are found (pd.concat would otherwise raise on an empty list).
paths = glob.glob(os.path.join('AI4Code', "train", "*.json"))
if not paths:
    raise FileNotFoundError(
        f"No notebook JSON files found under {os.path.join('AI4Code', 'train')!r}"
    )

# One row per cell, ordered by notebook id; cell order inside a notebook is
# left as read from the file (sort_remaining=False).
df = (
    pd.concat([read_notebook(x) for x in tqdm(paths, desc="Concat")])
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
).reset_index()

# rank: running position of the cell among cells of the same type in the
# same notebook; pct_rank: that position as a fraction of the group.
# observed=True is passed explicitly so the result does not depend on the
# pandas default for categorical group keys.
# NOTE(review): this takes the on-disk cell order as the target order —
# confirm that holds for every cell type in this dataset.
df["rank"] = df.groupby(["id", "cell_type"], observed=True).cumcount()
df["pct_rank"] = df.groupby(["id", "cell_type"], observed=True)["rank"].rank(pct=True)

Concat:   0%|          | 0/139256 [00:00<?, ?it/s]

In [6]:
# Replace NaN with empty strings, then drop cells whose text is blank.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the original frame.
df['source'] = df['source'].fillna('')
df = df[df['source'].str.strip() != ''].copy()

# Force every value to str, then flatten each cell to a single line:
# newlines and dots are both replaced by spaces.
df['source'] = (
    df['source']
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
)

# Plain text file, one cell per line.
with open('text_data.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in df['source'])

# Labelled file for supervised fastText: "__label__<rank> <text>" per line.
# Written as UTF-8 like the file above, and iterating the two columns
# directly avoids building a Series per row as iterrows() does.
with open('fasttext_data.txt', 'w', encoding='utf-8') as f:
    for rank, text in zip(df['rank'], df['source']):
        f.write(f'__label__{rank} {text}\n')

In [7]:
import fasttext

# Train a supervised fastText model on the labelled file; wordNgrams=2
# makes it use bigrams.
fasttext_options = {'input': 'fasttext_data.txt', 'wordNgrams': 2}
model = fasttext.train_supervised(**fasttext_options)

# Next step: create an embedding for every paragraph.





Read 205M words
Number of words:  11191425
Number of labels: 810
Progress: 100.0% words/sec/thread:   19567 lr:  0.000000 avg.loss:  4.147717 ETA:   0h 0m 0s


In [8]:
# Store the fastText sentence vector of each cell's text in a new column.
df['embedding'] = df['source'].map(model.get_sentence_vector)

In [10]:
# Split the data into training and test sets.
import xgboost as xgb

from scipy.stats import kendalltau
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split

# Stack the per-row vectors into one 2-D array (rows = cells) instead of a
# Python list of arrays: this is the input layout train_test_split and
# xgb.DMatrix handle natively.
X = np.stack(df['embedding'].to_list())
y = df['rank']

# NOTE(review): rows are split at random, so cells of the same notebook can
# land in both sets — confirm that is acceptable for this evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [11]:
import xgboost as xgb

# Wrap both splits in DMatrix, the data container XGBoost works with.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Model parameters.
# NOTE(review): a ranking objective is used but no query/group information
# is attached to either DMatrix — confirm this is intended.
params = dict(
    objective='rank:pairwise',
    eta=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=6,
    eval_metric='ndcg',
)

# Train the model for 100 boosting rounds.
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

# Predict on the test set.
y_pred = bst.predict(dtest)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import kendalltau
from sklearn.metrics import ndcg_score

In [13]:
# NDCG with the whole test set passed as a single query (one row of true
# ranks against one row of predicted scores).
true_ranks = [y_test]
predicted_scores = [y_pred]
ndcg = ndcg_score(true_ranks, predicted_scores)
print(f'NDCG: {ndcg}')

# Kendall's tau between the true ranks and the predicted scores.
kendall_result = kendalltau(y_test, y_pred)
tau, p_value = kendall_result
print(f'Kendall Tau Score: {tau}')
print(f'P-value: {p_value}')

NDCG: 0.9293989134777952
Kendall Tau Score: 0.10297401267864703
P-value: 0.0


In [14]:
# Save the trained booster under two file names: first a JSON file, then a
# second copy with the ".model" extension.
# NOTE(review): the output format is presumably chosen from the file
# extension — confirm what ".model" produces on the installed XGBoost.
for model_path in ('xgboost_model.json', 'xgboost_model.model'):
    bst.save_model(model_path)

  bst.save_model('xgboost_model.model')


In [None]:



# Re-split with a 20% hold-out and retrain the ranker.
# The features are the stacked embedding vectors: xgb.DMatrix needs numeric
# input, so the raw text column cannot be passed to it directly.
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(df['embedding'].to_list()), df['rank'], test_size=0.2, random_state=42
)

# Convert both splits to DMatrix format.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Model parameters.
# NOTE(review): a ranking objective is used but no query/group information
# is attached to either DMatrix — confirm this is intended.
params = {
    'objective': 'rank:pairwise',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'eval_metric': 'ndcg'
}

# Train the model.
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict on the test set.
y_pred = bst.predict(dtest)


In [None]:
# Gather the training notebook files. glob returns them in no guaranteed
# order, so they are sorted to make the 20000-file subset below the same on
# every run.
paths = sorted(glob.glob(os.path.join('/kaggle/input/ai4code/AI4Code', "train", "*.json")))
if not paths:
    raise FileNotFoundError(
        "No notebook JSON files found under "
        f"{os.path.join('/kaggle/input/ai4code/AI4Code', 'train')!r}"
    )

# One row per cell for the first 20000 notebooks, ordered by notebook id;
# cell order inside a notebook is left as read from the file.
df = (
    pd.concat([read_notebook(x) for x in tqdm(paths[:20000], desc="Concat")])
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
).reset_index()

# rank: running position of the cell among cells of the same type in the
# same notebook; pct_rank: that position as a fraction of the group.
# observed=True is passed explicitly so the result does not depend on the
# pandas default for categorical group keys.
df["rank"] = df.groupby(["id", "cell_type"], observed=True).cumcount()
df["pct_rank"] = df.groupby(["id", "cell_type"], observed=True)["rank"].rank(pct=True)

In [None]:
# Replace NaN with empty strings, then drop cells whose text is blank.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the original frame.
df['source'] = df['source'].fillna('')
df = df[df['source'].str.strip() != ''].copy()

# Force every value to str, then flatten each cell to a single line:
# newlines and dots are both replaced by spaces.
df['source'] = (
    df['source']
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
)

# Plain text file, one cell per line.
with open('text_data.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in df['source'])

# Labelled file for supervised fastText: "__label__<rank> <text>" per line.
# Written as UTF-8 like the file above, and iterating the two columns
# directly avoids building a Series per row as iterrows() does.
with open('fasttext_data.txt', 'w', encoding='utf-8') as f:
    for rank, text in zip(df['rank'], df['source']):
        f.write(f'__label__{rank} {text}\n')