In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Загрузка данных
print("Loading SNP and STR test data...")
snp_data = pd.read_csv("data/raw/FinalReport.csv", sep=";")
str_test = pd.read_csv("data/raw/STR_test.csv", sep=";")


# Предобработка SNP данных
def preprocess_snp(df):
    print("Preprocessing SNP data...")
    df = df.pivot_table(
        index="animal_id", columns="SNP Name", values="Allele1 - AB", aggfunc="first"
    )
    df = df.fillna("0")
    print("Filled missing values with '0'.")
    
    allele_mapping = {"A": 0, "B": 1, "0": np.nan}
    df = df.replace(allele_mapping)
    return df


snp_pivot = preprocess_snp(snp_data)
print(f"SNP pivot data shape: {snp_pivot.shape}")



Loading SNP and STR test data...
SNP data shape: (26272455, 6)
STR test data shape: (28355, 4)
Preprocessing SNP data...
Initial SNP data shape: (26272455, 6)
SNP data shape after pivot: (8442, 2985)
Filled missing values with '0'.
SNP data shape after allele mapping: (8442, 2985)
SNP pivot data shape: (8442, 2985)


In [4]:

# Функция для загрузки моделей
def load_models(model_dir, model_type):
    print(f"Loading {model_type} models from {model_dir}...")
    models = {}
    for file_name in os.listdir(model_dir):
        # Извлекаем имя аллеля и маркера
        if file_name.endswith(".cbm") and model_type == "CatBoost":
            model_name = "_".join(file_name.split("_")[:2])  # Извлекаем имя как 'Allele1_TGLA227'
            model = CatBoostRegressor()
            model.load_model(os.path.join(model_dir, file_name))
            models[model_name] = model
            print(f"Loaded CatBoost model: {model_name}")
        elif file_name.endswith(".json") and model_type == "XGBoost":
            model_name = "_".join(file_name.split("_")[:2])  # Извлекаем имя как 'Allele1_TGLA227'
            model = XGBRegressor()
            model.load_model(os.path.join(model_dir, file_name))
            models[model_name] = model
            print(f"Loaded XGBoost model: {model_name}")
    return models


# Определяем, какой тип модели использовать
model_type = "XGBoost"  # или "CatBoost", в зависимости от того, какие модели использовать
model_dir = f"models/{model_type.lower()}"


models = load_models(model_dir, model_type)



if not models:
    print(f"No models found in {model_dir}. Exiting...")
    exit()

print(f"Total models loaded: {len(models)}")


Loading XGBoost models from ../models/xgboost...
Loaded XGBoost model: Allele1_SPS115
Loaded XGBoost model: Allele1_ETH225
Loaded XGBoost model: Allele2_TGLA53
Loaded XGBoost model: Allele2_INRA023
Loaded XGBoost model: Allele1_TGLA126
Loaded XGBoost model: Allele1_ETH3
Loaded XGBoost model: Allele1_BM1824
Loaded XGBoost model: Allele2_TGLA126
Loaded XGBoost model: Allele2_TGLA122
Loaded XGBoost model: Allele1_ETH10
Loaded XGBoost model: Allele2_ETH3
Loaded XGBoost model: Allele2_ETH225
Loaded XGBoost model: Allele2_SPS115
Loaded XGBoost model: Allele1_INRA023
Loaded XGBoost model: Allele2_BM2113
Loaded XGBoost model: Allele1_TGLA227
Loaded XGBoost model: Allele1_TGLA53
Loaded XGBoost model: Allele1_BM1818
Loaded XGBoost model: Allele1_TGLA122
Loaded XGBoost model: Allele1_BM2113
Loaded XGBoost model: Allele2_BM1824
Loaded XGBoost model: Allele2_BM1818
Loaded XGBoost model: Allele2_ETH10
Loaded XGBoost model: Allele2_TGLA227
Total models loaded: 24


In [8]:
str_markers = str_test["STR Name"].unique()

imputed_data = []
missing_models = 0
for marker in tqdm(str_markers, desc="Predicting alleles for STR markers"):
    for allele in ["Allele1", "Allele2"]:
        target = f"{allele}_{marker}"
        if target in models:
            model = models[target]
            X_pred = snp_pivot.loc[str_test["animal_id"].unique()].fillna(0)
            X_pred = X_pred.apply(pd.to_numeric, errors='coerce').fillna(0)
            X_pred = X_pred.drop(columns=['BovineHD1900015984', 'BovineHD3100000210'])
            y_pred = model.predict(X_pred)
            temp_df = pd.DataFrame(
                {"animal_id": X_pred.index, "STR Name": marker, allele: y_pred}
            )
            imputed_data.append(temp_df)
        else:
            print(f"Model for {target} not found.")
            missing_models += 1

if missing_models > 0:
    print(f"{missing_models} models were missing and skipped.")


Total STR markers: 12


Predicting alleles for STR markers:   0%|          | 0/12 [00:00<?, ?it/s]

Predicting Allele1_TGLA227...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_TGLA227...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:   8%|▊         | 1/12 [00:05<00:55,  5.05s/it]

Predicting Allele1_TGLA126...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_TGLA126...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  17%|█▋        | 2/12 [00:10<00:50,  5.09s/it]

Predicting Allele1_TGLA122...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_TGLA122...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  25%|██▌       | 3/12 [00:15<00:45,  5.09s/it]

Predicting Allele1_TGLA53...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_TGLA53...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  33%|███▎      | 4/12 [00:20<00:40,  5.07s/it]

Predicting Allele1_SPS115...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_SPS115...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  42%|████▏     | 5/12 [00:25<00:35,  5.10s/it]

Predicting Allele1_INRA023...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_INRA023...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  50%|█████     | 6/12 [00:30<00:30,  5.10s/it]

Predicting Allele1_ETH225...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_ETH225...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  58%|█████▊    | 7/12 [00:35<00:25,  5.08s/it]

Predicting Allele1_ETH3...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_ETH3...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  67%|██████▋   | 8/12 [00:40<00:20,  5.09s/it]

Predicting Allele1_BM2113...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_BM2113...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  75%|███████▌  | 9/12 [00:45<00:15,  5.09s/it]

Predicting Allele1_BM1824...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_BM1824...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  83%|████████▎ | 10/12 [00:50<00:10,  5.08s/it]

Predicting Allele1_BM1818...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_BM1818...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers:  92%|█████████▏| 11/12 [00:56<00:05,  5.10s/it]

Predicting Allele1_ETH10...
Shape of SNP data for prediction: (1688, 2983)
Predicting Allele2_ETH10...
Shape of SNP data for prediction: (1688, 2983)


Predicting alleles for STR markers: 100%|██████████| 12/12 [01:01<00:00,  5.10s/it]


In [10]:
# Объединение предсказанных данных
if imputed_data:
    imputed_df = pd.concat(imputed_data, axis=0)

    str_test = str_test.drop(columns=["Allele1", "Allele2"])
    str_test = str_test.merge(imputed_df, on=["animal_id", "STR Name"], how="left")



    output_path = "data/processed/STR_test_imputed.csv"
    str_test.to_csv(output_path, index=False, sep=";")
    print(f"Imputed STR test data saved to {output_path}.")
else:
    print("No imputed data to save. Exiting...")


Merged imputed data shape: (40512, 4)
Merging imputed data with the original STR test data...
STR test data shape after dropping original alleles: (56710, 2)
STR test data shape after merging: (113420, 4)
Imputed STR test data saved to ../data/processed/STR_test_imputed.csv.
