In [15]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score
from optuna.pruners import MedianPruner
from sklearn.metrics import f1_score
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [23]:
def gini_impurity(y):
    """
    Вычисление Gini impurity для списка меток y.
    """
    if len(y) == 0:
        return 0
    
    p = sum(y) / len(y)
    return 1 - p**2 - (1-p)**2

def weighted_gini_impurity(y_left, y_right):
    """
    Вычисление взвешенной Gini impurity для двух подмножеств данных.
    """
    n_left, n_right = len(y_left), len(y_right)
    n_total = n_left + n_right
    
    return (n_left / n_total) * gini_impurity(y_left) + (n_right / n_total) * gini_impurity(y_right)

def best_split_threshold(feature, target):
    """
    Нахождение наилучшего порога разделения для заданного признака.
    """
    # Сортировка признака
    sorted_idx = feature.argsort()
    sorted_feature = feature[sorted_idx]
    sorted_target = target[sorted_idx]

    # Инициализация переменных
    best_gini = float('inf')
    best_threshold = None
    
    # Перебор возможных пороговых значений
    for i in range(1, len(sorted_feature)):
        if sorted_feature[i] != sorted_feature[i-1]:
            threshold = (sorted_feature[i] + sorted_feature[i-1]) / 2
            gini = weighted_gini_impurity(sorted_target[:i], sorted_target[i:])
            if gini < best_gini:
                best_gini = gini
                best_threshold = threshold
    
    
    return best_threshold, best_gini


In [19]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [20]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data.shape, test_cv.shape

((13863, 24), (5942, 23))

In [21]:
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['Chol_HDL_ratio'] = merged_data['Cholesterol'] / merged_data['HDL']
merged_data['ldl_hdl'] = merged_data['LDL'] / merged_data['HDL']
merged_data['map'] = (merged_data['systolic'] + (2 * merged_data['relaxation'])) / 3
merged_data['Waist_Height_ratio'] = merged_data['waist(cm)'] / merged_data['height(cm)']
merged_data['height_nin_110'] = (merged_data['height(cm)'] - 110) / merged_data['weight(kg)']
merged_data['Average_hearing'] = (merged_data['hearing(left)'] + merged_data['hearing(right)']) / 2
merged_data['Average_eyesight'] = (merged_data['eyesight(left)'] + merged_data['eyesight(right)']) / 2
merged_data['ast_alt'] = merged_data['AST'] / merged_data['ALT']
merged_data['Systolic_Diastolic_ratio'] = merged_data['systolic'] / merged_data['relaxation']
merged_data['Atherogenic_coefficient'] = (merged_data['Cholesterol'] - merged_data['HDL']) / merged_data['HDL']
merged_data['BMI_to_age'] = merged_data['BMI'] / merged_data['age']
merged_data['Glucose_to_Cholesterol'] = merged_data['fasting blood sugar'] / merged_data['Cholesterol']
merged_data['Triglycerides_to_HDL'] = merged_data['triglyceride'] / merged_data['HDL']
merged_data['Systolic_to_age'] = merged_data['systolic'] / merged_data['age']
merged_data['Diastolic_to_age'] = merged_data['relaxation'] / merged_data['age']
merged_data['Hemoglobin_to_age'] = merged_data['hemoglobin'] / merged_data['age']
merged_data['GTP_to_age'] = merged_data['Gtp'] / merged_data['age']
merged_data['GTP_to_AST_ALT'] = merged_data['Gtp'] / (merged_data['AST'] + merged_data['ALT'])
merged_data['GTP_to_Cholesterol'] = merged_data['Gtp'] / merged_data['Cholesterol']
merged_data['GTP_index'] = merged_data['Gtp'] / (merged_data['Cholesterol'] + merged_data['triglyceride'])

merged_data.shape

(13863, 45)

In [22]:
infinity_features = merged_data.nunique()[merged_data.nunique() >= 3].index.to_list()
infinity_features, len(infinity_features)

(['age',
  'height(cm)',
  'weight(kg)',
  'waist(cm)',
  'eyesight(left)',
  'eyesight(right)',
  'systolic',
  'relaxation',
  'fasting blood sugar',
  'Cholesterol',
  'triglyceride',
  'HDL',
  'LDL',
  'hemoglobin',
  'Urine protein',
  'serum creatinine',
  'AST',
  'ALT',
  'Gtp',
  'BMI',
  'Chol_HDL_ratio',
  'ldl_hdl',
  'map',
  'Waist_Height_ratio',
  'height_nin_110',
  'Average_hearing',
  'Average_eyesight',
  'ast_alt',
  'Systolic_Diastolic_ratio',
  'Atherogenic_coefficient',
  'BMI_to_age',
  'Glucose_to_Cholesterol',
  'Triglycerides_to_HDL',
  'Systolic_to_age',
  'Diastolic_to_age',
  'Hemoglobin_to_age',
  'GTP_to_age',
  'GTP_to_AST_ALT',
  'GTP_to_Cholesterol',
  'GTP_index'],
 40)

In [29]:
from tqdm import tqdm

continuous_features = merged_data[infinity_features].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in tqdm(continuous_features):
    threshold, gini = best_split_threshold(merged_data[feature].values, merged_data['smoking'].values)
#     if gini < 0.32:
    best_thresholds[feature] = ( gini ) # threshold,

best_thresholds

100%|██████████| 40/40 [02:16<00:00,  3.42s/it]


{'age': 0.32041557759499767,
 'height(cm)': 0.3220007667212753,
 'weight(kg)': 0.32205507080218254,
 'waist(cm)': 0.3219400088264416,
 'eyesight(left)': 0.3222692060944869,
 'eyesight(right)': 0.3222898572229469,
 'systolic': 0.3221474813128374,
 'relaxation': 0.32214158188668934,
 'fasting blood sugar': 0.3217415349923938,
 'Cholesterol': 0.3222386111993474,
 'triglyceride': 0.3168128518892239,
 'HDL': 0.3212278706120323,
 'LDL': 0.3217304552634793,
 'hemoglobin': 0.32050366465301394,
 'Urine protein': 0.3221857998441848,
 'serum creatinine': 0.32167477884786533,
 'AST': 0.3221225244243006,
 'ALT': 0.32210136637790165,
 'Gtp': 0.3144329572906187,
 'BMI': 0.3215269436987101,
 'Chol_HDL_ratio': 0.32129215554826224,
 'ldl_hdl': 0.32157596526378096,
 'map': 0.32225199236270957,
 'Waist_Height_ratio': 0.32193085250471454,
 'height_nin_110': 0.3216415408356256,
 'Average_hearing': 0.3222019768227528,
 'Average_eyesight': 0.3221792168351839,
 'ast_alt': 0.3221808924008923,
 'Systolic_Diastol

In [31]:
gini_scores_df = pd.DataFrame(list(best_thresholds.items()), columns=['Feature', 'gini_scores'])
gini_scores_df.sort_values(by='gini_scores', ascending=True)

Unnamed: 0,Feature,gini_scores
37,GTP_to_AST_ALT,0.311794
38,GTP_to_Cholesterol,0.314105
18,Gtp,0.314433
36,GTP_to_age,0.315194
39,GTP_index,0.316666
10,triglyceride,0.316813
32,Triglycerides_to_HDL,0.316823
35,Hemoglobin_to_age,0.320363
0,age,0.320416
13,hemoglobin,0.320504


In [7]:
from sklearn.preprocessing import PolynomialFeatures


continuous_data = merged_data[infinity_features]
# Создание полиномиальных признаков (степень 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(continuous_data)
poly_features = poly.get_feature_names_out(infinity_features)

# Создание DataFrame с полиномиальными признаками
poly_df = pd.DataFrame(poly_data, columns=poly_features)

# Проверка размерности нового DataFrame
poly_df.shape

(13863, 860)

In [8]:
poly_df.iloc[:,39:45].columns.to_list()

['GTP_index',
 'age^2',
 'age height(cm)',
 'age weight(kg)',
 'age waist(cm)',
 'age eyesight(left)']

In [43]:
%%time
# Список непрерывных признаков
continuous_features = poly_df.iloc[:,39:155].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in continuous_features:
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini) # threshold,

best_thresholds

CPU times: user 12min 33s, sys: 97.3 ms, total: 12min 33s
Wall time: 12min 34s


{'GTP_index': (0.12347173646608675, 0.31666644704880775),
 'age triglyceride': (4972.5, 0.3180431522932294),
 'age Gtp': (2045.0, 0.3162168616783544),
 'age Triglycerides_to_HDL': (121.80631868131869, 0.31806825600288446),
 'age GTP_to_age': (43.5, 0.3144329572906187),
 'age GTP_to_AST_ALT': (30.510647544545847, 0.31478783704299473),
 'age GTP_to_Cholesterol': (6.457322006472492, 0.31631456924644386),
 'age GTP_index': (5.778336125069794, 0.31833017510821615),
 'height(cm) triglyceride': (18722.5, 0.3167616775915021),
 'height(cm) Gtp': (7857.5, 0.31402263206610814),
 'height(cm) Triglycerides_to_HDL': (477.433081674674, 0.3168168782659486),
 'height(cm) GTP_to_age': (198.36666666666667, 0.31512050793267354),
 'height(cm) GTP_to_AST_ALT': (149.3617886178862, 0.3117706416804088),
 'height(cm) GTP_to_Cholesterol': (33.55163903892718, 0.31410093688081353),
 'height(cm) GTP_index': (24.619670023569746, 0.31660430931175965),
 'weight(kg) triglyceride': (7925.0, 0.31769916272124116),
 'weigh

In [44]:
%%time
# Список непрерывных признаков
continuous_features = poly_df.iloc[:,155:255].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in continuous_features:
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini) # threshold,

best_thresholds

CPU times: user 16min 55s, sys: 163 ms, total: 16min 55s
Wall time: 16min 56s


{'weight(kg) GTP_to_Cholesterol': 0.31488066665982617,
 'weight(kg) GTP_index': 0.31704402207148086,
 'waist(cm) triglyceride': 0.31751385008775057,
 'waist(cm) Gtp': 0.3147129825176973,
 'waist(cm) Triglycerides_to_HDL': 0.31761488786201303,
 'waist(cm) GTP_to_age': 0.3156145115860029,
 'waist(cm) GTP_to_AST_ALT': 0.3126842111134856,
 'waist(cm) GTP_to_Cholesterol': 0.31453243124438535,
 'waist(cm) GTP_index': 0.3170714210379383,
 'eyesight(left) triglyceride': 0.31820624733460234,
 'eyesight(left) Gtp': 0.3153050165264075,
 'eyesight(left) Triglycerides_to_HDL': 0.31845875514843,
 'eyesight(left) GTP_to_age': 0.31698536358950946,
 'eyesight(left) GTP_to_AST_ALT': 0.3145092058414746,
 'eyesight(left) GTP_to_Cholesterol': 0.3157583682536538,
 'eyesight(left) GTP_index': 0.31752880586879567,
 'eyesight(right) triglyceride': 0.31853701890718594,
 'eyesight(right) Gtp': 0.3154887525044051}

In [45]:
%%time
# Список непрерывных признаков
continuous_features = poly_df.iloc[:,255:455].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in continuous_features:
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini) # threshold,

best_thresholds

CPU times: user 45min 21s, sys: 367 ms, total: 45min 21s
Wall time: 45min 24s


{'eyesight(right) Triglycerides_to_HDL': 0.3186672629260633,
 'eyesight(right) GTP_to_age': 0.316411386623151,
 'eyesight(right) GTP_to_AST_ALT': 0.31368708562889724,
 'eyesight(right) GTP_to_Cholesterol': 0.3151685927821788,
 'eyesight(right) GTP_index': 0.31736538489448474,
 'systolic triglyceride': 0.31762366099789974,
 'systolic Gtp': 0.31499710496764344,
 'systolic Triglycerides_to_HDL': 0.31746444815714425,
 'systolic GTP_to_age': 0.31570337989944613,
 'systolic GTP_to_AST_ALT': 0.31283330863658326,
 'systolic GTP_to_Cholesterol': 0.3146664137495251,
 'systolic GTP_index': 0.3172977494142715,
 'relaxation triglyceride': 0.31707188698467076,
 'relaxation Gtp': 0.3148768993661209,
 'relaxation Triglycerides_to_HDL': 0.31769251561117395,
 'relaxation GTP_to_age': 0.3155259316593805,
 'relaxation GTP_to_AST_ALT': 0.3127887636599891,
 'relaxation GTP_to_Cholesterol': 0.3139848789884586,
 'relaxation GTP_index': 0.3165956485951636,
 'fasting blood sugar triglyceride': 0.317117013351013

In [46]:
from tqdm import tqdm

continuous_features = poly_df.iloc[:,455:555].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in tqdm(continuous_features):
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini) # threshold,

best_thresholds

100%|██████████| 100/100 [19:48<00:00, 11.88s/it]


{'LDL Gtp': 0.31688200814215883,
 'LDL Triglycerides_to_HDL': 0.319551038781008,
 'LDL GTP_to_age': 0.31759085375540086,
 'LDL GTP_to_AST_ALT': 0.3151674934787339,
 'LDL GTP_to_Cholesterol': 0.3158852944187137,
 'LDL GTP_index': 0.31827385727367996,
 'hemoglobin Gtp': 0.3138555983096709,
 'hemoglobin Triglycerides_to_HDL': 0.31641918193591856,
 'hemoglobin GTP_to_age': 0.31451098607342975,
 'hemoglobin GTP_to_AST_ALT': 0.31153555068793104,
 'hemoglobin GTP_to_Cholesterol': 0.31336135458182235,
 'hemoglobin GTP_index': 0.31612517627852527,
 'Urine protein Gtp': 0.3150401690399306,
 'Urine protein Triglycerides_to_HDL': 0.3169182360499526,
 'Urine protein GTP_to_age': 0.3153675534842144,
 'Urine protein GTP_to_AST_ALT': 0.31218895811763003,
 'Urine protein GTP_to_Cholesterol': 0.31475136161374484,
 'Urine protein GTP_index': 0.316949298028408,
 'serum creatinine Gtp': 0.3160938350124456,
 'serum creatinine Triglycerides_to_HDL': 0.3183530903744698}

In [47]:
from tqdm import tqdm

continuous_features = poly_df.iloc[:,555:720].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in tqdm(continuous_features):
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini ) # threshold,

best_thresholds

100%|██████████| 165/165 [52:08<00:00, 18.96s/it]


{'serum creatinine GTP_to_age': 0.3164975943331758,
 'serum creatinine GTP_to_AST_ALT': 0.3133776881853809,
 'serum creatinine GTP_to_Cholesterol': 0.3158515687936262,
 'serum creatinine GTP_index': 0.31818995408313244,
 'AST Gtp': 0.317209477850096,
 'AST Triglycerides_to_HDL': 0.3186129475763978,
 'AST GTP_to_age': 0.3176692840912583,
 'AST GTP_to_AST_ALT': 0.3132282789956866,
 'AST GTP_to_Cholesterol': 0.31745824046437043,
 'AST GTP_index': 0.31920842230346086,
 'ALT Gtp': 0.3182705077390445,
 'ALT Triglycerides_to_HDL': 0.3193128188566947,
 'ALT GTP_to_age': 0.3183564764349793,
 'ALT GTP_to_AST_ALT': 0.3151805409464455,
 'ALT GTP_to_Cholesterol': 0.31832674016003815,
 'ALT GTP_index': 0.3196479687745079,
 'Gtp^2': 0.3144329572906187,
 'Gtp BMI': 0.3150838424809376,
 'Gtp Chol_HDL_ratio': 0.31466171271498244,
 'Gtp ldl_hdl': 0.3161177095017238,
 'Gtp map': 0.3149422099371003,
 'Gtp Waist_Height_ratio': 0.3146300799149578,
 'Gtp height_nin_110': 0.3135883009142427,
 'Gtp Average_hear

In [48]:
from tqdm import tqdm

continuous_features = poly_df.iloc[:,720:860].columns.to_list()

# Нахождение наилучших порогов разделения для каждого признака
best_thresholds = {}
for feature in tqdm(continuous_features):
    threshold, gini = best_split_threshold(poly_df[feature].values, merged_data['smoking'].values)
    if gini < 0.32:
        best_thresholds[feature] = ( gini ) # threshold,

best_thresholds

100%|██████████| 140/140 [48:33<00:00, 20.81s/it]


{'Waist_Height_ratio GTP_to_age': 0.315566406339988,
 'Waist_Height_ratio GTP_to_AST_ALT': 0.3127573683077994,
 'Waist_Height_ratio GTP_to_Cholesterol': 0.3146086948554256,
 'Waist_Height_ratio GTP_index': 0.31726523666827844,
 'height_nin_110 Triglycerides_to_HDL': 0.31668482819652427,
 'height_nin_110 GTP_to_age': 0.31483702308334155,
 'height_nin_110 GTP_to_AST_ALT': 0.3110301551167763,
 'height_nin_110 GTP_to_Cholesterol': 0.3134613458922022,
 'height_nin_110 GTP_index': 0.3160262887855808,
 'Average_hearing Triglycerides_to_HDL': 0.3170656559481916,
 'Average_hearing GTP_to_age': 0.31548972285890775,
 'Average_hearing GTP_to_AST_ALT': 0.3129386214771055,
 'Average_hearing GTP_to_Cholesterol': 0.3147234340149493,
 'Average_hearing GTP_index': 0.3173452254871051,
 'Average_eyesight Triglycerides_to_HDL': 0.3184284433214396,
 'Average_eyesight GTP_to_age': 0.31672978082775743,
 'Average_eyesight GTP_to_AST_ALT': 0.31380058072156825,
 'Average_eyesight GTP_to_Cholesterol': 0.315227613

In [9]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

data = {
  'LDL Gtp': 0.31688200814215883,
 'LDL Triglycerides_to_HDL': 0.319551038781008,
 'LDL GTP_to_age': 0.31759085375540086,
 'LDL GTP_to_AST_ALT': 0.3151674934787339,
 'LDL GTP_to_Cholesterol': 0.3158852944187137,
 'LDL GTP_index': 0.31827385727367996,
 'hemoglobin Gtp': 0.3138555983096709,
 'hemoglobin Triglycerides_to_HDL': 0.31641918193591856,
 'hemoglobin GTP_to_age': 0.31451098607342975,
 'hemoglobin GTP_to_AST_ALT': 0.31153555068793104,
 'hemoglobin GTP_to_Cholesterol': 0.31336135458182235,
 'hemoglobin GTP_index': 0.31612517627852527,
 'Urine protein Gtp': 0.3150401690399306,
 'Urine protein Triglycerides_to_HDL': 0.3169182360499526,
 'Urine protein GTP_to_age': 0.3153675534842144,
 'Urine protein GTP_to_AST_ALT': 0.31218895811763003,
 'Urine protein GTP_to_Cholesterol': 0.31475136161374484,
 'Urine protein GTP_index': 0.316949298028408,
 'serum creatinine Gtp': 0.3160938350124456,
 'serum creatinine Triglycerides_to_HDL': 0.3183530903744698,
  'eyesight(right) Triglycerides_to_HDL': 0.3186672629260633,
 'eyesight(right) GTP_to_age': 0.316411386623151,
 'eyesight(right) GTP_to_AST_ALT': 0.31368708562889724,
 'eyesight(right) GTP_to_Cholesterol': 0.3151685927821788,
 'eyesight(right) GTP_index': 0.31736538489448474,
 'systolic triglyceride': 0.31762366099789974,
 'systolic Gtp': 0.31499710496764344,
 'systolic Triglycerides_to_HDL': 0.31746444815714425,
 'systolic GTP_to_age': 0.31570337989944613,
 'systolic GTP_to_AST_ALT': 0.31283330863658326,
 'systolic GTP_to_Cholesterol': 0.3146664137495251,
 'systolic GTP_index': 0.3172977494142715,
 'relaxation triglyceride': 0.31707188698467076,
 'relaxation Gtp': 0.3148768993661209,
 'relaxation Triglycerides_to_HDL': 0.31769251561117395,
 'relaxation GTP_to_age': 0.3155259316593805,
 'relaxation GTP_to_AST_ALT': 0.3127887636599891,
 'relaxation GTP_to_Cholesterol': 0.3139848789884586,
 'relaxation GTP_index': 0.3165956485951636,
 'fasting blood sugar triglyceride': 0.3171170133510133,
 'fasting blood sugar Gtp': 0.3145179492834554,
 'fasting blood sugar Triglycerides_to_HDL': 0.3171261168117925,
 'fasting blood sugar GTP_to_age': 0.3147969035677171,
 'fasting blood sugar GTP_to_AST_ALT': 0.31221257687488513,
 'fasting blood sugar GTP_to_Cholesterol': 0.31448715088815515,
 'fasting blood sugar GTP_index': 0.3167059936694782,
 'Cholesterol triglyceride': 0.31810632677985295,
 'Cholesterol Gtp': 0.31482378910212233,
 'Cholesterol Triglycerides_to_HDL': 0.3181262897654849,
 'Cholesterol GTP_to_age': 0.3156824165340627,
 'Cholesterol GTP_to_AST_ALT': 0.31262368102559834,
 'Cholesterol GTP_to_Cholesterol': 0.3143875264608234,
 'Cholesterol GTP_index': 0.31658545812462324,
 'triglyceride^2': 0.3168128518892239,
 'triglyceride HDL': 0.31747245456777706,
 'triglyceride LDL': 0.31945573465799393,
 'triglyceride hemoglobin': 0.316466155055537,
 'triglyceride Urine protein': 0.31705745361167426,
 'triglyceride serum creatinine': 0.3182858426932048,
 'triglyceride AST': 0.31843616677136005,
 'triglyceride ALT': 0.3194145005598536,
 'triglyceride Gtp': 0.31329946081602966,
 'triglyceride BMI': 0.3177298484889949,
 'triglyceride Chol_HDL_ratio': 0.3181262897654849,
 'triglyceride ldl_hdl': 0.319551038781008,
 'triglyceride map': 0.3172153897896446,
 'triglyceride Waist_Height_ratio': 0.3175916603897661,
 'triglyceride height_nin_110': 0.31654870240352273,
 'triglyceride Average_hearing': 0.31722469256122054,
 'triglyceride Average_eyesight': 0.31837341061521846,
 'triglyceride ast_alt': 0.31729678612755025,
 'triglyceride Systolic_Diastolic_ratio': 0.3171106086195784,
 'triglyceride Atherogenic_coefficient': 0.3185426178848457,
 'triglyceride BMI_to_age': 0.31874880639461745,
 'triglyceride Glucose_to_Cholesterol': 0.31679371673457246,
 'triglyceride Triglycerides_to_HDL': 0.31721106765578383,
 'triglyceride Systolic_to_age': 0.31875429946285194,
 'triglyceride Diastolic_to_age': 0.3186760763404969,
 'triglyceride Hemoglobin_to_age': 0.3184051539929281,
 'triglyceride GTP_to_age': 0.3136313960524044,
 'triglyceride GTP_to_AST_ALT': 0.3112409942704689,
 'triglyceride GTP_to_Cholesterol': 0.31263964024367175,
 'triglyceride GTP_index': 0.3123074294807811,
 'HDL Gtp': 0.31527559441926156,
 'HDL Triglycerides_to_HDL': 0.3167748831635405,
 'HDL GTP_to_age': 0.3167874369960617,
 'HDL GTP_to_AST_ALT': 0.31466466018293476,
 'HDL GTP_to_Cholesterol': 0.3156506636045204,
 'HDL GTP_index': 0.318211023344735,
  'weight(kg) GTP_to_Cholesterol': 0.31488066665982617,
 'weight(kg) GTP_index': 0.31704402207148086,
 'waist(cm) triglyceride': 0.31751385008775057,
 'waist(cm) Gtp': 0.3147129825176973,
 'waist(cm) Triglycerides_to_HDL': 0.31761488786201303,
 'waist(cm) GTP_to_age': 0.3156145115860029,
 'waist(cm) GTP_to_AST_ALT': 0.3126842111134856,
 'waist(cm) GTP_to_Cholesterol': 0.31453243124438535,
 'waist(cm) GTP_index': 0.3170714210379383,
 'eyesight(left) triglyceride': 0.31820624733460234,
 'eyesight(left) Gtp': 0.3153050165264075,
 'eyesight(left) Triglycerides_to_HDL': 0.31845875514843,
 'eyesight(left) GTP_to_age': 0.31698536358950946,
 'eyesight(left) GTP_to_AST_ALT': 0.3145092058414746,
 'eyesight(left) GTP_to_Cholesterol': 0.3157583682536538,
 'eyesight(left) GTP_index': 0.31752880586879567,
 'eyesight(right) triglyceride': 0.31853701890718594,
 'eyesight(right) Gtp': 0.3154887525044051,
  'GTP_index': 0.31666644704880775,
 'age triglyceride': 0.3180431522932294,
 'age Gtp': 0.3162168616783544,
 'age Triglycerides_to_HDL': 0.31806825600288446,
 'age GTP_to_age': 0.3144329572906187,
 'age GTP_to_AST_ALT': 0.31478783704299473,
 'age GTP_to_Cholesterol': 0.31631456924644386,
 'age GTP_index': 0.31833017510821615,
 'height(cm) triglyceride': 0.3167616775915021,
 'height(cm) Gtp': 0.31402263206610814,
 'height(cm) Triglycerides_to_HDL': 0.3168168782659486,
 'height(cm) GTP_to_age': 0.31512050793267354,
 'height(cm) GTP_to_AST_ALT': 0.3117706416804088,
 'height(cm) GTP_to_Cholesterol': 0.31410093688081353,
 'height(cm) GTP_index': 0.31660430931175965,
 'weight(kg) triglyceride': 0.31769916272124116,
 'weight(kg) Gtp': 0.3151188798277309,
 'weight(kg) Triglycerides_to_HDL': 0.3178395338462438,
 'weight(kg) GTP_to_age':  0.3159769033863127,
 'weight(kg) GTP_to_AST_ALT':0.31292445245376604,
  'serum creatinine GTP_to_age': 0.3164975943331758,
 'serum creatinine GTP_to_AST_ALT': 0.3133776881853809,
 'serum creatinine GTP_to_Cholesterol': 0.3158515687936262,
 'serum creatinine GTP_index': 0.31818995408313244,
 'AST Gtp': 0.317209477850096,
 'AST Triglycerides_to_HDL': 0.3186129475763978,
 'AST GTP_to_age': 0.3176692840912583,
 'AST GTP_to_AST_ALT': 0.3132282789956866,
 'AST GTP_to_Cholesterol': 0.31745824046437043,
 'AST GTP_index': 0.31920842230346086,
 'ALT Gtp': 0.3182705077390445,
 'ALT Triglycerides_to_HDL': 0.3193128188566947,
 'ALT GTP_to_age': 0.3183564764349793,
 'ALT GTP_to_AST_ALT': 0.3151805409464455,
 'ALT GTP_to_Cholesterol': 0.31832674016003815,
 'ALT GTP_index': 0.3196479687745079,
 'Gtp^2': 0.3144329572906187,
 'Gtp BMI': 0.3150838424809376,
 'Gtp Chol_HDL_ratio': 0.31466171271498244,
 'Gtp ldl_hdl': 0.3161177095017238,
 'Gtp map': 0.3149422099371003,
 'Gtp Waist_Height_ratio': 0.3146300799149578,
 'Gtp height_nin_110': 0.3135883009142427,
 'Gtp Average_hearing': 0.31499106080623607,
 'Gtp Average_eyesight': 0.3151031446574053,
 'Gtp ast_alt': 0.313228517770408,
 'Gtp Systolic_Diastolic_ratio': 0.3145637921974801,
 'Gtp Atherogenic_coefficient': 0.31521321910435335,
 'Gtp BMI_to_age': 0.316282429642031,
 'Gtp Glucose_to_Cholesterol': 0.31448715088815515,
 'Gtp Triglycerides_to_HDL': 0.31335946145520266,
 'Gtp Systolic_to_age': 0.31570337989944613,
 'Gtp Diastolic_to_age': 0.3155259316593805,
 'Gtp Hemoglobin_to_age': 0.31451098607342975,
 'Gtp GTP_to_age': 0.31418762190830524,
 'Gtp GTP_to_AST_ALT': 0.31200249319420764,
 'Gtp GTP_to_Cholesterol': 0.314135309032389,
 'Gtp GTP_index': 0.315226361351893,
 'BMI Triglycerides_to_HDL': 0.31772164816701176,
 'BMI GTP_to_age': 0.316282429642031,
 'BMI GTP_to_AST_ALT': 0.3134450308962261,
 'BMI GTP_to_Cholesterol': 0.3150760095362547,
 'BMI GTP_index': 0.3174453893763509,
 'Chol_HDL_ratio Triglycerides_to_HDL': 0.3183805645942525,
 'Chol_HDL_ratio GTP_to_age': 0.3154431998750036,
 'Chol_HDL_ratio GTP_to_AST_ALT': 0.3129736034933265,
 'Chol_HDL_ratio GTP_to_Cholesterol': 0.3139626611800924,
 'Chol_HDL_ratio GTP_index': 0.31576231416062656,
 'ldl_hdl Triglycerides_to_HDL': 0.3198000329157859,
 'ldl_hdl GTP_to_age': 0.31707240063434233,
 'ldl_hdl GTP_to_AST_ALT': 0.31516211571839015,
 'ldl_hdl GTP_to_Cholesterol': 0.3150683370255076,
 'ldl_hdl GTP_index': 0.3175290014813954,
 'map Triglycerides_to_HDL': 0.3175177117846928,
 'map GTP_to_age': 0.31548972229490213,
 'map GTP_to_AST_ALT': 0.31288095394758686,
 'map GTP_to_Cholesterol': 0.31397906450839663,
 'map GTP_index': 0.3170276473275847,
 'Waist_Height_ratio Triglycerides_to_HDL': 0.3176115622021932,
  'Waist_Height_ratio GTP_to_age': 0.315566406339988,
 'Waist_Height_ratio GTP_to_AST_ALT': 0.3127573683077994,
 'Waist_Height_ratio GTP_to_Cholesterol': 0.3146086948554256,
 'Waist_Height_ratio GTP_index': 0.31726523666827844,
 'height_nin_110 Triglycerides_to_HDL': 0.31668482819652427,
 'height_nin_110 GTP_to_age': 0.31483702308334155,
 'height_nin_110 GTP_to_AST_ALT': 0.3110301551167763,
 'height_nin_110 GTP_to_Cholesterol': 0.3134613458922022,
 'height_nin_110 GTP_index': 0.3160262887855808,
 'Average_hearing Triglycerides_to_HDL': 0.3170656559481916,
 'Average_hearing GTP_to_age': 0.31548972285890775,
 'Average_hearing GTP_to_AST_ALT': 0.3129386214771055,
 'Average_hearing GTP_to_Cholesterol': 0.3147234340149493,
 'Average_hearing GTP_index': 0.3173452254871051,
 'Average_eyesight Triglycerides_to_HDL': 0.3184284433214396,
 'Average_eyesight GTP_to_age': 0.31672978082775743,
 'Average_eyesight GTP_to_AST_ALT': 0.31380058072156825,
 'Average_eyesight GTP_to_Cholesterol': 0.31522761333454075,
 'Average_eyesight GTP_index': 0.31753405205162577,
 'ast_alt Triglycerides_to_HDL': 0.31768145055761854,
 'ast_alt GTP_to_age': 0.3148235401292051,
 'ast_alt GTP_to_AST_ALT': 0.31525758460618825,
 'ast_alt GTP_to_Cholesterol': 0.31398528780360235,
 'ast_alt GTP_index': 0.31746144772179963,
 'Systolic_Diastolic_ratio Triglycerides_to_HDL': 0.31726636251250073,
 'Systolic_Diastolic_ratio GTP_to_age': 0.31540382091444485,
 'Systolic_Diastolic_ratio GTP_to_AST_ALT': 0.31221215941626834,
 'Systolic_Diastolic_ratio GTP_to_Cholesterol': 0.3146795994382394,
 'Systolic_Diastolic_ratio GTP_index': 0.31700708067749817,
 'Atherogenic_coefficient Triglycerides_to_HDL': 0.3189441232196417,
 'Atherogenic_coefficient GTP_to_age': 0.31598690879806934,
 'Atherogenic_coefficient GTP_to_AST_ALT': 0.31412226102070384,
 'Atherogenic_coefficient GTP_to_Cholesterol': 0.31431983655000767,
 'Atherogenic_coefficient GTP_index': 0.31586834146018483,
 'BMI_to_age Triglycerides_to_HDL': 0.31884990024298115,
 'BMI_to_age GTP_to_age': 0.31886070738780814,
 'BMI_to_age GTP_to_AST_ALT': 0.3157292174620733,
 'BMI_to_age GTP_to_Cholesterol': 0.31607371776009163,
 'BMI_to_age GTP_index': 0.31831429628226143,
 'Glucose_to_Cholesterol Triglycerides_to_HDL': 0.3173690460639591,
 'Glucose_to_Cholesterol GTP_to_age': 0.31498697506369633,
 'Glucose_to_Cholesterol GTP_to_AST_ALT': 0.312928216672828,
 'Glucose_to_Cholesterol GTP_to_Cholesterol': 0.3153993865376134,
 'Glucose_to_Cholesterol GTP_index': 0.3174277744947182,
 'Triglycerides_to_HDL^2': 0.31682322278204433,
 'Triglycerides_to_HDL Systolic_to_age': 0.31854647967861893,
 'Triglycerides_to_HDL Diastolic_to_age': 0.3183769002597975,
 'Triglycerides_to_HDL Hemoglobin_to_age': 0.3183407127976317,
 'Triglycerides_to_HDL GTP_to_age': 0.313576919731131,
 'Triglycerides_to_HDL GTP_to_AST_ALT': 0.31200461711625666,
 'Triglycerides_to_HDL GTP_to_Cholesterol': 0.3131113381612183,
 'Triglycerides_to_HDL GTP_index': 0.3131039334340688,
 'Systolic_to_age GTP_to_age': 0.31822711067085563,
 'Systolic_to_age GTP_to_AST_ALT': 0.3156565468789886,
 'Systolic_to_age GTP_to_Cholesterol': 0.31577102383001165,
 'Systolic_to_age GTP_index': 0.3180976076310316,
 'Diastolic_to_age GTP_to_age': 0.31820244421500016,
 'Diastolic_to_age GTP_to_AST_ALT': 0.31548361933781366,
 'Diastolic_to_age GTP_to_Cholesterol': 0.31566883115587563,
 'Diastolic_to_age GTP_index': 0.3179080570430709,
 'Hemoglobin_to_age GTP_to_age': 0.31810920866921183,
 'Hemoglobin_to_age GTP_to_AST_ALT': 0.3149889442506867,
 'Hemoglobin_to_age GTP_to_Cholesterol': 0.3148249570860322,
 'Hemoglobin_to_age GTP_index': 0.31775458717536675,
 'GTP_to_age^2': 0.31519434704339155,
 'GTP_to_age GTP_to_AST_ALT': 0.3125119679889727,
 'GTP_to_age GTP_to_Cholesterol': 0.31422245291617196,
 'GTP_to_age GTP_index': 0.3152876115933816,
 'GTP_to_AST_ALT^2': 0.3117936232478386,
 'GTP_to_AST_ALT GTP_to_Cholesterol': 0.312130607786379,
 'GTP_to_AST_ALT GTP_index': 0.31365794216993326,
 'GTP_to_Cholesterol^2': 0.3141051393618154,
 'GTP_to_Cholesterol GTP_index': 0.31524827773354197,
 'GTP_index^2': 0.31666644704880775
}

# Преобразование словаря в DataFrame
df = pd.DataFrame(list(data.items()), columns=['Feature', 'Gini_Index'])

In [33]:
df.sort_values(by='Gini_Index', ascending=True).iloc[:30, :]#['Feature'].to_list()

Unnamed: 0,Feature,Gini_Index
192,height_nin_110 GTP_to_AST_ALT,0.31103
80,triglyceride GTP_to_AST_ALT,0.311241
9,hemoglobin GTP_to_AST_ALT,0.311536
119,height(cm) GTP_to_AST_ALT,0.311771
254,GTP_to_AST_ALT^2,0.311794
162,Gtp GTP_to_AST_ALT,0.312002
235,Triglycerides_to_HDL GTP_to_AST_ALT,0.312005
255,GTP_to_AST_ALT GTP_to_Cholesterol,0.312131
15,Urine protein GTP_to_AST_ALT,0.312189
212,Systolic_Diastolic_ratio GTP_to_AST_ALT,0.312212


In [35]:
df.to_csv('evaluate_features_Gini_poly_df.csv', index=False)#['Feature'].to_list()

In [11]:
for feat in df['Feature'].to_list():
    merged_data[feat] = poly_df[feat]

  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_df[feat]
  merged_data[feat] = poly_d

In [64]:
phik_overview = merged_data.phik_matrix()
# phik_overview['smoking'].sort_values(ascending=False) 
sorted_smoking_values = phik_overview['smoking'].sort_values(ascending=False)

interval columns not set, guessing: ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST', 'ALT', 'Gtp', 'dental caries', 'tartar', 'smoking', 'BMI', 'Chol_HDL_ratio', 'ldl_hdl', 'map', 'Waist_Height_ratio', 'height_nin_110', 'Average_hearing', 'Average_eyesight', 'ast_alt', 'Systolic_Diastolic_ratio', 'Atherogenic_coefficient', 'BMI_to_age', 'Glucose_to_Cholesterol', 'Triglycerides_to_HDL', 'Systolic_to_age', 'Diastolic_to_age', 'Hemoglobin_to_age', 'GTP_to_age', 'GTP_to_AST_ALT', 'GTP_to_Cholesterol', 'GTP_index', 'LDL Gtp', 'LDL Triglycerides_to_HDL', 'LDL GTP_to_age', 'LDL GTP_to_AST_ALT', 'LDL GTP_to_Cholesterol', 'LDL GTP_index', 'hemoglobin Gtp', 'hemoglobin Triglycerides_to_HDL', 'hemoglobin GTP_to_age', 'hemoglobin GTP_to_AST_ALT', 'hemoglobin GTP_to_Cholesterol'

In [12]:
best_infinity_features = [ 'GTP_index', 'GTP_to_Cholesterol', 'GTP_to_AST_ALT', 'GTP_to_age', 'Triglycerides_to_HDL', 'Gtp', 'triglyceride', 'tartar' ]

In [13]:
# Фильтрация признаков, где связь больше 0.8
selected_features = df.sort_values(by='Gini_Index', ascending=True).iloc[:30, :]['Feature'].to_list() + best_infinity_features
len(selected_features)

38

In [14]:
merged_data.shape

(13863, 304)

In [130]:
from sklearn.model_selection import train_test_split

X = merged_data[selected_features]#.drop(['smoking'], axis=1).copy() # [] #   [selected_features.index.to_list()]
y = merged_data['smoking'].copy()

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape

((11090, 38), (2773, 38))

In [131]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8317, 38), (2773, 38), (8317,), (2773,))

In [136]:
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score


X_train_temp = X_train.copy() # .drop(columns=['hearing(left)'])
X_test_temp = X_test.copy()
X_val_temp = X_val.copy()

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 4000),
        'boosting_type': trial.suggest_categorical('boosting_type', [ 'Plain' ]), # 'Ordered',
        'depth': trial.suggest_int('depth',  6, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.01 ), #  0.05, 0.1
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 4, 25),
        'border_count': trial.suggest_int('border_count', 20, 300),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 9),
        'od_type': 'Iter', 
        'od_wait': 100, 
        'eval_metric': 'AUC',
        'logging_level': 'Silent',
        'random_seed': 42,
        'auto_class_weights': 'Balanced',
#         'task_type': 'GPU'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=params['od_wait'], cat_features=[])

    y_pred_proba = model.predict_proba(X_test_temp)[:, 1]
        
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_prec = average_precision_score(y_test, y_pred_proba)
    
    return roc_auc, avg_prec

In [137]:
%%time 
study = optuna.create_study(pruner=MedianPruner(), directions=['maximize', 'maximize'] ) #pruner=MedianPruner(), sampler=optuna.samplers.TPESampler(seed=42)
study.optimize(objective,
               n_jobs=-1,
               n_trials=10, # 25 показывает результат лучше
               show_progress_bar=True
              )

best_trial = study.best_trials[0]
roc_auc_best = best_trial.values[0]
avg_prec_best = best_trial.values[1]

print(f"Лучшее значение ROC AUC: {roc_auc_best:.4f}")
print(f"Лучшее значение Average Precision: {avg_prec_best:.4f}")

[I 2023-10-27 17:59:00,526] A new study created in memory with name: no-name-8543ae67-a346-42dc-ae1d-344a5c3aaf82


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2023-10-27 17:59:42,067] Trial 2 finished with values: [0.6976405654896392, 0.3720604283257462] and parameters: {'iterations': 2882, 'boosting_type': 'Plain', 'depth': 8, 'learning_rate': 0.009755254821632057, 'l2_leaf_reg': 7.20425044805961, 'border_count': 197, 'bagging_temperature': 3.779075898036869}. 
[I 2023-10-27 17:59:52,580] Trial 3 finished with values: [0.6938528823187657, 0.37034613994914195] and parameters: {'iterations': 1778, 'boosting_type': 'Plain', 'depth': 6, 'learning_rate': 0.005016284300067847, 'l2_leaf_reg': 13.941279669958549, 'border_count': 236, 'bagging_temperature': 1.967467408844997}. 
[I 2023-10-27 17:59:53,611] Trial 1 finished with values: [0.7031316570912142, 0.3859308460339045] and parameters: {'iterations': 3804, 'boosting_type': 'Plain', 'depth': 6, 'learning_rate': 0.00737603133434573, 'l2_leaf_reg': 19.40538436339702, 'border_count': 138, 'bagging_temperature': 7.858601108713023}. 
[I 2023-10-27 17:59:55,509] Trial 4 finished with values: [0.674

In [138]:
%%time
# Возьмите параметры из лучшего trial на основе первой метрики (ROC AUC в данном случае)
best_params = study.best_trials[0].params

model = CatBoostClassifier(**best_params, auto_class_weights='Balanced', logging_level='Silent') #  task_type='GPU' auto_class_weights='Balanced' , 

model.fit(X_train_temp, y_train, eval_set=[(X_val_temp, y_val)], early_stopping_rounds=100, verbose=100, cat_features=[]) # cat_features

y_pred_proba = model.predict_proba(X_test_temp)[:, 1]

thresholds = np.linspace(0.01, 1, 300)
f1_scores = [f1_score(y_test, y_pred_proba > thresh) for thresh in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

# print(f"Оптимальный порог: {optimal_threshold:.5f}")
print(f"Наивысший F1: {max(f1_scores):.5f}")

Наивысший F1: 0.44544
CPU times: user 4min 31s, sys: 3.91 s, total: 4min 35s
Wall time: 1min 13s


In [139]:
# Получение значимости признаков
feature_importances = model.get_feature_importance()

# Создание датафрейма для лучшей визуализации
features_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Сортировка признаков по их значимости  
features_df = features_df.sort_values(by='Importance', ascending=False)

print(features_df)

                                    Feature  Importance
37                                   tartar   10.441955
27                              Gtp ast_alt    4.916470
33                               GTP_to_age    4.696695
0             height_nin_110 GTP_to_AST_ALT    4.522112
23            Chol_HDL_ratio GTP_to_AST_ALT    4.011363
34                     Triglycerides_to_HDL    3.913113
36                             triglyceride    3.744369
21    Glucose_to_Cholesterol GTP_to_AST_ALT    3.603555
12                GTP_to_age GTP_to_AST_ALT    3.139064
30                                GTP_index    2.933708
31                       GTP_to_Cholesterol    2.923292
26                       AST GTP_to_AST_ALT    2.750111
13               Cholesterol GTP_to_AST_ALT    2.630876
1               triglyceride GTP_to_AST_ALT    2.500676
20                weight(kg) GTP_to_AST_ALT    2.483780
10       fasting blood sugar GTP_to_AST_ALT    2.474724
25  Triglycerides_to_HDL GTP_to_Cholesterol    2