In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fastparquet import ParquetFile
import seaborn as sns
from utils import feature_drop
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import  roc_auc_score
import lightgbm as lgb
from sklearn.impute import SimpleImputer


In [3]:
# Load the parquet file at `file_path` with fastparquet and materialise it as a
# pandas DataFrame named `df`, which the following cells work on.
# The path is relative, so the notebook has to be run from the directory that
# contains the `Data/` folder.
file_path = "Data/train_ai_comp_final_dp.parquet"
pf = ParquetFile(file_path)
df = pf.to_pandas()
# Last expression of the cell: display the first rows as a quick sanity check.
df.head()

Unnamed: 0,id,target,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,0,0,1,1761,1759,85,105469,191,46,0,...,28913,48985,84264,12045,12107,12510,21126,28913,48985,84264
1,1,0,1,1761,1759,120,105610,144,71,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,890,1759,141,105227,191,11,0,...,55949,70006,113317,33735,31242,12691,48682,55949,70006,113317
5,5,0,1,1599,966,30,102441,191,8,0,...,0,6872,8530,0,0,0,0,0,6872,8530
6,6,0,1,1761,1759,85,104006,191,2,0,...,0,0,7642,0,0,0,0,0,0,7642


### Удаление константных признаков

In [4]:
# `feature_drop` is a project helper imported from `utils`; its implementation is
# not visible here. Judging by the section heading it presumably removes constant
# (zero-variance) columns — TODO confirm against utils.py.
# NOTE(review): `df` is rebound to the helper's result, so the frame displayed by
# the loading cell is no longer the one used from here on.
df = feature_drop(df)

### t-SNE

In [5]:
# Project the feature matrix to 2-D with t-SNE and colour each point by its
# target to see whether the classes separate visually.
features = df.drop(columns=['target', "id"])
targets = df['target']

# Step 1: fill missing values with the per-column median (TSNE rejects NaN input).
imputer = SimpleImputer(strategy='median')
features_imputed = imputer.fit_transform(features)

# Step 2: rescale every column to [0, 1]. t-SNE is distance based and the raw
# columns live on very different scales, so without this the largest-valued
# features would dominate the embedding.
# Fix: the scaler used to be fit on the raw `features` and its output was never
# used, so t-SNE actually ran on imputed-but-unscaled data.
scaler = MinMaxScaler()
scaled_df_imputed = scaler.fit_transform(features_imputed)
# Both names that the original cell defined are kept bound; they now refer to the
# same imputed-and-scaled matrix.
scaled_df = scaled_df_imputed

# Run t-SNE on the imputed and scaled features (fixed seed for reproducibility).
# NOTE(review): t-SNE on the full frame can be very slow; consider embedding a
# random subsample if this cell becomes a bottleneck.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(scaled_df_imputed)

# Scatter plot of the embedding, coloured by target value.
plt.figure(figsize=(10,6))
scatter = plt.scatter(tsne_results[:,0], tsne_results[:,1], c=targets, cmap='viridis', alpha=0.5)

# A colour bar stands in for a legend: for binary or numeric targets it is more
# informative than a categorical legend would be.
plt.colorbar(scatter, label='Target Value')

plt.title('t-SNE визуализация с цветами таргетов')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.show()

### Отбор фичей Вовы и деление на train, val, test

In [8]:
# Columns that must not be fed to the model: the label itself and the row id.
NON_FEATURE_COLUMNS = ['target', "id"]
# One seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 42

X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df['target']

# First cut: 70 % of the rows go to train, the remaining 30 % are held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# Second cut: the held-out part is halved into validation and test
# (15 % of all rows each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [9]:
# Train a LightGBM binary classifier with the native `lgb.train` API (early
# stopping on the validation split) and report ROC-AUC on train and test.
# `lgb` comes from the import cell at the top of the notebook; the duplicate
# in-cell import was removed.

# Upper bound on boosting rounds; early stopping normally ends training sooner.
# The original cell passed `num_boost_round=1000` but also had
# `'n_estimators': 10000` in `params`. That key is an alias of `num_iterations`
# and takes precedence over the argument, so 10000 was the value really in
# effect. It is now stated once, explicitly.
NUM_BOOST_ROUND = 10000

# Only keys understood by the native API are kept. `class_weight`,
# `importance_type` and `silent` belong to the scikit-learn wrapper
# (LGBMClassifier) and are not used by `lgb.train`, so they were dropped.
params = {
    'boosting_type': 'gbdt',
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0,
    'num_leaves': 31,
    'objective': "binary",
    'random_state': 9,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'subsample': 1.0,
    'subsample_for_bin': 200000,
    'subsample_freq': 0,
    'metric': 'auc',
    'gpu_use_dp': True,
    # Stop when the validation AUC has not improved for 300 consecutive rounds.
    "early_stopping_rounds": 300
}

train_data = lgb.Dataset(X_train, label=y_train)
# Evaluation sets are tied to the training set so they share its feature binning
# (consistent with how `test_data` is built).
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
# Not used in this cell: the test split is scored through `predict` below.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# The training set is listed in `valid_sets` only so that its AUC is logged.
# NOTE(review): the booster is stored under the name `lightgbm`, which reads like
# the module name (the module itself is bound to `lgb`). The name is kept in case
# other cells refer to it.
lightgbm = lgb.train(
    params,
    train_data,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[train_data, val_data],
)

# `predict` returns positive-class probabilities for the binary objective; when
# early stopping has fired it uses the best iteration by default.
y_train_pred = lightgbm.predict(X_train)
y_test_pred = lightgbm.predict(X_test)

roc_auc_train = roc_auc_score(y_train, y_train_pred)
roc_auc_test = roc_auc_score(y_test, y_test_pred)
print(f"roc-auc train {roc_auc_train}")
print(f"roc-auc test {roc_auc_test}")



[LightGBM] [Info] Number of positive: 13005, number of negative: 350725
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.269866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19785
[LightGBM] [Info] Number of data points in the train set: 363730, number of used features: 114
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035755 -> initscore=-3.294669
[LightGBM] [Info] Start training from score -3.294669
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[144]	training's auc: 0.846483	valid_1's auc: 0.74335
roc-auc train 0.8464833286374527
roc-auc test 0.7384609681902414
