# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the train and test splits from their Parquet files.
train, test = (
    pd.read_parquet(path)
    for path in ('data/train.parquet', 'data/test.parquet')
)

# Exploratory summary: first rows, descriptive statistics, and the number
# of rows per value of the `label` column.
print(
    train.head(),
    train.describe(),
    train['label'].value_counts(),
    sep='\n',
)

# Count plot of the `label` column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train, x='label', ax=ax)
ax.set_title('Class Distribution')
plt.show()


from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import joblib

# Load the precomputed feature matrix; the target is the `label` column of
# the `train` frame loaded earlier in the script.
# NOTE(review): assumes the rows of X_train.csv are in the same order as the
# rows of train.parquet -- confirm against whatever produced the CSV.
X_train = pd.read_csv('data/X_train.csv')
y_train = train['label']

# Hold out 20% of the rows for validation. The split is stratified on the
# label so both folds keep the same class proportions; without it a rare
# class can be missing from the validation fold, which makes ROC AUC
# undefined (roc_auc_score raises when only one class is present).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Fit a random forest with a fixed seed so runs are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Validate: score with the predicted probability of the second class
# (column 1 of predict_proba).
y_val_pred = model.predict_proba(X_val_split)[:, 1]
roc_auc = roc_auc_score(y_val_split, y_val_pred)
print(f'ROC AUC: {roc_auc}')

# Persist the fitted model. Create the target directory first so a missing
# `model/` folder does not throw away the training run at the very last step.
model_path = Path('model/rf_model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)