In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import phik
import seaborn as sns
from phik import resources, report, phik_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Lift pandas' display limits: show every column and never truncate cell text.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:

# Read the training set; the path is relative to the current working directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')

# Preview the first five rows.
train.head(n=5)

In [None]:
# True when the number of distinct ids equals the number of rows.
len(train) == train['id'].nunique()

In [None]:
# Promote 'id' to the index. The guard makes the cell safe to re-run: once
# 'id' is no longer a regular column, a second set_index('id') would raise
# KeyError, so the step is skipped in that case.
if 'id' in train.columns:
    train = train.set_index('id')

Смотрим распределение таргета

In [None]:
sns.set()

# countplot places the 'target' values on the x axis and the number of rows
# per value on the y axis, so the axis labels are assigned accordingly.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train, x='target', ax=ax)
ax.set_title('Target distribution')
ax.set_xlabel('target')
ax.set_ylabel('count')
plt.show()

In [None]:
# Per-column flag: True when at least one cell in the column equals -1.
has_minus_one = train.eq(-1).any(axis=0)

# Keep only the flagged column names, in their original order.
columns_with_minus_one = has_minus_one[has_minus_one].index.tolist()
columns_with_minus_one

In [None]:
# Columns whose name contains 'cat' are treated as categorical features.
cat_features = [col for col in train.columns if 'cat' in col]

# Derive the grid from the number of columns instead of hard-coding 5x3:
# with a fixed grid plt.subplot raises as soon as there are more than 15
# categorical columns. max(1, ...) keeps the grid valid for an empty list.
n_cols = 3
n_rows = max(1, -(-len(cat_features) // n_cols))  # ceiling division

# 4 inches per row reproduces the original 15x20 figure for 5 rows.
plt.figure(figsize=(15, 4 * n_rows))

for i, col in enumerate(cat_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.countplot(data=train, x=col)
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
# Everything that is neither categorical ('cat'), binary ('bin') nor the
# target itself is treated as a numeric feature.
num_features = [
    col for col in train.columns
    if 'cat' not in col and 'bin' not in col and col != 'target'
]

# Derive the grid from the number of columns instead of hard-coding 9x3:
# with a fixed grid plt.subplot raises as soon as there are more than 27
# numeric columns. max(1, ...) keeps the grid valid for an empty list.
n_cols = 3
n_rows = max(1, -(-len(num_features) // n_cols))  # ceiling division

# 10/3 inches per row reproduces the original 15x30 figure for 9 rows.
plt.figure(figsize=(15, 10 * n_rows / 3))

for i, col in enumerate(num_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(train[col], kde=False, bins=30)
    plt.title(col)
    plt.xlabel(col)

plt.tight_layout()
plt.show()

In [None]:
# The matrix is computed on a row sample (presumably to keep the runtime
# manageable). The sample size is capped at the frame length, because asking
# for more rows than exist raises ValueError, and the draw is seeded so the
# matrix -- and the drop list derived from it -- is the same on every run.
phik_sample = train.sample(n=min(100000, len(train)), random_state=42)

# NOTE: this name shadows the `phik_matrix` function imported from phik. It is
# kept because later cells read the matrix under this name.
phik_matrix = phik_sample.phik_matrix(interval_cols=num_features)

fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    phik_matrix,
    annot=False,
    cmap='coolwarm',
    linewidths=0.1,
    ax=ax,
)
ax.set_title('Матрица корреляции Phik для всех признаков', fontsize=20)
plt.show()

In [None]:
threshold = 0.8

# The redundancy filter should compare features with each other only.
# 'target' is part of phik_matrix (the matrix was computed on the full `train`
# frame), so it is removed first: otherwise a feature strongly associated with
# the target could be marked for removal, or 'target' itself could land in the
# drop list and break the later df['target'] lookup.
feature_phik = phik_matrix.drop(index='target', columns='target', errors='ignore')

# Keep only the strict upper triangle so every feature pair is inspected once;
# cells on or below the diagonal become NaN and never pass the comparison.
upper_mask = np.triu(np.ones(feature_phik.shape, dtype=bool), k=1)
upper = feature_phik.where(upper_mask)

# A column is dropped when it exceeds the threshold against any column that
# precedes it in the matrix order, so the first column of each pair is kept.
columns_to_drop = upper.columns[(upper > threshold).any()].tolist()

print(f"Список на удаление: {columns_to_drop}")

In [None]:
# Working frame for modelling: `train` without the columns selected for
# removal. drop() returns a new frame, so `train` itself stays untouched.
df = train.drop(columns=columns_to_drop)

# Baseline

In [None]:
# Separate the feature matrix from the label column.
X = df.drop(columns='target')
y = df['target']

# Stratified 80/20 hold-out split, seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The scaler is fitted on the training part only and then reused for the
# validation part, so no validation statistics leak into the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# L1-penalised logistic regression with class re-weighting.
model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    C=0.2,
    random_state=42,
    max_iter=1000
)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba holds the probability of the second class.
probs = model.predict_proba(X_val_scaled)[:, 1]
auc = roc_auc_score(y_val, probs)
gini = 2 * auc - 1

# Both metrics are reported with the same precision.
print(f"Baseline ROC-AUC: {auc:.4f}")
print(f"Baseline Gini: {gini:.4f}")


In [None]:
# Hard class predictions for the validation part, then their accuracy.
preds = model.predict(X_val_scaled)
accuracy_score(y_true=y_val, y_pred=preds)

In [None]:
# Rows of the matrix are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_val, y_pred=preds)

# Visualization
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=ax,
)
ax.set_ylabel('Реальные значения')
ax.set_xlabel('Предсказанные значения')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Rank features by the absolute value of their logistic-regression
# coefficient, largest first.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.coef_[0]})
    .assign(importance=lambda frame: frame['importance'].abs())
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = feature_importance.head(20)
plt.figure(figsize=(10, 8))
sns.barplot(data=top_features, x='importance', y='feature', palette='vlag')