In [100]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

# Columns kept for the analysis ('y' is later split off as the target)
selected_features = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

# Load the semicolon-separated CSV and keep only the selected columns
df = pd.read_csv('bank-full.csv', sep=';')[selected_features]

print("Размер данных после отбора признаков:", df.shape)
print(df.dtypes)
# Transposed preview: one column per sample row, one row per feature
df.head().T

Размер данных после отбора признаков: (45211, 15)
age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [101]:
# Per-column count of missing (NaN/None) entries
missing_values = df.isna().sum()
print("=== ПРОВЕРКА ПРОПУЩЕННЫХ ЗНАЧЕНИЙ ===")
print(missing_values)

=== ПРОВЕРКА ПРОПУЩЕННЫХ ЗНАЧЕНИЙ ===
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [102]:
# Question 1: most frequent value of the `education` column
education_counts = df['education'].value_counts()
education_mode = df['education'].mode().iloc[0]

print(education_counts)
print(f"\nОтвет: {education_mode}")

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

Ответ: secondary


In [103]:
# Question 2: which pair of numeric features has the largest absolute correlation
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numeric_df = df[numeric_features]

# Pairwise correlation matrix of the numeric columns
correlation_matrix = numeric_df.corr()
print(correlation_matrix)

# Collect (feature_a, feature_b, |corr|) for every cell above the diagonal,
# so each unordered pair appears exactly once and the diagonal is skipped
matrix_cols = correlation_matrix.columns
n_cols = len(matrix_cols)
corr_pairs = [
    (matrix_cols[row], matrix_cols[col], abs(correlation_matrix.iloc[row, col]))
    for row in range(n_cols)
    for col in range(row + 1, n_cols)
]

# Strongest absolute correlation first
corr_pairs_sorted = sorted(corr_pairs, key=lambda entry: entry[2], reverse=True)

print("\nКорреляция пар:")
for first, second, strength in corr_pairs_sorted[:5]:
    print(f"{first} и {second}: {strength:.6f}")

top_first, top_second, top_strength = corr_pairs_sorted[0]
print("\nОтвет:", f"{top_first} и {top_second}: {top_strength:.10f}")

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000

Корреляция пар:
pdays и previous: 0.454820
day и campaign: 0.162490
age и balance: 0.097783
day и pdays: 0.093044
campaign и pdays: 0.088628

Ответ: pdays и previous: 0.4548196355


In [104]:
# Encode the target as 0/1 ('yes' -> 1, everything else -> 0).
# The guard makes this cell safe to re-run: once `y` is already integer,
# comparing it to 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)
print(df['y'].value_counts())
print('Видим сильный перевес в пользу 0')

# Separate the features from the target
X = df.drop('y', axis=1)
y = df['y']


# First split off the test set: 80% (train + validation) vs 20% (test),
# stratified on the target so the class ratio is kept in both parts
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split the remaining 80% into train and validation.
# test_size=0.25 because 0.25 * 0.8 = 0.2 of the original data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# Sanity check: the three splits partition the full dataset
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print(f"Обучающая: {X_train.shape[0]} записей ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Валидация: {X_val.shape[0]} записей ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"Тестовая: {X_test.shape[0]} записей ({X_test.shape[0]/len(X)*100:.1f}%)")

y
0    39922
1     5289
Name: count, dtype: int64
Видим сильный перевес в пользу 0
Обучающая: 27126 записей (60.0%)
Валидация: 9042 записей (20.0%)
Тестовая: 9043 записей (20.0%)


In [105]:
# Question 3: mutual information between each categorical feature and the
# target, computed on the training split only
features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Integer-encode every categorical column (one fresh encoder per column)
X_encoded = X_train[features].copy()
for col in features:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_train[col])

# discrete_features=True is required here: with the default 'auto' a dense
# matrix is treated as continuous, so the arbitrary integer codes would be
# interpreted as numeric magnitudes instead of category labels.
mi_scores = mutual_info_classif(
    X_encoded, y_train, discrete_features=True, random_state=42
)

# Keep the unrounded scores for ranking; round to 2 decimals only for display
mi_raw = dict(zip(features, mi_scores))
mi_rounded = {feat: round(score, 2) for feat, score in mi_raw.items()}

for feat in features:
    print(f"{feat}: {mi_rounded[feat]}")

# Rank on the unrounded scores so that two features which round to the same
# value are not resolved by dictionary order
best_feature = max(mi_raw, key=mi_raw.get)
print(f"\nОтвет: {best_feature}")

job: 0.01
marital: 0.0
education: 0.0
housing: 0.01
contact: 0.01
month: 0.02
poutcome: 0.03

Ответ: poutcome


In [106]:
# Question 4: logistic regression on one-hot encoded features
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# One-hot encode the training split; drop_first removes one baseline level
# per categorical feature.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)

# Encode validation WITHOUT drop_first and align it to the training layout.
# Running drop_first independently on each split can drop a different
# baseline level when a category is absent from one split, which would make
# the same column mean different things in train and validation.
# Columns unseen in training are discarded; training columns missing from
# validation are added as all-zero indicators.
X_val_encoded = (
    pd.get_dummies(X_val, columns=categorical_features)
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)

# Both matrices must expose identical columns in identical order
assert list(X_train_encoded.columns) == list(X_val_encoded.columns)

# Fit the logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Validation predictions and accuracy
y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"Ответ: {accuracy:.2f}")

Ответ: 0.90


In [107]:
# Question 5: drop one feature at a time and compare validation accuracy
# against the baseline stored in `accuracy`
features_to_test = ['age', 'balance', 'marital', 'previous']
accuracy_differences = {}

for feature in features_to_test:
    if feature in categorical_features:
        # A categorical feature is spread over several one-hot columns
        # named "<feature>_<level>"; remove all of them
        dropped = [name for name in X_train_encoded.columns if name.startswith(feature + '_')]
    else:
        # A numeric feature is a single column
        dropped = [feature]

    X_train_reduced = X_train_encoded.drop(columns=dropped)
    X_val_reduced = X_val_encoded.drop(columns=dropped)

    # Same model configuration as the baseline, fitted on the reduced matrix
    model_reduced = LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    ).fit(X_train_reduced, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    # Positive value: accuracy fell when the feature was removed
    difference = accuracy - accuracy_reduced
    accuracy_differences[feature] = difference

    print(f"Без {feature}: точность = {accuracy_reduced:.4f}, разница = {difference:.4f}")

# The feature whose removal changes accuracy the least (by absolute value)
min_diff_feature = min(accuracy_differences, key=lambda name: abs(accuracy_differences[name]))
print(f"\n Ответ: {min_diff_feature} (разница = {accuracy_differences[min_diff_feature]:.4f})")

Без age: точность = 0.9027, разница = 0.0009
Без balance: точность = 0.9036, разница = 0.0000
Без marital: точность = 0.9023, разница = 0.0012
Без previous: точность = 0.9026, разница = 0.0010

 Ответ: balance (разница = 0.0000)


In [108]:
# Question 6: sweep the regularisation parameter C and keep the value with
# the highest validation accuracy
C_values = [0.01, 0.1, 1, 10]
best_accuracy = 0
best_C = None

for C in C_values:
    # Loop-local names (`model_c`, `accuracy_c`) are used on purpose: reusing
    # `model` / `accuracy` here would overwrite the Question 4 results that
    # other cells still read.
    model_c = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_c.fit(X_train_encoded, y_train)

    accuracy_c = accuracy_score(y_val, model_c.predict(X_val_encoded))

    print(f"C = {C}: точность = {accuracy_c:.3f}")

    # Strict '>' keeps the first (smallest) C when accuracies are exactly equal
    if accuracy_c > best_accuracy:
        best_accuracy = accuracy_c
        best_C = C

print(f"\n Ответ: Лучшее значение C: {best_C} с точностью {best_accuracy:.3f}")

C = 0.01: точность = 0.899
C = 0.1: точность = 0.903
C = 1: точность = 0.904
C = 10: точность = 0.903

 Ответ: Лучшее значение C: 1 с точностью 0.904


In [113]:
# Summary of the answers to questions 1-6, one per line
print("\nОТВЕТЫ")
print(education_mode)
print(f"{corr_pairs_sorted[0][0]} и {corr_pairs_sorted[0][1]}")
# `best_feature` is the name assigned in the Question 3 cell; the previous
# `mi_results` lookup referenced a variable that no cell defines and raised
# NameError on a fresh kernel.
print(best_feature)
# NOTE(review): `accuracy` holds whatever was assigned to it last; confirm it
# is still the Question 4 validation accuracy when this cell runs.
print(f"{accuracy:.2f}")
print(min_diff_feature)
print(best_C)


ОТВЕТЫ
secondary
pdays и previous
poutcome
0.90
balance
1
