<a href="https://colab.research.google.com/github/RomanBr89/HW4/blob/main/hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

df = pd.read_csv('Titanic-Dataset.csv')


def preprocess_data(df):
    # Заполнение пропущенных значений
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Кодирование категориальных переменных
    le = LabelEncoder()
    df['Sex'] = le.fit_transform(df['Sex'])

    # Выбор признаков
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
    X = df[features]
    y = df['Survived']

    return X, y

X, y = preprocess_data(df)

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Масштабирование признаков для SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight

# Вычисление весов классов
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

# Градиентный бустинг
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb_classifier.fit(X_train, y_train)
gb_pred = gb_classifier.predict(X_test)

# Метрики
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred)
gb_recall = recall_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred)

print('GradientBoost:')
print(f'Accuracy: {gb_accuracy:.4f}')
print(f'Precision: {gb_precision:.4f}')
print(f'Recall: {gb_recall:.4f}')
print(f'F1-Score: {gb_f1:.4f}')
print('-' * 50)

GradientBoost:
Accuracy: 0.7989
Precision: 0.8235
Recall: 0.6087
F1-Score: 0.7000
--------------------------------------------------


In [5]:
!pip install catboost
from catboost import CatBoostClassifier

# CatBoost
cb_classifier = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=False
)

cb_classifier.fit(X_train, y_train)
cb_pred = cb_classifier.predict(X_test)

# Метрики
cb_accuracy = accuracy_score(y_test, cb_pred)
cb_precision = precision_score(y_test, cb_pred)
cb_recall = recall_score(y_test, cb_pred)
cb_f1 = f1_score(y_test, cb_pred)

print('CatBoost:')
print(f'Accuracy: {cb_accuracy:.4f}')
print(f'Precision: {cb_precision:.4f}')
print(f'Recall: {cb_recall:.4f}')
print(f'F1-Score: {cb_f1:.4f}')
print('-' * 50)

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
CatBoost:
Accuracy: 0.7765
Precision: 0.7736
Recall: 0.5942
F1-Score: 0.6721
--------------------------------------------------


In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Ada Boost
ada_classifier = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

ada_classifier.fit(X_train, y_train)
ada_pred = ada_classifier.predict(X_test)

# Метрики
ada_accuracy = accuracy_score(y_test, ada_pred)
ada_precision = precision_score(y_test, ada_pred)
ada_recall = recall_score(y_test, ada_pred)
ada_f1 = f1_score(y_test, ada_pred)

print('Ada Boost:')
print(f'Accuracy: {ada_accuracy:.4f}')
print(f'Precision: {ada_precision:.4f}')
print(f'Recall: {ada_recall:.4f}')
print(f'F1-Score: {ada_f1:.4f}')
print('-' * 50)

Ada Boost:
Accuracy: 0.8045
Precision: 0.7656
Recall: 0.7101
F1-Score: 0.7368
--------------------------------------------------


In [7]:
from lightgbm import LGBMClassifier

# LightGBM
lgb_classifier = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

lgb_classifier.fit(X_train, y_train)
lgb_pred = lgb_classifier.predict(X_test)

# Метрики
lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_precision = precision_score(y_test, lgb_pred)
lgb_recall = recall_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred)

print('LightGBM:')
print(f'Accuracy: {lgb_accuracy:.4f}')
print(f'Precision: {lgb_precision:.4f}')
print(f'Recall: {lgb_recall:.4f}')
print(f'F1-Score: {lgb_f1:.4f}')
print('-' * 50)

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
LightGBM:
Accuracy: 0.7765
Precision: 0.7736
Recall: 0.5942
F1-Score: 0.6721
--------------------------------------------------


In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# SVM с линейным ядром
svm_classifier = SVC(
    kernel='linear',
    C=1.0,
    random_state=42)

svm_classifier.fit(X_train_scaled, y_train)
svm_pred = svm_classifier.predict(X_test_scaled)

# Метрики
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)

print('SVM (линейное ядро)')
print(f'Accuracy: {svm_accuracy:.4f}')
print(f'Precision: {svm_precision:.4f}')
print(f'Recall: {svm_recall:.4f}')
print(f'F1-Score: {svm_f1:.4f}')
print('-' * 50)

SVM (линейное ядро)
Accuracy: 0.7765
Precision: 0.7377
Recall: 0.6522
F1-Score: 0.6923
--------------------------------------------------


In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight

# Подготовка стратегий перевзвешивания
strategies = {
    'original': (X_train, y_train),
    'oversampling': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'undersampling': RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)
}

# Для SVM - масштабированные версии
strategies_scaled = {
    'original': (X_train_scaled, y_train),
    'oversampling': SMOTE(random_state=42).fit_resample(X_train_scaled, y_train),
    'undersampling': RandomUnderSampler(random_state=42).fit_resample(X_train_scaled, y_train)
}

In [10]:
# Функция для тестирования со стратегиями
def test_with_strategies(classifier, strategies_dict, X_test, y_test, model_name):
    results = []

    for strategy_name, (X_resampled, y_resampled) in strategies_dict.items():
        classifier.fit(X_resampled, y_resampled)
        y_pred = classifier.predict(X_test)

        results.append({
            'Model': model_name,
            'Strategy': strategy_name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred)
        })

    return results

In [11]:
# Тестируем градиентный бустинг со стратегиями

gb_strategies_results = []
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

results = test_with_strategies(gb_classifier, strategies, X_test, y_test, 'Gradient Boosting')
gb_strategies_results.extend(results)

print('GradientBoost с перевзвешиванием')
for result in results:
    print(f"  {result['Strategy']}: Accuracy={result['Accuracy']:.4f}, F1={result['F1']:.4f}")

GradientBoost с перевзвешиванием
  original: Accuracy=0.7989, F1=0.7000
  oversampling: Accuracy=0.7765, F1=0.6875
  undersampling: Accuracy=0.7989, F1=0.7429


In [12]:
# Тестируем CatBoost со стратегиями

cb_strategies_results = []
cb_classifier = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=False)

results = test_with_strategies(cb_classifier, strategies, X_test, y_test, 'CatBoost')
cb_strategies_results.extend(results)

print('CatBoost с перевзвешиванием:')
for result in results:
    print(f"  {result['Strategy']}: Accuracy={result['Accuracy']:.4f}, F1={result['F1']:.4f}")

CatBoost с перевзвешиванием:
  original: Accuracy=0.7765, F1=0.6721
  oversampling: Accuracy=0.7933, F1=0.7299
  undersampling: Accuracy=0.7933, F1=0.7483


In [13]:
# Тестируем AdaBoost со стратегиями

ada_strategies_results = []
ada_classifier = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

results = test_with_strategies(ada_classifier, strategies, X_test, y_test, 'Ada Boost')
ada_strategies_results.extend(results)

print('AdaBoost с перевзвешиванием:')
for result in results:
    print(f"  {result['Strategy']}: Accuracy={result['Accuracy']:.4f}, F1={result['F1']:.4f}")

AdaBoost с перевзвешиванием:
  original: Accuracy=0.8045, F1=0.7368
  oversampling: Accuracy=0.7821, F1=0.7194
  undersampling: Accuracy=0.7821, F1=0.7347


In [14]:
# Тестируем LightGBM со стратегиями

lgb_strategies_results = []
lgb_classifier = LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

results = test_with_strategies(lgb_classifier, strategies, X_test, y_test, 'LightGBM')
lgb_strategies_results.extend(results)

print("LightGBM с перевзвешиванием:")
for result in results:
    print(f"  {result['Strategy']}: Accuracy={result['Accuracy']:.4f}, F1={result['F1']:.4f}")

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 439, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 6
[LightGBM] [Info] [binary:BoostFro

In [15]:
# Тестируем SVM со стратегиями

svm_strategies_results = []
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

results = test_with_strategies(svm_classifier, strategies_scaled, X_test_scaled, y_test, 'SVM')
svm_strategies_results.extend(results)

print('SVM с перевзвешиванием:')
for result in results:
    print(f"  {result['Strategy']}: Accuracy={result['Accuracy']:.4f}, F1={result['F1']:.4f}")

SVM с перевзвешиванием:
  original: Accuracy=0.7765, F1=0.6923
  oversampling: Accuracy=0.7765, F1=0.6923
  undersampling: Accuracy=0.7765, F1=0.6923


In [16]:
# Сводные результаты со стратегиями
all_strategies_results = (
    gb_strategies_results +
    cb_strategies_results +
    ada_strategies_results +
    lgb_strategies_results +
    svm_strategies_results
)

strategies_df = pd.DataFrame(all_strategies_results)
print('Сводные результаты с перевзвешиванием:')
print(strategies_df.round(4))

# Лучшая комбинация модель+стратегия
best_combo = strategies_df.loc[strategies_df['F1'].idxmax()]
print(f'\nЛучшая комбинация:')
print(f'Модель: {best_combo['Model']}')
print(f'Стратегия: {best_combo['Strategy']}')
print(f'F1-Score: {best_combo['F1']:.4f}')

# Сравнение эффективности стратегий
print(f'\nСравнение стратегий:')
for strategy in ['original', 'oversampling', 'undersampling']:
    strategy_data = strategies_df[strategies_df['Strategy'] == strategy]
    avg_f1 = strategy_data['F1'].mean()
    print(f"{strategy}: средний F1 = {avg_f1:.4f}")

Сводные результаты с перевзвешиванием:
                Model       Strategy  Accuracy  Precision  Recall      F1
0   Gradient Boosting       original    0.7989     0.8235  0.6087  0.7000
1   Gradient Boosting   oversampling    0.7765     0.7458  0.6377  0.6875
2   Gradient Boosting  undersampling    0.7989     0.7324  0.7536  0.7429
3            CatBoost       original    0.7765     0.7736  0.5942  0.6721
4            CatBoost   oversampling    0.7933     0.7353  0.7246  0.7299
5            CatBoost  undersampling    0.7933     0.7051  0.7971  0.7483
6           Ada Boost       original    0.8045     0.7656  0.7101  0.7368
7           Ada Boost   oversampling    0.7821     0.7143  0.7246  0.7194
8           Ada Boost  undersampling    0.7821     0.6923  0.7826  0.7347
9            LightGBM       original    0.7765     0.7736  0.5942  0.6721
10           LightGBM   oversampling    0.7933     0.7667  0.6667  0.7132
11           LightGBM  undersampling    0.7989     0.7538  0.7101  0.7313