In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from time import time

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('filtered_dataset.csv')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,NDWI,NDMI,MNDWI,WRI,NDVI,AWEI
0,206,439,275,860,2306,2934,2619,3185,1551,762,0,-0.712884,0.256115,-0.558794,0.171223,0.809952,-7198.25
1,216,390,326,672,1526,1774,2144,2075,1218,576,0,-0.692186,0.275431,-0.514925,0.212968,0.736032,-5432.00
2,170,377,266,732,1807,2122,2146,2391,1287,642,0,-0.701149,0.250218,-0.546875,0.187300,0.779436,-5942.00
3,247,558,414,1082,2273,2564,2601,2925,2124,1084,0,-0.646724,0.100952,-0.583893,0.205714,0.725373,-9895.25
4,194,496,378,905,1896,2204,2602,2571,1739,828,0,-0.679793,0.198802,-0.556152,0.201336,0.746309,-7899.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538078,632,835,1114,1219,1015,1007,1256,1062,729,505,1,-0.201339,0.265491,0.067775,0.981864,0.059916,-1278.75
538079,626,944,1078,992,427,375,323,214,33,33,1,0.490134,0.814607,0.932446,5.679775,-0.538901,3472.50
538080,646,968,1106,1001,437,408,344,203,49,35,1,0.475610,0.750636,0.903638,5.277354,-0.525517,3493.75
538081,605,904,1036,922,394,353,287,192,33,29,1,0.518052,0.793750,0.929562,6.062500,-0.566138,3332.50


In [3]:
# Определение признаков и целевой переменной
X = df.iloc[:, :10].join(df[['NDWI', 'NDMI', 'MNDWI', 'WRI', 'NDVI', 'AWEI']])  # Признаки: все колонки от 0 до 9 + новые
y = df.iloc[:, 10]   # Целевая переменная: колонка "water"

In [7]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
models = {
    'CatBoost': CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

# Обучение и оценка каждой модели
for name, model in models.items():
    start_time = time()  # Начало измерения времени
    model.fit(X_train, y_train)  # Обучение модели
    end_time = time()  # Конец измерения времени
    
    # Предсказание на тестовом наборе
    y_pred = model.predict(X_test)
    
    # Оценка производительности модели
    print(f"\n{name}:\n")
    print(f"Training Time: {end_time - start_time:.2f} seconds")  # Время обучения
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


CatBoost:

Training Time: 3.66 seconds
Confusion Matrix:
[[103766    243]
 [   520   3088]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    104009
           1       0.93      0.86      0.89      3608

    accuracy                           0.99    107617
   macro avg       0.96      0.93      0.94    107617
weighted avg       0.99      0.99      0.99    107617



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression:

Training Time: 25.76 seconds
Confusion Matrix:
[[103845    164]
 [   742   2866]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    104009
           1       0.95      0.79      0.86      3608

    accuracy                           0.99    107617
   macro avg       0.97      0.90      0.93    107617
weighted avg       0.99      0.99      0.99    107617


Decision Tree:

Training Time: 21.58 seconds
Confusion Matrix:
[[103426    583]
 [   586   3022]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    104009
           1       0.84      0.84      0.84      3608

    accuracy                           0.99    107617
   macro avg       0.92      0.92      0.92    107617
weighted avg       0.99      0.99      0.99    107617


Random Forest:

Training Time: 331.85 seconds
Confusion Matrix:
[[103789    220]
 [   519   3

In [13]:
import joblib
import os
import tempfile
from time import time
from sklearn.metrics import confusion_matrix, classification_report


In [14]:

# Словарь для хранения размера памяти каждой модели
model_sizes = {}

# Обучение, оценка и измерение размера каждой модели
for name, model in models.items():
    start_time = time()  # Начало измерения времени
    model.fit(X_train, y_train)  # Обучение модели
    end_time = time()  # Конец измерения времени
    
    # Предсказание на тестовом наборе
    y_pred = model.predict(X_test)
    
    # Оценка производительности модели
    print(f"\n{name}:\n")
    print(f"Training Time: {end_time - start_time:.2f} seconds")  # Время обучения
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Временное сохранение модели для определения размера
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        tmp_file.close()  # Закрытие файла перед использованием
        joblib.dump(model, tmp_file.name)
        model_size = os.path.getsize(tmp_file.name)  # Размер модели в байтах
        model_sizes[name] = model_size / (1024 ** 2)  # Конвертируем в мегабайты
    finally:
        os.remove(tmp_file.name)  # Удаляем временный файл

# Вывод размера памяти каждой модели
print("\nModel Sizes (in MB):")
for name, size in model_sizes.items():
    print(f"{name}: {size:.2f} MB")



CatBoost:

Training Time: 3.52 seconds
Confusion Matrix:
[[103766    243]
 [   520   3088]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    104009
           1       0.93      0.86      0.89      3608

    accuracy                           0.99    107617
   macro avg       0.96      0.93      0.94    107617
weighted avg       0.99      0.99      0.99    107617



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression:

Training Time: 27.14 seconds
Confusion Matrix:
[[103845    164]
 [   742   2866]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    104009
           1       0.95      0.79      0.86      3608

    accuracy                           0.99    107617
   macro avg       0.97      0.90      0.93    107617
weighted avg       0.99      0.99      0.99    107617


Decision Tree:

Training Time: 20.74 seconds
Confusion Matrix:
[[103421    588]
 [   593   3015]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    104009
           1       0.84      0.84      0.84      3608

    accuracy                           0.99    107617
   macro avg       0.92      0.91      0.92    107617
weighted avg       0.99      0.99      0.99    107617


Random Forest:

Training Time: 333.17 seconds
Confusion Matrix:
[[103778    231]
 [   522   3