In [2]:
import pandas as pd
from pathlib import Path

data_dir = Path('dataframes')

csv_files = list(data_dir.glob('*.csv'))

datasets = {}
for path in csv_files:
    name = path.stem
    datasets[name] = pd.read_csv(path)

print(*datasets.keys(), sep='\n')
datasets['cars_100_pairs_11_n_feature']

cars_100_pairs_11_n_feature
cars_1309_pairs_6_n_feature
cars_1385_pairs_13_n_feature
cars_1430_pairs_8_n_feature
cars_191_pairs_9_n_feature
cars_215_pairs_5_n_feature
cars_50_pairs_9_n_feature
cars_680_pairs_11_n_feature
cars_68_pairs_4_n_feature
cars_766_pairs_6_n_feature
cars_95_pairs_12_n_feature
cars_988_pairs_8_n_feature


Unnamed: 0,1_двигатель_включен,1_ручной_тормоз,1_фары_включены,1_тип_кузова,1_привод,1_топливо,1_скорость,1_угол_поворота,1_дистанция_до_объекта,2_двигатель_включен,2_ручной_тормоз,2_фары_включены,2_тип_кузова,2_привод,2_топливо,2_скорость,2_угол_поворота,2_дистанция_до_объекта,collision
0,0.0,0.0,0.0,3.0,0.0,0.0,74.356400,83.777309,16.594605,1.0,1.0,0.0,3.0,2.0,1.0,63.715439,178.935823,6.430087,0
1,0.0,0.0,0.0,3.0,0.0,0.0,74.356400,83.777309,16.594605,0.0,0.0,1.0,1.0,2.0,1.0,167.259286,137.831955,32.796486,0
2,0.0,0.0,1.0,1.0,2.0,1.0,167.259286,137.831955,32.796486,1.0,1.0,0.0,3.0,2.0,1.0,63.715439,178.935823,6.430087,0
3,1.0,0.0,0.0,2.0,0.0,2.0,52.521583,95.132455,92.530939,0.0,1.0,1.0,2.0,2.0,0.0,90.465410,12.519434,4.706553,0
4,0.0,1.0,0.0,3.0,1.0,0.0,115.454234,143.663318,57.916933,1.0,0.0,0.0,2.0,1.0,0.0,11.063215,-19.606858,108.576771,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,1.0,1.0,2.0,2.0,0.0,90.465410,12.519434,4.706553,1.0,0.0,0.0,2.0,1.0,0.0,11.063215,-19.606858,108.576771,0
96,0.0,0.0,0.0,2.0,2.0,1.0,130.893664,-34.517205,62.200079,0.0,0.0,1.0,1.0,2.0,1.0,167.259286,137.831955,32.796486,1
97,0.0,0.0,0.0,2.0,2.0,1.0,130.893664,-34.517205,62.200079,0.0,0.0,0.0,3.0,0.0,0.0,74.356400,83.777309,16.594605,0
98,0.0,1.0,0.0,3.0,1.0,0.0,115.454234,143.663318,57.916933,1.0,1.0,0.0,3.0,2.0,1.0,63.715439,178.935823,6.430087,0


In [3]:
from sklearn.model_selection import train_test_split

results = {}

for data_name, data in datasets.items():
    X = data.drop('collision', axis=1)
    y = data['collision']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )
    
    results[data_name] = (X_train, X_test, y_train, y_test)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# 1. Logistic Regression
#    — простой и интерпретируемый метод: коэффициенты показывают вклад признаков.
#    — часто служит хорошим базовым решением для линейно разделимых данных.
lr = LogisticRegression(max_iter=1000)

# 2. Decision Tree
#    — не требует нормализации признаков и автоматически выявляет нелинейные зависимости.
#    — легко визуализируется, что помогает в объяснении решений модели.
dt = DecisionTreeClassifier()

# 3. k-Nearest Neighbors (kNN)
#    — ленивый алгоритм, не обучается заранее, подходит для задач с небольшим числом признаков.
#    — хорошо работает при четкой кластеризации данных в пространстве признаков.
knn = KNeighborsClassifier(n_neighbors=5)

# 4. Gaussian Naive Bayes
#    — простой вероятностный метод, особенно эффективен на разреженных данных и при многомерных признаках.
#    — быстрый в обучении и предсказании, не требующий большого объема памяти.
nb = GaussianNB()

# Собираем все классификаторы в список для удобства использования:
classifiers = [
    ('Logistic Regression', lr),
    ('Decision Tree', dt),
    ('k-Nearest Neighbors', knn),
    ('Gaussian Naive Bayes', nb),
]

In [5]:
import time
import psutil
import tracemalloc

def measure_model(model, X_train, y_train):
    proc = psutil.Process()
    tracemalloc.start()
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    t1 = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
    return t1 - t0, current

res = []
for ds_name, (X_train, X_test, y_train, y_test) in results.items():
    for name, model in classifiers:
        # клонируем модель, чтобы каждый раз стартовать «с чистого листа»
        m = model.__class__(**model.get_params())
        fit_time, mem_used = measure_model(m, X_train, y_train)
        res.append({
            'Dataset': ds_name,
            'Model': name,
            'Fit Time (s)': fit_time,
            'Memory Used (bytes)': mem_used
        })

df = pd.DataFrame(res)
df

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Dataset,Model,Fit Time (s),Memory Used (bytes)
0,cars_100_pairs_11_n_feature,Logistic Regression,0.283701,23265
1,cars_100_pairs_11_n_feature,Decision Tree,0.015807,24330
2,cars_100_pairs_11_n_feature,k-Nearest Neighbors,0.003221,43521
3,cars_100_pairs_11_n_feature,Gaussian Naive Bayes,0.003994,33325
4,cars_1309_pairs_6_n_feature,Logistic Regression,0.01377,42531
5,cars_1309_pairs_6_n_feature,Decision Tree,0.005043,42528
6,cars_1309_pairs_6_n_feature,k-Nearest Neighbors,0.007477,279015
7,cars_1309_pairs_6_n_feature,Gaussian Naive Bayes,0.005994,44309
8,cars_1385_pairs_13_n_feature,Logistic Regression,0.464717,247302
9,cars_1385_pairs_13_n_feature,Decision Tree,0.009381,247481


In [6]:
import numpy as np

res_list = []

for name, _ in classifiers:
    avg_time = df.loc[df['Model'] == name, 'Fit Time (s)'].mean()
    avg_mem = df.loc[df['Model'] == name, 'Memory Used (bytes)'].mean()
    res_list.append({
        'Model': name,
        'avg_time (s)': avg_time,
        'avg_memory (bytes)': avg_mem
    })

res_df = pd.DataFrame(res_list)
res_df

Unnamed: 0,Model,avg_time (s),avg_memory (bytes)
0,Logistic Regression,0.150215,244961.583333
1,Decision Tree,0.006314,245014.583333
2,k-Nearest Neighbors,0.004611,345862.25
3,Gaussian Naive Bayes,0.004439,247067.5


In [7]:
import joblib

joblib.dump(nb, 'naive_bayes')
joblib.dump(dt, 'dt')

['dt']

# Вывод
Худшим по времени получилась модель логистической регрессии, лучше - наивная гауссовская модель, по памяти: худшая - kNN, лучшая - логистическая, учитывая огромный объём памяти kNN по сравнению с остальными я буду считать эту модель худшей из 4х, т.к. лучше буду жертвовать памятью, чем временем, ведь вычислительные силы машины небесконечны. Среди всех лучшая модель - наивный байес/DT. Обе модели равны.