In [None]:
import numpy as np
import pandas as pd
import random
from itertools import product
from pathlib import Path

# Directory that the generated CSV datasets are written to.
save_folder = "datasets_storage"

# (min, max) bounds from which the number of rows of each dataset is drawn.
rows_ranges = [(30, 100), (100, 500), (500, 1000), (1000, 1500)]

# (min, max) bounds from which the number of features per object is drawn.
features_ranges = [(4, 7), (8, 10), (11, 15)]

# Feature kinds understood by create_feature().
types_of_features = ["binary", "nominal", "ordinal", "quantitative"]


def create_feature(ftype: str, length: int):
    """Return a random NumPy array of ``length`` values for one feature.

    Supported ``ftype`` values:

    * ``"binary"`` -- integers drawn from {0, 1};
    * ``"nominal"`` -- letters drawn from {'A', 'B', 'C', 'D'};
    * ``"ordinal"`` -- integers drawn from 1..5;
    * ``"quantitative"`` -- floats drawn uniformly from [0, 100).

    Raises:
        ValueError: if ``ftype`` is not one of the supported kinds.
    """
    # The only continuous kind is handled on its own.
    if ftype == "quantitative":
        return np.random.uniform(0, 100, size=length)

    # Every remaining kind is a uniform draw from a small fixed set of levels.
    categorical_levels = (
        ("binary", [0, 1]),
        ("nominal", ['A', 'B', 'C', 'D']),
        ("ordinal", [1, 2, 3, 4, 5]),
    )
    for kind, levels in categorical_levels:
        if ftype == kind:
            return np.random.choice(levels, size=length)

    raise ValueError("Неподдерживаемый тип признака")


def create_sample_dataset(feature_num: int, sample_num: int) -> pd.DataFrame:
    """Build a synthetic dataset describing pairs of objects.

    Each row holds ``feature_num`` features for object 1 (``Obj1_Feat<k>``)
    and the same number for object 2 (``Obj2_Feat<k>``). Feature ``k`` has the
    same type for both objects. The ``Collision`` column is ``"Да"`` when at
    least ``feature_num // 2`` features are exactly equal between the two
    objects, and ``"Нет"`` otherwise.

    Args:
        feature_num: number of features generated per object.
        sample_num: number of rows in the dataset.

    Returns:
        A DataFrame with ``2 * feature_num`` feature columns followed by the
        ``Collision`` label column.
    """
    # Use each available type once while there is room, then pad with random
    # types. The count comes from the list itself rather than a literal, so
    # editing ``types_of_features`` cannot make random.sample() raise.
    distinct_count = min(len(types_of_features), feature_num)
    chosen_types = random.sample(types_of_features, k=distinct_count)
    while len(chosen_types) < feature_num:
        chosen_types.append(random.choice(types_of_features))
    random.shuffle(chosen_types)

    # All Obj1 columns are generated first, then all Obj2 columns, so both the
    # column order and the order of random draws are stable.
    dataset = {}
    for prefix in ("Obj1", "Obj2"):
        for idx, ftype in enumerate(chosen_types, start=1):
            dataset[f"{prefix}_Feat{idx}"] = create_feature(ftype, sample_num)

    # Count, per row, how many features match between the two objects.
    equal_counts = np.zeros(sample_num, dtype=int)
    for idx in range(1, feature_num + 1):
        equal_counts += (
            dataset[f"Obj1_Feat{idx}"] == dataset[f"Obj2_Feat{idx}"]
        )

    # .tolist() turns the labels into plain Python strings.
    threshold = feature_num // 2
    dataset["Collision"] = np.where(
        equal_counts >= threshold, "Да", "Нет").tolist()

    return pd.DataFrame(dataset)


# Generate one dataset for every (row range, feature range) combination and
# keep each one together with its id and sizes for the optional save step.
all_datasets = []
set_id = 1

for rows_bounds, feats_bounds in product(rows_ranges, features_ranges):
    # Draw the actual sizes from the inclusive bounds of the current ranges.
    samples_count = random.randint(rows_bounds[0], rows_bounds[1])
    features_count = random.randint(feats_bounds[0], feats_bounds[1])

    dataset_df = create_sample_dataset(features_count, samples_count)
    record = (set_id, samples_count, features_count, dataset_df)
    all_datasets.append(record)

    # Show a short preview of the dataset that was just generated.
    header = f"\n--- Набор данных {set_id} | Строк: {samples_count} | Признаков: {features_count} ---"
    print(header)
    print(dataset_df.head(10))

    set_id = set_id + 1

# Ask the user whether the generated datasets should be written to disk.
save_answer = input(
    "\nСохранить все датасеты в CSV? (да/нет): ").strip().lower()

if save_answer in ['да', 'yes', 'y']:
    # Create the target directory first: to_csv() does not create missing
    # parent directories and would raise OSError on a fresh checkout.
    Path(save_folder).mkdir(parents=True, exist_ok=True)

    for ds_id, rows, feats, df in all_datasets:
        # The file name records the dataset id and its dimensions.
        path_to_save = f"{save_folder}/dataset_{ds_id}_rows{rows}_feats{feats}.csv"
        df.to_csv(path_to_save, index=False)
        print(f"Сохранено: {path_to_save}")
else:
    print("Сохранение отменено.")


--- Набор данных 1 | Строк: 68 | Признаков: 6 ---
  Obj1_Feat1  Obj1_Feat2  Obj1_Feat3  Obj1_Feat4  Obj1_Feat5  Obj1_Feat6  \
0          D   88.451874           1   39.619622   25.387480           4   
1          C   87.631081           0    8.461110   16.683996           4   
2          A   87.377974           1   64.616961    0.530923           5   
3          D   55.007622           1   98.944378   36.094769           3   
4          C   68.329991           0   19.277173   84.852764           3   
5          D   99.640327           1   82.979666   36.830818           5   
6          A   24.385606           1    1.339454    9.345762           5   
7          A   73.695094           1   25.338087   97.897575           5   
8          B   95.673870           0   99.987583   87.181161           1   
9          A   45.325060           1   18.321801   52.957104           3   

  Obj2_Feat1  Obj2_Feat2  Obj2_Feat3  Obj2_Feat4  Obj2_Feat5  Obj2_Feat6  \
0          A   27.194605           1

In [None]:
import numpy as np
import pandas as pd
import time
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Input directory with the generated CSV datasets and output directory for
# the pickled models.
datasets_dir = Path("datasets_storage")
output_dir = Path("learn_models")
output_dir.mkdir(parents=True, exist_ok=True)

# True when the user answers the prompt with one of the accepted "yes" words.
save_models = input(
    "\nСохранить модели в файлы? (да/нет): ").strip().lower() in ['да', 'yes', 'y']

# glob() yields files in arbitrary order; sort so runs are reproducible.
dataset_files = sorted(datasets_dir.glob("*.csv"))

if not dataset_files:
    # Make an empty input directory visible instead of finishing silently.
    print(f"CSV-файлы не найдены в каталоге: {datasets_dir}")

for dataset_file in dataset_files:
    print(f"\n--- Работаем с датасетом: {dataset_file.name} ---")

    df = pd.read_csv(dataset_file)
    X = df.drop(columns=["Collision"])
    y = df["Collision"]

    # Encode every non-numeric column as integer codes. Checking for
    # "not numeric" instead of "dtype == object" also covers pandas string
    # dtypes, which would otherwise reach the estimators unencoded.
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])

    # Turn the textual label into a binary target.
    y = y.map({"Да": 1, "Нет": 0})

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # The classifiers cannot be fitted on a single-class target.
    if len(np.unique(y_train)) < 2:
        print(f" Пропускаем {dataset_file.name}: только один класс.")
        continue

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Support Vector Machine": SVC()
    }

    training_times = {}

    print("Обучение моделей:")
    for name, model in models.items():
        # perf_counter() is monotonic and high resolution; time.time() can be
        # too coarse for fits that take only a few milliseconds.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        elapsed_time = time.perf_counter() - start_time
        training_times[name] = elapsed_time
        print(f"    {name} обучена за {elapsed_time:.4f} секунд.")

    # Keep the three models with the shortest training time.
    fastest_models = sorted(training_times.items(), key=lambda x: x[1])[:3]
    fastest_model_names = [name for name, _ in fastest_models]

    print("\nТри самые быстрые модели:")
    for name in fastest_model_names:
        print(f"  {name}")

    if save_models:
        for name in fastest_model_names:
            model = models[name]
            # File name: <dataset stem>_<model name in lower snake case>.pkl
            model_filename = output_dir / \
                f"{dataset_file.stem}_{name.replace(' ', '_').lower()}.pkl"
            with open(model_filename, 'wb') as f:
                pickle.dump(model, f)
            print(f"  Модель '{name}' сохранена как: {model_filename}")
    else:
        print("Сохранение моделей отключено.")


--- Работаем с датасетом: dataset_10_rows1492_feats7.csv ---
Обучение моделей:
    Logistic Regression обучена за 0.1131 секунд.
    Decision Tree обучена за 0.0180 секунд.
    K-Nearest Neighbors обучена за 0.0081 секунд.
    Support Vector Machine обучена за 0.0567 секунд.

Три самые быстрые модели:
  K-Nearest Neighbors
  Decision Tree
  Support Vector Machine
  Модель 'K-Nearest Neighbors' сохранена как: learn_models\dataset_10_rows1492_feats7_k-nearest_neighbors.pkl
  Модель 'Decision Tree' сохранена как: learn_models\dataset_10_rows1492_feats7_decision_tree.pkl
  Модель 'Support Vector Machine' сохранена как: learn_models\dataset_10_rows1492_feats7_support_vector_machine.pkl

--- Работаем с датасетом: dataset_11_rows1230_feats9.csv ---
Обучение моделей:
    Logistic Regression обучена за 0.1646 секунд.
    Decision Tree обучена за 0.0124 секунд.
    K-Nearest Neighbors обучена за 0.0020 секунд.
    Support Vector Machine обучена за 0.0677 секунд.

Три самые быстрые модели:
  K-N