In [None]:
import time
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.utils import shuffle

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def load_data_frame():
    file_path = Path("full_data.csv")

    if file_path.exists():
        df = pd.read_csv(file_path)
        print("Полная обучающая выборка загружена")
    else:
        print("Полная обучающая выборка не обнаружена")
        all_files = [f"/home/jupyter/datasphere/project/human_activity_recognition/csh{i}/csh{i}.ann.features.csv" for i in range(101, 131)]
        df = pd.concat([pd.read_csv(f) for f in all_files], axis=0)
        pd.DataFrame(full_data).to_csv("full_data.csv", index=False)
        print("Полная обучающая выборка загружена и сохранена")
    return df

In [3]:
def preprocess_data(df):
    encoder = LabelEncoder()
    encoder.fit(df['activity'])

    scaler = StandardScaler()
    scaler.fit(df.drop("activity", axis=1))
    
    X_full = scaler.transform(df.drop("activity", axis=1))
    y_full = encoder.transform(df['activity'])
    
    pca = PCA(random_state=42, n_components=20)
    pca.fit(X_full)
    X_full = pca.transform(X_full)
    
    X_train, X_test, y_train, y_test  = train_test_split(
    X_full, y_full, 
    test_size=0.1,
    random_state=42)
    print('Полная обучающая выборка подготовлена')
    return X_train, X_test, y_train, y_test

In [4]:
df = load_data_frame()

Полная обучающая выборка загружена


In [5]:
X_train, X_test, y_train, y_test = preprocess_data(df)

Полная обучающая выборка подготовлена


In [6]:
def research_fit_vs_size_time(sizes, X_train, X_test, y_train, y_test, results):
    for size in sizes:
        X_train_current, _ , y_train_current, _ = train_test_split(X_train, y_train, test_size=1-size, shuffle=True, random_state=42)

        start_time = time.time()
        model = RandomForestClassifier(random_state=42, n_estimators=120, max_depth=None, n_jobs=-1)
        model.fit(X_train_current, y_train_current)
        train_time = time.time() - start_time

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'train_size_mb': X_train_current.nbytes / 2**20,
            'train_time_sec': train_time,
            'accuracy': accuracy,
        })
        print(f"Train size: {size:.0%}, {X_train_current.nbytes / 2**20} Mb | Time: {train_time:.2f} sec | Accuracy: {accuracy}")

In [None]:
sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results = []

research_fit_vs_size_time(sizes, X_train, X_test, y_train, y_test, results)

pd.DataFrame(results).to_csv("time_vs_data_size.csv", index=False)

In [6]:
def research_fit_vs_cores_time(cores, X_train, X_test, y_train, y_test, results):
    X_train_current, _ , y_train_current, _ = train_test_split(X_train, y_train, test_size=0.5, shuffle=True, random_state=42)
    for core in cores:
        start_time = time.time()
        model = RandomForestClassifier(random_state=42, n_estimators=75, max_depth=None, n_jobs=core)
        model.fit(X_train_current, y_train_current)
        train_time = time.time() - start_time

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'train_size_core': core,
            'train_time_sec': train_time,
            'accuracy': accuracy,
        })

        print(f"cores: {core} | Time: {train_time:.2f} sec | Accuracy: {accuracy}")

In [None]:
cores = [i for i in range(6, 33, 2)]
results = []

research_fit_vs_cores_time(cores, X_train, X_test, y_train, y_test, results)

pd.DataFrame(results).to_csv("time_vs_cores.csv", index=False)

cores: 6 | Time: 1249.80 sec | Accuracy: 0.8300698739517475


In [None]:
def vizualization_time_vs_size():
    df = pd.read_csv("time_vs_data_size.csv")
    
    sns.set(style="whitegrid", palette="muted")
    plt.rcParams['font.size'] = 12
    plt.rcParams['figure.figsize'] = (14, 8)
    
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    
    sns.lineplot(data=df, x="train_size_mb", y="train_time_sec", 
                 ax=ax1, marker="o", color="darkorange", linewidth=2)
    ax1.set_title("Зависимость времени обучения от объема данных")
    ax1.set_ylabel("Время (сек)")
    ax1.grid(True, linestyle='--', alpha=0.7)
    
    metrics = ['accuracy']
    colors = ['dodgerblue']
    
    for metric, color in zip(metrics, colors):
        sns.lineplot(data=df, x="train_size_mb", y=metric, 
                     ax=ax2, marker="o", color=color, 
                     label=metric.upper(), linewidth=2)
    
    ax2.set_title("Зависимость метрик качества от объема данных")
    ax2.set_xlabel("Объем обучающих данных (Мб)")
    ax2.set_ylabel("Значение метрики")
    ax2.legend(loc="lower right")
    ax2.grid(True, linestyle='--', alpha=0.7)
    
    plt.savefig("time_vs_data_size_plot.png", dpi=300, bbox_inches='tight')
    plt.close()

In [9]:
def vizualization_time_vs_core():
    df = pd.read_csv("time_vs_cores.csv")
    
    sns.set(style="whitegrid", palette="muted")
    plt.rcParams['font.size'] = 12
    plt.rcParams['figure.figsize'] = (14, 8)
    
    fig, (ax1) = plt.subplots(1, 1, sharex=True)
    
    sns.lineplot(data=df, x="train_size_core", y="train_time_sec", 
                 ax=ax1, marker="o", color="darkorange", linewidth=2)
    ax1.set_title("Зависимость времени обучения от количества вычислителей")
    ax1.set_ylabel("Время (сек)")
    ax1.set_xlabel("Число вычислителей")
    ax1.grid(True, linestyle='--', alpha=0.7)

    plt.savefig("time_vs_cores_plot.png", dpi=300, bbox_inches='tight')
    plt.close()

In [10]:
vizualization_time_vs_size()
vizualization_time_vs_core()