#  Импорт библиотек

In [1]:
import numpy as np

import pandas as pd

import time

# mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns; sns.set(style='white')
%matplotlib inline

# vector / matrix norm (numpy.linalg.norm) — not a normal-distribution generator
from numpy.linalg import norm

# datasets
from sklearn import datasets
from sklearn.model_selection import train_test_split

# models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# metrics
from sklearn.metrics import accuracy_score, roc_auc_score

# dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Single seed shared by every estimator, transformer and train/test split in
# this notebook so that a full re-run reproduces the same numbers.
random_state = 42

## Загружаем датасет

In [2]:
# Load the scikit-learn digits dataset and separate the feature matrix
# from the target labels in one step.
df = datasets.load_digits()
X, y = df.data, df.target

## Определим модели и преобразователи

In [3]:
# Classifiers to compare, keyed by the display name used in the reports below.
# Every estimator receives the shared seed.
models = dict(
    SVC=SVC(random_state=random_state),
    LogisticRegression=LogisticRegression(max_iter=10000, random_state=random_state),
    DecisionTreeClassifier=DecisionTreeClassifier(random_state=random_state),
)

# Dimensionality-reduction algorithms to compare, keyed by display name.
# PCA is given a float n_components (0.90); TSNE is fixed to 2 output dimensions.
transformers = dict(
    PCA=PCA(n_components=0.90, random_state=random_state),
    TSNE=TSNE(n_components=2, random_state=random_state),
)

## Получим предсказания на необработанных данных

In [4]:
# Baseline: stratified 75/25 split of the raw features, then fit every model
# on the training part and store its predictions for the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=random_state,
    stratify=y,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
clear_predicts = {
    name: estimator.fit(X_train, y_train).predict(X_test)
    for name, estimator in models.items()
}
    

## Применим алгоритмы уменьшения размерности

In [5]:
# Timing results per transformer; the PCA entry is filled with separate
# "train" and "test" durations by the PCA cell.
preprocessing_time = {"PCA": {}}

# Predictions per transformer, then per model name. Each transformer gets its
# own independent (initially empty) dict.
transformers_predicts = {name: {} for name in ("PCA", "TSNE")}

## PCA

In [6]:
# PCA pipeline: the projection is fitted on the training split only and then
# applied unchanged to the held-out split; each stage is timed separately.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state, stratify=y)

pca = transformers["PCA"]

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock whose resolution can be too coarse for the
# sub-millisecond transform measured below.
start_time = time.perf_counter()
pca.fit(X_train)
X_train = pca.transform(X_train)
preprocessing_time["PCA"]["train"] = time.perf_counter() - start_time

start_time = time.perf_counter()
X_test = pca.transform(X_test)
preprocessing_time["PCA"]["test"] = time.perf_counter() - start_time

# Number of components PCA actually kept for the requested 0.90 threshold;
# reported in a later cell.
n_components = pca.n_components_

# Refit every model on the PCA-reduced training data and store its predictions
# for the PCA-reduced test data.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    transformers_predicts["PCA"][model_name] = model.predict(X_test)

## TSNE

In [7]:
# t-SNE pipeline.
# The embedding is stored under its own name (X_tsne) instead of overwriting
# X: overwriting made this cell non-idempotent (a second run would embed the
# already-embedded 2-D data) and silently changed the input of any cell
# re-run afterwards.
#
# NOTE: scikit-learn's TSNE only offers fit_transform (there is no separate
# transform for unseen samples), so the embedding is computed on ALL samples
# before the split. Unlike the PCA pipeline, the test samples therefore take
# part in fitting the transformer.
start_time = time.perf_counter()  # clock intended for interval measurement
X_tsne = transformers["TSNE"].fit_transform(X)
preprocessing_time["TSNE"] = time.perf_counter() - start_time

# Same split parameters as in the other pipelines, so y_test lines up with the
# baseline and PCA predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X_tsne, y, test_size=0.25, random_state=random_state, stratify=y
)

# Refit every model on the embedded training points and store its predictions
# for the embedded test points.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    transformers_predicts["TSNE"][model_name] = model.predict(X_test)

## Отобразим метрики для результатов без снижения размерности

In [8]:
# Accuracy of each model trained on the raw (non-reduced) features,
# compared against the y_test currently in scope.
for model_name in clear_predicts:
    baseline_accuracy = accuracy_score(y_test, clear_predicts[model_name])
    print(f"Model {model_name} score: ", baseline_accuracy, "\n")

Model SVC score:  0.9911111111111112 

Model LogisticRegression score:  0.9533333333333334 

Model DecisionTreeClassifier score:  0.8244444444444444 



## Отобразим метрики для результатов со снижением размерности

In [9]:
# Accuracy of each model for every dimensionality-reduction method, printed as
# one section per transformer and closed by a separator line.
separator = "-----------------------------------------"
for transformer_name in transformers_predicts:
    print(f"Transformer {transformer_name} results:")
    per_model = transformers_predicts[transformer_name]
    for model_name in per_model:
        reduced_accuracy = accuracy_score(y_test, per_model[model_name])
        print(f"Model {model_name} score: ", reduced_accuracy, "\n")
    print(separator)

Transformer PCA results:
Model SVC score:  0.9866666666666667 

Model LogisticRegression score:  0.9266666666666666 

Model DecisionTreeClassifier score:  0.86 

-----------------------------------------
Transformer TSNE results:
Model SVC score:  0.9555555555555556 

Model LogisticRegression score:  0.9066666666666666 

Model DecisionTreeClassifier score:  0.9666666666666667 

-----------------------------------------


## Сравнение времени работы алгоритмов TSNE и PCA

In [13]:
# Report the PCA timings recorded earlier: the "train" and "test" entries
# and their sum.
pca_timing = preprocessing_time["PCA"]
train_time, test_time = pca_timing["train"], pca_timing["test"]
print(
    "PCA preprocessing time:",
    f"Train: {train_time} sec",
    f"Test: {test_time} sec",
    f"Summary: {train_time + test_time} sec",
    sep="\n",
)

PCA preprocessing time:
Train: 0.004702091217041016 sec
Test: 0.00023412704467773438 sec
Summary: 0.00493621826171875 sec


In [14]:
# Report the single t-SNE timing recorded earlier (one fit_transform call).
train_time = preprocessing_time["TSNE"]
print("TSNE preprocessing time:", f"Train: {train_time} sec", sep="\n")

TSNE preprocessing time:
Train: 2.5306289196014404 sec


In [15]:
# Report how many principal components PCA kept (n_components is set in the
# PCA cell from the fitted transformer).
components_message = (
    "To retain 90 percent of the dispersion, "
    f"the pca algorithm needs {n_components} components"
)
print(components_message)

To retain 90 percent of the dispersion, the pca algorithm needs 21 components


## Результаты:
### Для сохранения 90 процентов дисперсии в алгоритме PCA необходима 21 компонента
### Время работы препроцессинга:
- PCA : 0.00493621826171875 sec
- TSNE : 2.5306289196014404 sec
### Изменение результатов 
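- Уменьшение размерности повысило точность дерева решений: 0.824 → 0.860 (PCA) и 0.967 (TSNE); лучший результат даёт TSNE
- Уменьшение размерности снизило точность логистической регрессии (0.953 → 0.927 с PCA, 0.907 с TSNE) и SVM (0.991 → 0.987 с PCA, 0.956 с TSNE)
- Оговорка: TSNE строит вложение сразу по всей выборке (включая тестовую часть) до разбиения, тогда как PCA обучается только на обучающей части, поэтому результаты TSNE не вполне сопоставимы с остальными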