In [None]:
from ml.feature_engineering.factory import MLXFeaturesFactory

# Cache key handed to `load_derivative_features_cache`; named so it is easy to spot and swap.
DERIVATIVE_FEATURES_CACHE_ID = "a7856b39-2e54-4282-aead-96d738cbd0d4"

features_factory = MLXFeaturesFactory()
features = features_factory.load_derivative_features_cache(DERIVATIVE_FEATURES_CACHE_ID)

# Bare trailing expression so the notebook renders the loaded object.
features

In [None]:
# pca
from typing import Optional
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def show_pca(pca: PCA) -> None:
    """Draw a bar chart of the explained-variance ratio of each principal component.

    Args:
        pca: Fitted PCA model; its ``explained_variance_ratio_`` supplies the bar heights.
    """
    ratios = pca.explained_variance_ratio_
    # Components are numbered from 1 on the x axis.
    component_numbers = np.arange(1, len(ratios) + 1)

    _, ax = plt.subplots(figsize=(10, 8))
    ax.bar(component_numbers, ratios)
    ax.set(
        xlabel="Principal Component",
        ylabel="Explained Variance Ratio",
        title="Explained Variance Ratio of Principal Components",
    )
    plt.show()

def pca(feature: pd.DataFrame, explained_variance_ratio: float = 0.95) -> Optional[np.ndarray]:
    """Standardise ``feature`` and reduce it with the smallest sufficient PCA.

    Columns are standardised with ``StandardScaler``; PCA models with 1, 2, ...
    components are then fitted until the summed explained-variance ratio strictly
    exceeds ``explained_variance_ratio``. The accepted model is plotted with
    ``show_pca`` and progress is printed along the way.

    Args:
        feature: Feature columns to reduce; every column is used as PCA input.
        explained_variance_ratio: Threshold the summed ratio must strictly exceed.

    Returns:
        The input projected onto the retained components (one row per input row,
        one column per component).

    Raises:
        Exception: If no component count up to the maximum exceeds the threshold.
    """
    x_input = StandardScaler().fit_transform(feature.values)

    n_components = 0
    x_output: Optional[np.ndarray] = None
    print("input shape: ", x_input.shape)
    print("n_components max: ", len(feature.columns))

    # PCA accepts at most min(n_samples, n_features) components.
    max_components = min(x_input.shape)

    # The upper bound is inclusive: the full component count is a valid candidate
    # (and the only one for a single-column frame).
    for i in range(1, max_components + 1):
        # Named `model` so the function's own name `pca` is not shadowed.
        model = PCA(n_components=i)
        model.fit(x_input)

        total_ratio = sum(model.explained_variance_ratio_)
        if total_ratio > explained_variance_ratio:
            n_components = i
            # Project only with the accepted model; rejected fits need no transform.
            x_output = model.transform(x_input)
            print("total explained variance ratio of first {} principal components: {}".format(i, total_ratio))
            show_pca(model)
            break

    if x_output is None:
        raise Exception("pca failed")

    print("find n_components: ", n_components)
    print("output shape: ", x_output.shape)
    return x_output

In [None]:
# PCA input is every column except the target column "y".
pcadf = features.drop(columns="y")

# Standardise + reduce; rows of `pca_output` line up positionally with `features`.
pca_output = pca(pcadf)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# Positional (unshuffled) split: the first 70% of rows train, the remainder test.
num_train = round(0.7 * len(features))

data_train = features.iloc[:num_train]
data_test = features.iloc[num_train:]

# NOTE(review): `pca_output` was produced from the whole feature frame before this
# split, so the scaler and PCA were fitted on rows that end up in the test set.
# Looks like possible test-set leakage - confirm whether that is acceptable here.
x_train, x_test = pca_output[:num_train], pca_output[num_train:]
y_train, y_test = data_train["y"], data_test["y"]

# No additional scaling is applied to the PCA scores before fitting.
model = SVC(C=1, kernel="rbf").fit(x_train, y_train)

y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Mean accuracy on each split.
train_score = model.score(x_train, y_train)
print("train score", train_score)

test_score = model.score(x_test, y_test)
print("test score", test_score)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(
    y_test,
    y_test_pred,
    labels=[-1, 0, 1],
    target_names=["short", "no", "long"],
)
print("report", report)