In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Riil

In [3]:
# Fraction of total variance the retained principal components must explain.
VARIANCE_THRESHOLD = 0.95

# Load the data (semicolon-delimited CSV)
df = pd.read_csv('../data/technical_indicators_BBCA.csv', delimiter=';')

# Remove duplicate columns (the first occurrence of each column name is kept)
df = df.loc[:, ~df.columns.duplicated()]

# Separate features, target, and date
features = df.drop(columns=['Date', 'Close', 'Adj Close'])
target = df['Close']
date = df['Date']

# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Apply PCA with all components to obtain the full variance spectrum
pca = PCA()
pca.fit(features_scaled)

# Calculate cumulative explained variance
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Determine the number of components that explain VARIANCE_THRESHOLD of the
# variance: argmax returns the first index where the condition is True,
# and +1 converts that zero-based index into a component count.
optimal_n_components = np.argmax(explained_variance_ratio >= VARIANCE_THRESHOLD) + 1

# Fit PCA with the optimal number of components
pca_optimal = PCA(n_components=optimal_n_components)
pca_optimal_data = pca_optimal.fit_transform(features_scaled)

# Rank the features of every component by absolute loading, largest first.
# Row i of `loading_rank` holds column positions into `features.columns`.
loading_rank = np.argsort(-np.abs(pca_optimal.components_), axis=1)

# Determine the most contributing feature for each principal component
top_features = list(features.columns[loading_rank[:, 0]])

# Get the names of the top 5 contributing features for each component
top_5_features = np.array(features.columns)[loading_rank[:, :5]]

# Create standard PCA component names
pca_component_names = [f'PC{i+1}' for i in range(optimal_n_components)]

# Create a mapping of PCA components to their top contributing features
pca_feature_mapping = dict(zip(pca_component_names, top_features))

# Create DataFrames for unscaled and PCA-transformed data.
# Both frames are indexed by date while `target` keeps the default integer
# index from read_csv, so `frame['Close'] = target` would align on labels and
# fill the column with NaN. Assign the raw values positionally instead.
close_values = target.to_numpy()

unscaled_df = pd.DataFrame(scaler.inverse_transform(features_scaled), columns=features.columns, index=date)
unscaled_df['Close'] = close_values

pca_df = pd.DataFrame(pca_optimal_data, columns=pca_component_names, index=date)
pca_df['Close'] = close_values

# Save to CSV files
unscaled_df.to_csv('unscaled_data_BBCA.csv', index_label='Date')
pca_df.to_csv('pca_data_BBCA.csv', index_label='Date')

# Display results
print(f"Optimal number of components: {optimal_n_components}")
print("\nTop 5 contributing features for each component:")
# The loop variable must not be called `features`: that name is the feature
# DataFrame and rebinding it would break any later use of it.
for i, component_features in enumerate(top_5_features):
    print(f"PC{i+1}: {', '.join(component_features)}")
print(f"\nExplained variance ratio (cumulative): {explained_variance_ratio[:optimal_n_components]}")

# Scree plot
scree_plot = px.line(
    x=range(1, len(pca.explained_variance_ratio_) + 1),
    y=pca.explained_variance_ratio_,
    title="Scree Plot: Explained Variance per PCA Component",
    labels={"x": "Principal Component", "y": "Explained Variance Ratio"}
)

# Cumulative explained variance plot
cumulative_variance_plot = px.line(
    x=range(1, len(explained_variance_ratio) + 1),
    y=explained_variance_ratio,
    title="Cumulative Explained Variance",
    labels={"x": "Number of Components", "y": "Cumulative Explained Variance"}
)

# 2D Scatter plot (only possible when at least two components were retained)
scatter_2d = None
if optimal_n_components >= 2:
    scatter_2d = px.scatter(
        x=pca_optimal_data[:, 0],
        y=pca_optimal_data[:, 1],
        title="PCA 2D Scatter Plot (First Two Components)",
        labels={"x": pca_component_names[0], "y": pca_component_names[1]}
    )

# 3D Scatter plot (if applicable)
scatter_3d = None
if optimal_n_components >= 3:
    scatter_3d = px.scatter_3d(
        x=pca_optimal_data[:, 0],
        y=pca_optimal_data[:, 1],
        z=pca_optimal_data[:, 2],
        title="PCA 3D Scatter Plot (First Three Components)",
        labels={"x": pca_component_names[0], "y": pca_component_names[1], "z": pca_component_names[2]}
    )

# Show the plots (scatter plots are skipped when they could not be built)
scree_plot.show()
cumulative_variance_plot.show()
if scatter_2d is not None:
    scatter_2d.show()
if scatter_3d is not None:
    scatter_3d.show()

# Print PCA Component to Top Feature Mapping
print("\nPCA Component to Top Feature Mapping:")
for pc, feature in pca_feature_mapping.items():
    print(f"{pc}: {feature}")

Optimal number of components: 46

Top 5 contributing features for each component:
PC1: EMA_5, SMA_5, BBANDS_middle_5_2, WMA_5, TRIMA_5
PC2: RSI_14, CMO, RSI, RSI_28, slowk_14_3
PC3: MACD_signal_19_39, MACDEXT_signal, MACDFIX_signal, MACD_signal_12_26, TRIX
PC4: BOP, PATTERN_LONGLINE, PATTERN_CLOSINGMARUBOZU, STOCHF_k, STOCHRSI_k
PC5: ATR_14, ATR_28, TRANGE, PLUS_DM, NATR
PC6: PATTERN_DOJI, PATTERN_LONGLEGGEDDOJI, PATTERN_HIGHWAVE, PATTERN_RICKSHAWMAN, PATTERN_SPINNINGTOP
PC7: PATTERN_MARUBOZU, PATTERN_BELTHOLD, PATTERN_LONGLINE, BOP, PATTERN_CLOSINGMARUBOZU
PC8: PATTERN_TAKURI, PATTERN_DRAGONFLYDOJI, HT_SINE_sine, PATTERN_HANGINGMAN, HT_SINE_leadsine
PC9: PATTERN_TAKURI, PATTERN_DRAGONFLYDOJI, PATTERN_HANGINGMAN, PATTERN_HIGHWAVE, PATTERN_SPINNINGTOP
PC10: PATTERN_INNECK, PATTERN_ONNECK, PATTERN_HARAMI, PATTERN_HARAMICROSS, PATTERN_HOMINGPIGEON
PC11: PATTERN_HARAMI, DX, PATTERN_HARAMICROSS, ADX_14, PATTERN_HOMINGPIGEON
PC12: PATTERN_ONNECK, PATTERN_INNECK, PATTERN_HARAMI, PATTERN_COUNT


PCA Component to Top Feature Mapping:
PC1: EMA_5
PC2: RSI_14
PC3: MACD_signal_19_39
PC4: BOP
PC5: ATR_14
PC6: PATTERN_DOJI
PC7: PATTERN_MARUBOZU
PC8: PATTERN_TAKURI
PC9: PATTERN_TAKURI
PC10: PATTERN_INNECK
PC11: PATTERN_HARAMI
PC12: PATTERN_ONNECK
PC13: DX
PC14: PATTERN_MORNINGDOJISTAR
PC15: PATTERN_EVENINGSTAR
PC16: PATTERN_GRAVESTONEDOJI
PC17: PATTERN_MORNINGSTAR
PC18: HT_TRENDMODE
PC19: PATTERN_SHORTLINE
PC20: PATTERN_HAMMER
PC21: HT_PHASOR_quadrature
PC22: PATTERN_SEPARATINGLINES
PC23: PATTERN_GAPSIDESIDEWHITE
PC24: PATTERN_HIKKAKE
PC25: PATTERN_DARKCLOUDCOVER
PC26: PATTERN_IDENTICAL3CROWS
PC27: PATTERN_3OUTSIDE
PC28: PATTERN_3WHITESOLDIERS
PC29: PATTERN_3WHITESOLDIERS
PC30: PATTERN_DARKCLOUDCOVER
PC31: PATTERN_PIERCING
PC32: PATTERN_IDENTICAL3CROWS
PC33: PATTERN_SEPARATINGLINES
PC34: PATTERN_TRISTAR
PC35: PATTERN_COUNTERATTACK
PC36: PATTERN_3OUTSIDE
PC37: PATTERN_COUNTERATTACK
PC38: PATTERN_DOJISTAR
PC39: PATTERN_DOJISTAR
PC40: PATTERN_ENGULFING
PC41: PATTERN_HAMMER
PC42: PATTERN