1. Perform PCA to reduce dimention.

In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

df = pd.read_csv('combined_dataset.csv')
duplicate_columns = df.columns[df.columns.duplicated()]
print(duplicate_columns)
df.set_index(df.columns[0], inplace=True)

pca = PCA(n_components=20)  # Choose the number of components to retain

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
X_pca = pca.fit_transform(X_scaled)


explained_variance_ratio = pca.explained_variance_ratio_


feature_names = df.columns


component_df = pd.DataFrame({
    'Principal Component': range(1, 21),  # Top 20 components
    'Explained Variance (%)': explained_variance_ratio * 100,
    'Top Feature': [feature_names[i] for i in pca.components_.argmax(axis=1)]
})

# Sort the DataFrame 
component_df = component_df.sort_values(by='Explained Variance (%)', ascending=False)


print(component_df)


Index([], dtype='object')
    Principal Component  Explained Variance (%)              Top Feature
0                     1               12.966038  TMUS_1hr_Moving_Average
1                     2                9.591556              MSFT_Volume
2                     3                8.629144        NASDAQ_Volatility
3                     4                5.341366        NASDAQ_Log_Change
4                     5                3.340640     NFLX_High_Low_Spread
5                     6                2.500866          AMAT_Volatility
6                     7                2.231122   TXN_1hr_Moving_Average
7                     8                1.809474          ADBE_Volatility
8                     9                1.664596  QCOM_1hr_Moving_Average
9                    10                1.399536     MSFT_High_Low_Spread
10                   11                1.315777           HON_Volatility
11                   12                1.239295          NFLX_Volatility
12                   13  

2. Kernel PCA

In [2]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

df = pd.read_csv('combined_dataset.csv')

df.set_index(df.columns[0], inplace=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)


kernel_pca = KernelPCA(kernel='rbf', n_components=20)

X_kernel_pca = kernel_pca.fit_transform(X_scaled)



In [3]:
# Converting the transformed data into a DataFrame
X_kernel_pca_df = pd.DataFrame(X_kernel_pca, columns=[f'PC{i+1}' for i in range(1, 21)])

# Printing the transformed data
print(X_kernel_pca_df)


            PC2       PC3       PC4       PC5       PC6       PC7       PC8  \
0     -0.324059 -0.085298  0.054238 -0.138918 -0.050316 -0.029660 -0.051468   
1     -0.322553 -0.087503  0.108259 -0.051507 -0.046203 -0.007239 -0.004692   
2     -0.322075 -0.075787  0.031551 -0.172285 -0.042285 -0.014541 -0.017169   
3     -0.269825  0.094429  0.238493  0.150030 -0.039064  0.049505  0.035369   
4     -0.307715 -0.013144  0.251520  0.025153 -0.035242  0.043865  0.047053   
...         ...       ...       ...       ...       ...       ...       ...   
19092  0.328708 -0.001441 -0.098725 -0.040852  0.200911 -0.000274  0.030983   
19093 -0.096349  0.436710 -0.078957 -0.000852 -0.020590  0.091920  0.055087   
19094  0.223176  0.139104 -0.002326  0.083162  0.139886 -0.003650 -0.036680   
19095  0.097008  0.315731  0.078717  0.136064  0.051036 -0.002589 -0.122644   
19096 -0.088477  0.451630 -0.081098 -0.007008 -0.027562  0.101888  0.025923   

            PC9      PC10      PC11      PC12      