In [38]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [39]:
# Read the feature table from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="feature_extracted.csv")

In [40]:
# Show the loaded frame; pandas abbreviates the rendering to the leading and
# trailing rows, so this is only a quick sanity check of the columns.
df

# NOTE(review): the index list below is undocumented -- presumably positions of
# features of interest; confirm what it refers to (and whether it is 0- or 1-based).
# 1 2 3 4 9 11 13 15

Unnamed: 0,Open,High,Low,Close,Volume,ETFs_index,target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
0,15.8500,16.0500,15.5500,15.6000,11405,78,0,0.451045,41.0,0.433488,94.0,0.435645,1232.0,0.410164,5057.0,0.421132,19917.0
1,25.7300,26.0900,25.7300,26.0900,464,14,0,0.761027,40.0,0.724536,94.0,0.730506,1232.0,0.685969,5056.0,0.704326,19916.0
2,13.6480,13.6720,13.4780,13.5500,34975,18,1,0.394723,40.0,0.379756,94.0,0.379407,1232.0,0.356258,5055.0,0.365804,19916.0
3,21.5090,21.5460,21.2640,21.5460,112463,85,1,0.629987,41.0,0.605272,95.0,0.603240,1233.0,0.566511,5055.0,0.581679,19916.0
4,22.3410,22.3480,22.1670,22.1670,770,46,0,0.656987,42.0,0.621724,95.0,0.620620,1234.0,0.582856,5055.0,0.598454,19916.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146211,25.2200,25.3200,25.1550,25.2900,4516551,89,1,0.456379,29.0,0.472916,66.0,0.487958,1188.0,0.503038,5305.0,0.535405,20609.0
146212,24.2700,24.3100,24.1168,24.3000,93531,6,0,0.447430,30.0,0.454890,66.0,0.468795,1188.0,0.483361,5305.0,0.514443,20609.0
146213,50.2329,50.3031,49.9600,50.3031,6267,12,0,0.928465,30.0,0.939920,65.0,0.970268,1187.0,1.000553,5304.0,1.065006,20608.0
146214,28.1900,28.2790,28.0300,28.2000,43206,19,1,0.520557,29.0,0.528568,65.0,0.544125,1186.0,0.561022,5303.0,0.597051,20607.0


In [41]:
# Separate the features from the "target" label, hold out 20% of the rows for
# testing (fixed seed), then standardize the features.
feature_frame = df.drop(columns="target")
label_series = df["target"]

xTrain, xTest, yTrain, yTest = train_test_split(
    feature_frame,
    label_series,
    test_size=0.2,
    random_state=42,
)

# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both splits.
scalar = StandardScaler().fit(xTrain)

xTrain = scalar.transform(xTrain)
xTest = scalar.transform(xTest)

In [42]:
# feature analysis
# Step 1: fit a PCA with all components to inspect how the variance is
# distributed across them.
pca = PCA()
pca.fit(xTrain)

cumulative_var = np.cumsum(pca.explained_variance_ratio_)

print(pca.explained_variance_ratio_)

# Step 2: smallest number of leading components whose cumulative explained
# variance reaches 95%. cumulative_var is non-decreasing (it is a running sum
# of non-negative ratios), so searchsorted with its default side="left"
# returns the first index i with cumulative_var[i] >= 0.95.
num_components = int(np.searchsorted(cumulative_var, 0.95)) + 1

print(num_components)

# Step 3: refit a PCA restricted to that many components. With an integer
# n_components the default "auto" solver may pick randomized SVD depending on
# the data shape and the scikit-learn version, so random_state is pinned (same
# seed as the train/test split) to keep the exported components reproducible.
pca_95_var = PCA(n_components=num_components, random_state=42)

# Fit on the training split only; the test split is projected with the same
# fitted components. (The "trainsformed" spelling is kept because the export
# cell refers to these names.)
xTrain_trainsformed = pca_95_var.fit_transform(xTrain)
xTest_trainsformed = pca_95_var.transform(xTest)

# One row per retained component, one column per input feature.
principal_components = pca_95_var.components_

print(principal_components)

# NOTE(review): undocumented index list -- presumably feature positions of
# interest read off the loadings above; confirm.
# 1 2 3 4 9 11 13 15

[5.41954569e-01 1.34069465e-01 8.87594538e-02 6.72041445e-02
 5.75790752e-02 4.87164246e-02 2.27230746e-02 1.83309780e-02
 1.24698874e-02 6.02715211e-03 1.30281793e-03 5.88510679e-04
 2.13027306e-04 3.70017941e-05 2.05085251e-05 3.91004796e-06]
7
[[ 3.36390153e-01  3.36240094e-01  3.36505052e-01  3.36420779e-01
   8.40905417e-03  2.32014564e-02  3.24764238e-01  5.78783953e-04
   3.30719466e-01  4.50544128e-04  3.33199757e-01  1.11816678e-03
   3.31023485e-01  4.31546619e-03  3.33579443e-01  4.38125650e-03]
 [-9.05849099e-04 -6.22451957e-04 -1.19557552e-03 -8.79811477e-04
   8.86410602e-03  6.54767448e-03  6.89709605e-03 -4.36182029e-01
   6.61277544e-03 -4.83440708e-01  4.57740077e-03 -4.92899475e-01
  -4.49937869e-04 -4.86466084e-01 -1.01237371e-03 -3.10149709e-01]
 [-5.33795895e-03 -5.34488777e-03 -5.30852122e-03 -5.34113859e-03
  -9.26275726e-03  9.48296698e-03  1.11962282e-02  5.42220081e-01
   1.06522560e-02  4.88459093e-01  8.44743339e-03 -2.26329502e-01
   4.49309903e-03 -4.4381

In [43]:
# generate output dataset
# to_csv does not create missing parent folders, so make sure the output
# directory exists; otherwise a fresh checkout fails on the first write.
output_dir = Path("pca")
output_dir.mkdir(parents=True, exist_ok=True)

# PCA-projected features for each split (columns are the component indices).
pd.DataFrame(xTrain_trainsformed).to_csv(output_dir / "xTrain.csv", index=False)
pd.DataFrame(xTest_trainsformed).to_csv(output_dir / "xTest.csv", index=False)

# Matching labels for each split; index=False drops the original row index so
# the files line up row-for-row with the feature files above.
pd.DataFrame(yTrain).to_csv(output_dir / "yTrain_discrete.csv", index=False)
pd.DataFrame(yTest).to_csv(output_dir / "yTest_discrete.csv", index=False)