In [13]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
# Load the dataset stored in feature_extracted.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer="feature_extracted.csv")

In [15]:
# Show the loaded DataFrame as this cell's output (the bare expression below
# is what the notebook renders).
df

# 1 2 3 4 9 11 13 15
# NOTE(review): the numbers above are not explained anywhere in this cell;
# presumably they are positions of features of interest — TODO confirm.

Unnamed: 0,Open,High,Low,Close,Volume,ETFs_index,target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
0,39.240,39.2400,38.830,39.1500,3030,0,1,1.084074,37.0,1.106507,90.0,1.092566,1229.0,1.028655,5053.0,1.056832,19914.0
1,209.850,211.8000,206.800,207.5500,47691,26,1,5.390895,37.0,5.705599,90.0,5.781958,1230.0,5.450793,5054.0,5.602066,19914.0
2,96.467,96.5460,96.314,96.3620,12049,49,1,2.502607,38.0,2.620646,91.0,2.682057,1231.0,2.530414,5055.0,2.600853,19915.0
3,30.421,30.9500,30.166,30.9230,96770,36,1,0.799377,38.0,0.840343,92.0,0.860760,1232.0,0.812056,5055.0,0.834634,19915.0
4,9.516,9.6460,9.516,9.6460,504,71,0,0.249476,38.0,0.262139,93.0,0.268605,1233.0,0.253488,5055.0,0.260359,19915.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147981,67.190,67.1900,66.370,67.0700,1961179,8,1,1.287979,31.0,1.275114,66.0,1.301645,1188.0,1.342359,5297.0,1.428174,20548.0
147982,13.500,13.5500,13.475,13.5400,162075,71,0,0.262483,31.0,0.257756,66.0,0.262907,1189.0,0.271002,5297.0,0.288319,20549.0
147983,14.460,14.5868,14.460,14.5001,9162,12,1,0.281760,31.0,0.276321,65.0,0.281690,1188.0,0.290228,5296.0,0.308770,20549.0
147984,28.400,28.5899,28.200,28.4900,180378,99,1,0.554265,31.0,0.548329,66.0,0.553448,1189.0,0.570249,5297.0,0.606703,20550.0


In [16]:
# Hold out 20% of the rows for testing, using a fixed seed so the split is
# repeatable. The "target" column is the label; every other column is a feature.
xTrain, xTest, yTrain, yTest = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.2,
    random_state=42,
)

# Learn the standardization statistics from the training split only, then
# apply that same fitted scaler to both splits.
scalar = StandardScaler().fit(xTrain)
xTrain = scalar.transform(xTrain)
xTest = scalar.transform(xTest)

In [17]:
# Feature analysis: fit an unrestricted PCA on the scaled training data to see
# how the variance is spread across the components.
pca = PCA()
pca.fit(xTrain)

print(pca.explained_variance_ratio_)

cumulative_var = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of leading components whose cumulative explained variance
# reaches 0.95; falls back to 0 if the threshold is never reached.
num_components = next(
    (count for count, total in enumerate(cumulative_var, start=1) if total >= 0.95),
    0,
)

print(num_components)

# Fit a second PCA limited to that many components on the training data and
# project both splits with it.
pca_95_var = PCA(n_components=num_components)

xTrain_trainsformed = pca_95_var.fit_transform(xTrain)
xTest_trainsformed = pca_95_var.transform(xTest)

# One row of loadings per retained component.
principal_components = pca_95_var.components_

print(principal_components)

# 1 2 3 4 9 11 13 15
# NOTE(review): unexplained list carried over from the original cell;
# presumably feature positions of interest — TODO confirm.

[5.42122045e-01 1.34162340e-01 8.87949163e-02 6.72233246e-02
 5.75724071e-02 4.86953134e-02 2.27470556e-02 1.82426436e-02
 1.28139127e-02 5.49605665e-03 1.25056872e-03 6.06223788e-04
 2.10459630e-04 3.62514038e-05 2.20856788e-05 4.39662968e-06]
7
[[ 3.36255126e-01  3.36058688e-01  3.36417643e-01  3.36289720e-01
   8.52039524e-03  2.16034072e-02  3.24568033e-01  1.36046089e-03
   3.30872594e-01  6.57541044e-04  3.33580561e-01  1.64629876e-03
   3.31334280e-01  4.25812050e-03  3.33563644e-01  4.77809635e-03]
 [-9.08007374e-04 -6.17526317e-04 -1.18290887e-03 -8.36986173e-04
   1.20008872e-02  4.30304555e-03  7.32286399e-03 -4.35916625e-01
   7.36018295e-03 -4.82225773e-01  5.40020728e-03 -4.89903132e-01
   3.07053587e-04 -4.86552960e-01 -1.24640206e-03 -3.16846477e-01]
 [-5.24401023e-03 -5.25463789e-03 -5.23303647e-03 -5.28652173e-03
  -2.17059231e-02  9.78917421e-03  1.06300852e-02  5.44027708e-01
   1.07835357e-02  4.90810513e-01  8.71692748e-03 -2.27773252e-01
   4.37978822e-03 -4.4365

In [18]:
# Generate the output dataset: write the PCA-projected feature matrices and the
# matching label splits to CSV files under pca_data/.
#
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; without this the cell fails with OSError on a
# fresh checkout. exist_ok=True keeps the cell safe to re-run.
os.makedirs("pca_data", exist_ok=True)

# index=False: the projected arrays only carry a default integer index, so it
# is not written out.
pd.DataFrame(xTrain_trainsformed).to_csv("pca_data/xTrain.csv", index=False)
pd.DataFrame(xTest_trainsformed).to_csv("pca_data/xTest.csv", index=False)

# Labels are written without their index as well, so rows line up with the
# feature files by position only.
pd.DataFrame(yTrain).to_csv("pca_data/yTrain_discrete.csv", index=False)
pd.DataFrame(yTest).to_csv("pca_data/yTest_discrete.csv", index=False)