# Drop Highly Correlated Features

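Feature selection is the process of removing unnecessary features. In this notebook, any feature whose absolute correlation with an earlier column exceeds 0.95 is treated as redundant and dropped.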

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# yfinance is used to fetch data
import yfinance as yf
# NOTE(review): presumably patches pandas_datareader to fetch through yfinance;
# pandas_datareader is not imported in the visible cells, so this call may be
# unnecessary — confirm before removing.
yf.pdr_override()

In [2]:
# Download parameters: ticker symbol and the requested date range
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Fetch the price history for the ticker through yfinance
dataset = yf.download(symbol, start=start, end=end)

# Preview the first rows to see which columns came back
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.95,3.95,3.98,3.84,3.85,20548400
2014-01-03,4.0,4.0,4.0,3.88,3.98,22887200
2014-01-06,4.13,4.13,4.18,3.99,4.01,42398300
2014-01-07,4.18,4.18,4.25,4.11,4.19,42932100
2014-01-08,4.18,4.18,4.26,4.14,4.23,30678700


In [3]:
# Next-day direction masks: True when the following row's value is higher
# than the current row's. The final row has no following value, so shift(-1)
# yields NaN there, the comparison is False and its flag ends up as 0.
next_volume_higher = dataset['Volume'].shift(-1) > dataset['Volume']
next_open_higher = dataset['Open'].shift(-1) > dataset['Open']
next_adj_close_higher = dataset['Adj Close'].shift(-1) > dataset['Adj Close']

# Encode each mask as a 1/0 integer column
dataset['Increase_Decrease'] = np.where(next_volume_higher, 1, 0)
dataset['Buy_Sell_on_Open'] = np.where(next_open_higher, 1, 0)
dataset['Buy_Sell'] = np.where(next_adj_close_higher, 1, 0)

# Day-over-day fractional change of the adjusted close; the first row has
# no previous value and is therefore NaN
dataset['Returns'] = dataset['Adj Close'].pct_change()

# Discard rows containing NaN (the first row, because of pct_change)
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-01-03,4.0,4.0,4.0,3.88,3.98,22887200,1,1,1,0.012658
2014-01-06,4.13,4.13,4.18,3.99,4.01,42398300,1,1,1,0.0325
2014-01-07,4.18,4.18,4.25,4.11,4.19,42932100,0,1,0,0.012106
2014-01-08,4.18,4.18,4.26,4.14,4.23,30678700,0,0,0,0.0
2014-01-09,4.09,4.09,4.23,4.05,4.2,30667600,0,0,1,-0.021531


In [4]:
# Absolute pairwise correlation between all columns: only the strength of the
# linear relationship matters for redundancy, not its sign.
corr_matrix = dataset.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# of columns is considered once and no column is compared with itself.
# The mask uses the builtin `bool` dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Absolute correlation above which a column is treated as redundant
CORRELATION_THRESHOLD = 0.95

# Flag a column when it is correlated above the threshold with any column that
# precedes it; masked-out (NaN) entries compare as False and are ignored.
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]

In [5]:
# Drop the highly correlated features by name. `columns=` takes the list of
# labels directly, instead of building the sub-DataFrame `dataset[to_drop]`
# and relying on its iteration yielding column names.
# `drop` returns a new frame and leaves `dataset` untouched, so the result is
# bound to its own name to keep it available to later cells.
dataset_reduced = dataset.drop(columns=to_drop)
dataset_reduced

Unnamed: 0_level_0,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-03,4.000000,22887200,1,1,1,0.012658
2014-01-06,4.130000,42398300,1,1,1,0.032500
2014-01-07,4.180000,42932100,0,1,0,0.012106
2014-01-08,4.180000,30678700,0,0,0,0.000000
2014-01-09,4.090000,30667600,0,0,1,-0.021531
2014-01-10,4.170000,20840800,1,1,0,0.019560
2014-01-13,4.130000,22856100,1,0,1,-0.009592
2014-01-14,4.300000,42434800,1,1,1,0.041162
2014-01-15,4.470000,66613100,0,1,0,0.039535
2014-01-16,4.380000,46975600,0,0,0,-0.020134
