In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Nine observations, one (x1, x2, y) triple per sample.
observations = [
    (0, 0, 0),
    (1, 0, -3),
    (0, 1, -1),
    (2, 1, -7),
    (1, 2, -5),
    (-1, 0, 3),
    (0, -1, 1),
    (-2, -1, 7),
    (-1, -2, 5),
]

# Prefix each triple with its 1-based label ("i = 1" ... "i = 9").
data = [[f'i = {k}', *triple] for k, triple in enumerate(observations, start=1)]

column_names = ['Index', 'x1,i', 'x2,i', 'yi']
df = pd.DataFrame(data, columns=column_names)
df

Unnamed: 0,Index,"x1,i","x2,i",yi
0,i = 1,0,0,0
1,i = 2,1,0,-3
2,i = 3,0,1,-1
3,i = 4,2,1,-7
4,i = 5,1,2,-5
5,i = 6,-1,0,3
6,i = 7,0,-1,1
7,i = 8,-2,-1,7
8,i = 9,-1,-2,5


In [3]:
# Keep just the two feature columns (the 'Index' label column is left out).
x = df.loc[:, ['x1,i', 'x2,i']]

# One NumPy array per feature column, plus the response vector.
v1, v2 = (x[name].to_numpy() for name in ('x1,i', 'x2,i'))
y = df.loc[:, 'yi'].to_numpy()

# Show each vector together with its type.
print(f'v1: {v1}, {type(v1)}')
print(f'v2: {v2}, {type(v2)}')
print(f'y: {y}, {type(y)}')

v1: [ 0  1  0  2  1 -1  0 -2 -1], <class 'numpy.ndarray'>
v2: [ 0  0  1  1  2  0 -1 -1 -2], <class 'numpy.ndarray'>
y: [ 0 -3 -1 -7 -5  3  1  7  5], <class 'numpy.ndarray'>


In [4]:
# Euclidean length of each vector. `ord` is left at its default, which for
# 1-D input is the 2-norm.
v1_norm, v2_norm, y_norm = (np.linalg.norm(vec) for vec in (v1, v2, y))

print(f'v1_norm: {v1_norm}, {type(v1_norm)}')
print(f'v2_norm: {v2_norm}, {type(v2_norm)}')
print(f'y_norm: {y_norm}, {type(y_norm)}')

v1_norm: 3.4641016151377544, <class 'numpy.float64'>
v2_norm: 3.4641016151377544, <class 'numpy.float64'>
y_norm: 12.96148139681572, <class 'numpy.float64'>


In [5]:
# Absolute value of the normalised inner product between each feature
# vector and y (inner product divided by the product of the two norms).
rho_1 = abs((v1 @ y) / (v1_norm * y_norm))
rho_2 = abs((v2 @ y) / (v2_norm * y_norm))

print(f'rho_1: {rho_1}, {type(rho_1)}')
print(f'rho_2: {rho_2}, {type(rho_2)}')

rho_1: 0.9799578870122228, <class 'numpy.float64'>
rho_2: 0.8017837257372732, <class 'numpy.float64'>


In [6]:
# Report which coefficient is larger; ties fall through to 'rho_2'.
print('rho_1' if rho_1 > rho_2 else 'rho_2')

rho_1


## Combining the above steps into a function

In [7]:
def featureSelection(x,y):
    """Pick the column of ``x`` most strongly aligned with the target ``y``.

    For every column ``v`` of ``x`` the score ``|v . y| / (||v|| * ||y||)`` is
    computed, i.e. the absolute value of the normalised inner product. No
    mean is subtracted, so the score coincides with the absolute Pearson
    correlation coefficient only when both ``v`` and ``y`` have zero mean.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate feature columns, one row per sample.
    y : array-like
        Target values, one per row of ``x``.

    Returns
    -------
    tuple
        ``(column_label, score)`` for the highest-scoring column. On ties the
        first such column is returned.

    Raises
    ------
    ValueError
        If ``x`` has no columns, or ``y`` does not have one value per row.
    """
    # Use the target that was passed in; the function must not reach for a
    # notebook-global frame, otherwise the `y` argument is silently ignored.
    target = np.asarray(y, dtype=float).ravel()

    n_features = len(x.columns)
    if n_features == 0:
        raise ValueError('featureSelection requires at least one feature column')
    if len(x) != target.shape[0]:
        raise ValueError(
            f'x has {len(x)} rows but y has {target.shape[0]} values'
        )

    y_norm = np.linalg.norm(target, ord = 2)

    pearson_coeffs = []
    for position in range(n_features):
        # Positional access keeps this working with duplicate column labels.
        vector = x.iloc[:, position].to_numpy(dtype=float)
        denominator = np.linalg.norm(vector, ord = 2) * y_norm
        if denominator == 0:
            # An all-zero column (or all-zero target) has no defined score;
            # rank it last instead of letting NaN poison the comparison.
            pearson_coeffs.append(np.float64(0.0))
        else:
            pearson_coeffs.append(abs(np.dot(vector, target) / denominator))

    # argmax returns the first occurrence of the maximum.
    max_pears_coeff_index = int(np.argmax(pearson_coeffs))

    return x.columns[max_pears_coeff_index],pearson_coeffs[max_pears_coeff_index]

In [8]:
# Feature columns and target vector taken from the sample frame.
x = df.loc[:, ['x1,i', 'x2,i']]
y = df.loc[:, 'yi'].to_numpy()

# Best-scoring column label and its coefficient, one per output line.
req_col, parson_coeff = featureSelection(x, y)
print(req_col, parson_coeff, sep='\n')

x1,i
0.9799578870122228
