## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

In [2]:
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from numpy import corrcoef

In [3]:
# Load the dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('./FODS-A2.csv')

In [4]:
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,21.2,33.29,19.823333,31.79,23.463333,38.23,20.5,31.73,19.2,39.363333,...,29.23,9.85,756.183333,41.833333,4.833333,40.0,-2.67,42.01718,42.01718,290
1,21.79,38.5,19.5,40.633333,22.5,37.9,21.0,37.9,20.033333,47.29,...,40.326667,6.9,754.0,75.0,4.0,40.0,2.8,24.62438,24.62438,50
2,22.39,41.39,20.2,43.79,24.5,39.333333,20.1,38.26,19.39,48.09,...,42.06,10.1,756.433333,68.0,5.833333,40.0,4.45,3.73126,3.73126,260
3,24.0,30.26,24.39,26.963333,23.39,33.4,22.79,31.2,21.033333,40.626667,...,35.5,19.1,760.0,31.0,4.0,40.0,1.5,1.058826,1.058826,50
4,20.05,38.245,17.6,41.0,21.1,37.2,19.89,36.4,18.2,43.56,...,38.863333,0.1,754.6,99.0,1.0,32.0,-0.1,39.248108,39.248108,30


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(7894, 27)

In [6]:
# NumPy copy of the whole frame: every column of `df`, in the frame's column order.
data = df.to_numpy()
# Column names, index-aligned with the columns of `data`.
f = list(df.columns)

## Functions

In [7]:
def predict(data,params):
    """Return linear-model predictions for every row of ``data``.

    Parameters
    ----------
    data : ndarray, 2-D
        Matrix whose last column is dropped before the product is taken;
        that last column is never read.
    params : ndarray
        Coefficients multiplied against the remaining columns with ``np.dot``.

    Returns
    -------
    ndarray
        ``np.dot(X, params)`` where ``X`` is ``data`` without its last column.
    """
    # Only the leading columns take part in the prediction.
    X = data[:,0:-1]
    return np.dot(X,params)

In [8]:
def norm_train(data):
    """Standardize the middle columns of a training matrix (z-score).

    Column 0 and the last column of ``data`` are passed through unchanged;
    every column in between is shifted by its mean and divided by its
    standard deviation (``np.std`` default, i.e. ``ddof=0``).

    Parameters
    ----------
    data : ndarray, 2-D
        Matrix laid out as ``[first column | columns to scale | last column]``.

    Returns
    -------
    norm_data : ndarray
        ``data`` with the middle columns standardized.
    xm : ndarray
        Per-column means of the middle columns.
    xs : ndarray
        Per-column scale actually used for the division. A column whose
        standard deviation is exactly 0 gets a scale of 1 so that it maps to
        zeros instead of NaN; reusing this ``xs`` with ``norm_test`` keeps
        train and test scaled identically.
    """
    X = data[:,1:-1]
    xm = np.mean(X,axis=0)
    xs = np.std(X,axis=0)
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN (0/0) and poison every later dot product.
    xs = np.where(xs == 0, 1.0, xs)
    norm_X = np.divide(X-xm,xs)
    norm_data = np.c_[data[:,[0]],norm_X,data[:,[-1]]]
    return norm_data,xm,xs

In [9]:
def norm_test(data,xm,xs):
    """Standardize the middle columns of ``data`` with precomputed statistics.

    Column 0 and the last column are passed through unchanged; the columns
    in between are shifted by ``xm`` and divided by ``xs``.

    Parameters
    ----------
    data : ndarray, 2-D
        Matrix laid out as ``[first column | columns to scale | last column]``.
    xm : array-like
        Per-column offsets subtracted from the middle columns.
    xs : array-like
        Per-column divisors. Entries equal to 0 are treated as 1 so that the
        result stays finite.

    Returns
    -------
    ndarray
        ``data`` with the middle columns rescaled.
    """
    X = data[:,1:-1]
    # Guard against zero divisors (e.g. statistics of a constant column),
    # which would otherwise produce NaN/inf entries.
    scale = np.asarray(xs, dtype=float)
    scale = np.where(scale == 0, 1.0, scale)
    norm_X = np.divide(X-xm,scale)
    norm_data = np.c_[data[:,[0]],norm_X,data[:,[-1]]]
    return norm_data

In [10]:
def Cost(data,params):
    """Half mean-squared error of a linear model on ``data``.

    The last column of ``data`` is the target and all preceding columns are
    the inputs. Returns ``sum((X @ params - Y) ** 2) / (2 * m)`` where ``m``
    is ``len(data)``.
    """
    n_rows = len(data)
    features, target = data[:,0:-1], data[:,[-1]]

    # Signed prediction error for every row.
    residual = np.dot(features,params) - target
    return np.sum(residual ** 2) / (2*n_rows)

In [11]:
def grad_desc(data,params,lr=0.01,epochs=1000):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    data : ndarray, 2-D
        Inputs in all but the last column, target in the last column.
    params : ndarray
        Starting weights; not modified in place (a new array is produced on
        every update).
    lr : float
        Learning rate.
    epochs : int
        Number of full-batch updates.

    Returns
    -------
    params : ndarray
        Weights after the final update.
    J : ndarray of shape (epochs,)
        Value of ``Cost(data, params)`` recorded after each update.
    """
    n_rows = len(data)
    features = data[:,0:-1]
    target = data[:,[-1]]
    step = lr/n_rows
    history = np.zeros(epochs)

    for epoch in range(epochs):
        # Gradient of the squared-error sum with respect to the weights.
        gradient = np.dot(features.T, (np.dot(features, params) - target))
        params = params - step*gradient
        history[epoch] = Cost(data, params)

    return params, history

In [12]:
def split(data, train_frac=0.8, seed=30):
    """Split the rows of ``data`` into a shuffled training part and an ordered test part.

    Parameters
    ----------
    data : ndarray, 2-D
        Matrix whose rows are partitioned.
    train_frac : float, optional
        Fraction of rows sampled for training; ``int(len(data) * train_frac)``
        rows are drawn. Defaults to 0.8.
    seed : int, optional
        Seed passed to ``random.seed`` before sampling. Defaults to 30.

    Returns
    -------
    data_train : ndarray
        Sampled rows, in the order ``random.sample`` returned them.
    data_test : ndarray
        All remaining rows, in their original order.

    Notes
    -----
    This reseeds Python's global ``random`` generator on every call, so
    repeated calls with the same arguments pick the same rows.
    """
    random.seed(seed)
    n_rows = len(data)
    all_idx = list(range(n_rows))
    n_train = int(n_rows*train_frac)
    train_idx = random.sample(all_idx,n_train)

    # Set membership keeps the complement computation linear instead of
    # scanning the sampled list once per row.
    chosen = set(train_idx)
    test_idx = [idx for idx in all_idx if idx not in chosen]

    data_train = data[train_idx,:]
    data_test = data[test_idx,:]
    return data_train,data_test

## 2A) PCA

In [13]:
def calc_pca(n,data):
    """Project the non-target columns of ``data`` onto ``n`` principal components.

    The PCA model is fitted on every column except the last and the same
    rows are then transformed; the untouched last column is appended to the
    right of the projected columns.

    Returns
    -------
    pca : PCA
        The fitted model.
    data_pca : ndarray
        Projected columns followed by the original last column.
    """
    features = data[:,0:-1]
    target = data[:,[-1]]

    pca = PCA(n_components=n).fit(features)
    projected = pca.transform(features)

    data_pca = np.hstack((projected,target))
    return pca,data_pca

In [14]:
# Collects one [n_components, train_error, test_error] row per PCA run.
pca_errorlist = []
# Collects the list of explained variances for each PCA run.
pca_eigenvals = []
# Column of ones, one per row of `data`, prepended to feature matrices as the intercept term.
c  = np.ones((len(data),1))

In [15]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every result.
pca_errorlist = []
pca_eigenvals = []

# Train one linear model per number of retained principal components.
# The upper bound is the number of non-target columns in `data`.
# NOTE(review): `data` still contains the CSV's "Unnamed: 0" index column as
# a feature, and PCA is fitted on all rows before the train/test split, so
# test rows influence the projection -- confirm both are intended.
for i in range(1, data.shape[1]):

    pca,data_pca = calc_pca(i,data)

    eigenvals = list(pca.explained_variance_[0:i])
    # Prepend the column of ones so the first weight acts as the intercept.
    data_pca = np.concatenate((c, data_pca), axis = 1)

    data_train,data_test = split(data_pca)

    params =  np.zeros((data_pca.shape[1] - 1, 1))

    # Scaling statistics come from the training rows only and are reused
    # unchanged for the test rows.
    data_train,data_train_mean,data_train_std = norm_train(data_train)

    print("data_train shape = ",data_train.shape," and params shape = ",params.shape," for i =  ",i,"\n")
    params,J_hist = grad_desc(data_train,params,0.01,1000)

    # Cost after the final gradient step.
    train_error = J_hist[-1]

    data_test = norm_test(data_test,data_train_mean,data_train_std)
    # Test-set predictions, kept as a module-level name for inspection.
    yhat = predict(data_test,params)

    test_error = Cost(data_test,params)

    pca_errorlist.append([i,train_error,test_error])
    pca_eigenvals.append(eigenvals)
    
    
    

data_train shape =  (6315, 3)  and params shape =  (2, 1)  for i =   1 

data_train shape =  (6315, 4)  and params shape =  (3, 1)  for i =   2 

data_train shape =  (6315, 5)  and params shape =  (4, 1)  for i =   3 

data_train shape =  (6315, 6)  and params shape =  (5, 1)  for i =   4 

data_train shape =  (6315, 7)  and params shape =  (6, 1)  for i =   5 

data_train shape =  (6315, 8)  and params shape =  (7, 1)  for i =   6 

data_train shape =  (6315, 9)  and params shape =  (8, 1)  for i =   7 

data_train shape =  (6315, 10)  and params shape =  (9, 1)  for i =   8 

data_train shape =  (6315, 11)  and params shape =  (10, 1)  for i =   9 

data_train shape =  (6315, 12)  and params shape =  (11, 1)  for i =   10 

data_train shape =  (6315, 13)  and params shape =  (12, 1)  for i =   11 

data_train shape =  (6315, 14)  and params shape =  (13, 1)  for i =   12 

data_train shape =  (6315, 15)  and params shape =  (14, 1)  for i =   13 

data_train shape =  (6315, 16)  and 

## Correlation coefficient

In [16]:
def correlation_coeff(data):
    """Rank every non-target column by its Pearson correlation with the last column.

    Parameters
    ----------
    data : ndarray, 2-D
        Matrix whose last column is the target; every other column is
        correlated against it.

    Returns
    -------
    coeff_sorted : list of [int, float]
        ``[column_number, abs(r)]`` pairs sorted by ``abs(r)`` in descending
        order. Column numbers are 1-based (column number ``i`` refers to
        ``data[:, i - 1]``).
    coeff_org : list of [int, float]
        ``[column_number, r]`` pairs with the signed coefficient, in column
        order.

    Both lists are empty when ``data`` has no non-target column.
    """
    target = data[:,-1]
    coeff_org = []
    coeff_abs = []
    # The column count is taken from the input instead of being fixed, so the
    # function works for any matrix width.
    for i in range(1,data.shape[1]):
        corr = corrcoef(data[:,i-1],target)[0][1]
        coeff_org.append([i,corr])
        coeff_abs.append([i,abs(corr)])

    # Sort once, after all coefficients are known.
    coeff_sorted = sorted(coeff_abs,key = lambda l:l[1],reverse=True)

    return  coeff_sorted,coeff_org

In [17]:
# coeffs: [1-based column number, |r|] pairs, strongest first;
# coeff_org: [1-based column number, signed r] pairs in column order.
coeffs,coeff_org = correlation_coeff(data)
# Show the five columns most strongly correlated with the target.
coeffs[0:5]

[[21, 0.15788804064111284],
 [11, 0.1157990544069468],
 [3, 0.10954237300483136],
 [19, 0.09890810764010086],
 [16, 0.09104028998178089]]

In [18]:
# Collects one [n_features, training_error, testing_error] row per run.
corr_errors = []
# Collects the list of selected column names for each run.
corr_f = []
# Column of ones, one per row of `data`, prepended to feature matrices as the intercept term.
const = np.ones((len(data),1))

In [21]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every result.
corr_errors = []
corr_f = []

# Train one linear model on the n columns most correlated with the target,
# for every n up to the number of ranked columns.
for n in range(1, len(coeffs) + 1):
    # `coeffs` stores 1-based column numbers; convert to 0-based indices.
    cols = [col_number - 1 for col_number, _ in coeffs[:n]]

    feat = [f[col] for col in cols]
    corr_f.append(feat)

    # Layout: [ones | selected columns | target].
    corr_data = data[:,cols]
    corr_data = np.concatenate((corr_data, data[:,[-1]]), axis=1)
    corr_data = np.concatenate((const, corr_data), axis = 1)

    data_train, data_test = split(corr_data)

    params = np.zeros((corr_data.shape[1] - 1, 1))

    # Scaling statistics come from the training rows only and are reused
    # unchanged for the test rows.
    data_train, train_mean, train_std = norm_train(data_train)
    print("data_train shape = ",data_train.shape," and params shape = ",params.shape," for i =  ",n,"\n")

    params, J_history = grad_desc(data_train, params, 0.01, 10**3)

    # Cost after the final gradient step.
    training_error = J_history[-1]

    data_test = norm_test(data_test, train_mean, train_std)
    # Test-set predictions, kept as a module-level name for inspection.
    yhat = predict(data_test, params)
    testing_error = Cost(data_test, params)

    corr_errors.append([n, training_error, testing_error])

data_train shape =  (6315, 3)  and params shape =  (2, 1)  for i =   1 

data_train shape =  (6315, 4)  and params shape =  (3, 1)  for i =   2 

data_train shape =  (6315, 5)  and params shape =  (4, 1)  for i =   3 

data_train shape =  (6315, 6)  and params shape =  (5, 1)  for i =   4 

data_train shape =  (6315, 7)  and params shape =  (6, 1)  for i =   5 

data_train shape =  (6315, 8)  and params shape =  (7, 1)  for i =   6 

data_train shape =  (6315, 9)  and params shape =  (8, 1)  for i =   7 

data_train shape =  (6315, 10)  and params shape =  (9, 1)  for i =   8 

data_train shape =  (6315, 11)  and params shape =  (10, 1)  for i =   9 

data_train shape =  (6315, 12)  and params shape =  (11, 1)  for i =   10 

data_train shape =  (6315, 13)  and params shape =  (12, 1)  for i =   11 

data_train shape =  (6315, 14)  and params shape =  (13, 1)  for i =   12 

data_train shape =  (6315, 15)  and params shape =  (14, 1)  for i =   13 

data_train shape =  (6315, 16)  and 