# Import Dataset and Libraries

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

In [19]:
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from numpy import corrcoef

In [20]:
# Load the dataset; the first CSV row supplies the column names.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, which looks
# like a saved row index being read back as a regular column -- confirm it is
# meant to be part of the data before using it as a feature.
fname = 'FODS-A2.csv'
df = pd.read_csv(
    fname,
    header=0,
    encoding='UTF8',
)
print(df.shape)
df.head()

(7894, 27)


Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,21.2,33.29,19.823333,31.79,23.463333,38.23,20.5,31.73,19.2,39.363333,...,29.23,9.85,756.183333,41.833333,4.833333,40.0,-2.67,42.01718,42.01718,290
1,21.79,38.5,19.5,40.633333,22.5,37.9,21.0,37.9,20.033333,47.29,...,40.326667,6.9,754.0,75.0,4.0,40.0,2.8,24.62438,24.62438,50
2,22.39,41.39,20.2,43.79,24.5,39.333333,20.1,38.26,19.39,48.09,...,42.06,10.1,756.433333,68.0,5.833333,40.0,4.45,3.73126,3.73126,260
3,24.0,30.26,24.39,26.963333,23.39,33.4,22.79,31.2,21.033333,40.626667,...,35.5,19.1,760.0,31.0,4.0,40.0,1.5,1.058826,1.058826,50
4,20.05,38.245,17.6,41.0,21.1,37.2,19.89,36.4,18.2,43.56,...,38.863333,0.1,754.6,99.0,1.0,32.0,-0.1,39.248108,39.248108,30


In [21]:
# Keep the column labels alongside the raw array so that column positions in
# `data` can be mapped back to names later on.
features = df.columns.tolist()
data = df.to_numpy()

# Linear Regression and Normalization

In [22]:
def predict(data, theta):
    """Return the linear-model predictions for every row of ``data``.

    Parameters
    ----------
    data : 2-D array
        Every column except the last is used as the design matrix; the last
        column is ignored here (the other helpers in this notebook treat it
        as the target).
    theta : array of shape (data.shape[1] - 1, 1)
        Model coefficients, one per design-matrix column.

    Returns
    -------
    array
        ``X @ theta``; shape (n_rows, 1) for a column-vector ``theta``.
    """
    # The target column is not needed to make a prediction, so it is no
    # longer extracted (it was previously computed and discarded).
    X = data[:, 0:-1]
    return np.dot(X, theta)

In [23]:
def train_norm(data):
    """Standardise the feature columns of a training set.

    The first column and the last column of ``data`` are passed through
    unchanged; every column in between is shifted to zero mean and scaled to
    unit (population) standard deviation.

    Parameters
    ----------
    data : 2-D array
        Layout ``[first column | feature columns ... | last column]``.

    Returns
    -------
    data_norm : 2-D array
        Same layout as ``data`` with the middle columns standardised.
    X_mean : 1-D array
        Per-feature mean that was subtracted.
    X_std : 1-D array
        Per-feature divisor that was applied. Equal to the standard deviation,
        except that a zero deviation is reported as 1 so that reusing this
        value on another set (see ``test_norm``) cannot divide by zero.
    """
    X = data[:, 1:-1]
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)

    # A constant column has std == 0; dividing by it would fill the column
    # with NaN/inf. Use 1 as divisor instead, which leaves the (already
    # all-zero) centred column at 0.
    X_std = np.where(X_std == 0, 1.0, X_std)

    X_norm = np.divide((X - X_mean), X_std)

    data_norm = np.c_[data[:, 0], X_norm, data[:, -1]]

    return data_norm, X_mean, X_std

In [24]:
def test_norm(data, X_mean, X_std):
    X = data[:,1:-1]
    X_norm = np.divide((X - X_mean), X_std)
    
    data_norm = np.c_[data[:,0], X_norm, data[:,-1]]
    return data_norm

In [25]:
def computeCost(data, theta):
    """Half mean squared error of the linear model ``theta`` on ``data``.

    All columns of ``data`` except the last form the design matrix; the last
    column is the target. The result is ``sum((X @ theta - y) ** 2) / (2 * m)``
    where ``m`` is the number of rows.
    """
    design = data[:, :-1]
    target = data[:, -1].reshape(-1, 1)
    residual = np.dot(design, theta) - target
    return np.sum(residual ** 2) / (2 * len(data))

In [26]:
def gradDes(data, theta, alpha, num_iters):
    """Fit ``theta`` by batch gradient descent on the squared-error cost.

    Parameters
    ----------
    data : 2-D array
        Design matrix in every column but the last; target in the last column.
    theta : array of shape (n_columns - 1, 1)
        Starting coefficients.
    alpha : float
        Learning rate.
    num_iters : int
        Number of update steps to run.

    Returns
    -------
    theta : array
        Coefficients after the final step.
    J_history : 1-D array of length ``num_iters``
        Value of ``computeCost`` measured right after each step.
    """
    n_rows = len(data)
    design = data[:, :-1]
    target = data[:, -1].reshape(-1, 1)
    step_size = alpha / n_rows
    cost_trace = np.zeros(num_iters)

    for step in range(num_iters):
        residual = np.dot(design, theta) - target
        gradient = np.dot(design.T, residual)
        theta = theta - step_size * gradient
        # Cost is recorded with the freshly updated coefficients.
        cost_trace[step] = computeCost(data, theta)

    return theta, cost_trace

In [27]:
def data_split(data):
    """Split the rows of ``data`` into an 80 % training and 20 % test set.

    The global ``random`` generator is re-seeded with 30 on every call, so
    the same row indices are selected each time for a given number of rows.

    Parameters
    ----------
    data : 2-D array

    Returns
    -------
    train_set : 2-D array
        ``int(0.8 * n_rows)`` rows, in the sampled (shuffled) order.
    test_set : 2-D array
        The remaining rows, in their original order.
    """
    random.seed(30)
    order = list(range(len(data)))
    shuffle = random.sample(order, int(len(order) * 0.8))

    # Membership is tested against a set: checking `i in shuffle` on the list
    # made this step quadratic in the number of rows.
    picked = set(shuffle)
    rem = [i for i in order if i not in picked]

    train_set = data[shuffle, :]
    test_set = data[rem, :]
    return train_set, test_set

# Principal Component Analysis

In [28]:
def PCA_calc(n, data):
    """Project the feature columns of ``data`` onto ``n`` principal components.

    Every column except the last is fed to PCA; the last column is appended
    unchanged after the projected columns.

    Returns
    -------
    pca_data : 2-D array
        ``n`` projected columns followed by the original last column.
    pca : PCA
        The fitted PCA object.
    """
    feature_block = data[:, :-1]
    last_col = data[:, -1].reshape(-1, 1)

    pca = PCA(n_components=n)
    pca.fit(feature_block)
    projected = pca.transform(feature_block)

    pca_data = np.concatenate((projected, last_col), axis=1)
    return pca_data, pca

In [29]:
# Accumulators filled by the PCA sweep, plus the column of ones that is
# prepended to every design matrix.
pca_errors, pca_eigenval_list = [], []
const = np.ones((data.shape[0], 1))

In [30]:
# Start from empty accumulators so that re-running this cell replaces the
# previous results instead of appending a second copy to them.
pca_errors = []
pca_eigenval_list = []

# One run per possible number of components. PCA_calc projects every column
# of `data` except the last, so the sweep goes up to data.shape[1] - 1
# (26 for the 27-column file loaded above).
for n in range(1, data.shape[1]):
    # NOTE(review): PCA is fitted on all rows here, i.e. before the
    # train/test split below, so the test rows influence the projection.
    # NOTE(review): column 0 of `data` is the "Unnamed: 0" column of the CSV
    # and is therefore part of the PCA input -- confirm this is intended.
    pca_data, pca = PCA_calc(n, data)
    eigenvalues = list(pca.explained_variance_[0:n])

    # Prepend the column of ones (`const`, defined in an earlier cell).
    pca_data = np.concatenate((const, pca_data), axis=1)

    train_set, test_set = data_split(pca_data)

    theta = np.zeros((pca_data.shape[1] - 1, 1))

    # Standardise with training statistics only, then reuse them on the test set.
    train_set, train_mean, train_std = train_norm(train_set)
    theta, J_history = gradDes(train_set, theta, 0.01, 10**3)

    training_error = J_history[-1]

    test_set = test_norm(test_set, train_mean, train_std)
    y_pred = predict(test_set, theta)
    testing_error = computeCost(test_set, theta)

    pca_errors.append([n, training_error, testing_error])
    pca_eigenval_list.append(eigenvalues)

In [31]:
# Show one [n_components, final training cost, test cost] row per PCA run.
pca_errors

# Pearson Correlation Coefficient

In [32]:
def calc_corr(data):
    """Correlate every column of ``data`` except the last with the last column.

    Parameters
    ----------
    data : 2-D array
        The last column is the target; all preceding columns are candidates.

    Returns
    -------
    corr_coeffs_sorted : list of [n, abs_corr]
        Absolute Pearson coefficients, largest first. ``n`` is the 1-based
        position of the column, i.e. it refers to ``data[:, n - 1]``.
    corr_coeffs_org : list of [n, corr]
        Signed coefficients in column order.
    """
    # Derive the column count from the array instead of assuming 26 columns,
    # so narrower inputs do not raise IndexError and wider ones are not
    # silently truncated.
    n_candidates = data.shape[1] - 1
    target = data[:, -1]

    corr_coeffs_org = []
    corr_coeffs_abs = []
    for n in range(1, n_candidates + 1):
        corr = corrcoef(data[:, n - 1], target)[0][1]
        corr_coeffs_org.append([n, corr])
        corr_coeffs_abs.append([n, np.absolute(corr)])

    # Sort once, after all coefficients are known.
    corr_coeffs_sorted = sorted(corr_coeffs_abs, key=lambda pair: pair[1], reverse=True)
    return corr_coeffs_sorted, corr_coeffs_org

In [33]:
# Rank the columns by absolute correlation with the last column and show the
# five strongest as [1-based column position, |r|].
corr_coeffs, corr_coeffs_org = calc_corr(data)
corr_coeffs[:5]

[[21, 0.1578880406411128],
 [11, 0.11579905440694678],
 [3, 0.10954237300483129],
 [19, 0.09890810764010083],
 [16, 0.09104028998178086]]

In [37]:
# Accumulators filled by the correlation-based sweep, plus the column of ones
# that is prepended to every design matrix.
corr_errors, corr_features = [], []
const = np.ones((data.shape[0], 1))

In [35]:
# Start from empty accumulators so that re-running this cell replaces the
# previous results instead of appending a second copy to them.
corr_errors = []
corr_features = []

# Grow the feature set one column at a time, strongest correlation first.
# The upper bound follows the ranking itself (26 entries for this dataset).
for n in range(1, len(corr_coeffs) + 1):
    # corr_coeffs stores 1-based column positions; convert to 0-based indices.
    columns = [ranked[0] - 1 for ranked in corr_coeffs[:n]]

    feat = [features[i] for i in columns]
    corr_features.append(feat)

    # Layout: [ones | selected columns | last column of data].
    corr_data = data[:, columns]
    corr_data = np.concatenate((corr_data, data[:, -1].reshape(-1, 1)), axis=1)
    corr_data = np.concatenate((const, corr_data), axis=1)

    train_set, test_set = data_split(corr_data)

    theta = np.zeros((corr_data.shape[1] - 1, 1))

    # Standardise with training statistics only, then reuse them on the test set.
    train_set, train_mean, train_std = train_norm(train_set)
    theta, J_history = gradDes(train_set, theta, 0.01, 10**3)

    training_error = J_history[-1]

    test_set = test_norm(test_set, train_mean, train_std)
    y_pred = predict(test_set, theta)
    testing_error = computeCost(test_set, theta)

    corr_errors.append([n, training_error, testing_error])

In [36]:
# Show one [number of selected columns, final training cost, test cost] row per run.
corr_errors