In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.validation")

In [2]:
df = pd.read_csv("DataSets/data.csv")
df.head()

Unnamed: 0,number_people,date,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
0,37,2015-08-14 17:00:11-07:00,61211,4,0,0,71.76,0,0,8,17
1,45,2015-08-14 17:20:14-07:00,62414,4,0,0,71.76,0,0,8,17
2,40,2015-08-14 17:30:15-07:00,63015,4,0,0,71.76,0,0,8,17
3,44,2015-08-14 17:40:16-07:00,63616,4,0,0,71.76,0,0,8,17
4,45,2015-08-14 17:50:17-07:00,64217,4,0,0,71.76,0,0,8,17


In [3]:
df.shape

(62184, 11)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62184 entries, 0 to 62183
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   number_people         62184 non-null  int64  
 1   date                  62184 non-null  object 
 2   timestamp             62184 non-null  int64  
 3   day_of_week           62184 non-null  int64  
 4   is_weekend            62184 non-null  int64  
 5   is_holiday            62184 non-null  int64  
 6   temperature           62184 non-null  float64
 7   is_start_of_semester  62184 non-null  int64  
 8   is_during_semester    62184 non-null  int64  
 9   month                 62184 non-null  int64  
 10  hour                  62184 non-null  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 5.2+ MB


In [5]:
df = df.drop(columns='date')

### 1) Applying Standard Scalar

In [6]:
scalar = StandardScaler()
df.iloc[:,1:] = scalar.fit_transform(df.iloc[:,1:])

### 2) Find Covariance Matrix

In [8]:
data_excluding_1stCol = df.iloc[:,1:]
cov_mat = np.cov(data_excluding_1stCol, rowvar=False)
cov_mat

array([[ 1.00001608e+00, -1.79321968e-03, -5.08815704e-04,
         2.85078360e-03,  1.84852463e-01,  9.55105884e-03,
         4.46766172e-02, -2.32214497e-02,  9.99093506e-01],
       [-1.79321968e-03,  1.00001608e+00,  7.91350923e-01,
        -7.58632581e-02,  1.11689106e-02, -1.17822146e-02,
        -4.82370614e-03,  1.55589363e-02, -1.91430511e-03],
       [-5.08815704e-04,  7.91350923e-01,  1.00001608e+00,
        -3.18993471e-02,  2.06736733e-02, -1.66460432e-02,
        -3.61277725e-02,  8.46248251e-03, -5.17297084e-04],
       [ 2.85078360e-03, -7.58632581e-02, -3.18993471e-02,
         1.00001608e+00, -8.85280154e-02, -1.48581472e-02,
        -7.07995743e-02, -9.49438154e-02,  2.84321058e-03],
       [ 1.84852463e-01,  1.11689106e-02,  2.06736733e-02,
        -8.85280154e-02,  1.00001608e+00,  9.32433629e-02,
         1.52478347e-01,  6.31255958e-02,  1.85123709e-01],
       [ 9.55105884e-03, -1.17822146e-02, -1.66460432e-02,
        -1.48581472e-02,  9.32433629e-02,  1.000016

### 3) Finding EV and EVs

In [11]:
eigen_values, eigen_vectors = np.linalg.eig(cov_mat)

In [12]:
eigen_values

array([2.07646693e+00, 9.21968609e-04, 1.80177983e+00, 2.06714659e-01,
       1.30805550e+00, 1.15910446e+00, 6.96730921e-01, 8.35014630e-01,
       9.15355845e-01])

In [13]:
eigen_vectors

array([[ 6.78375185e-01, -7.07077464e-01,  6.23492637e-03,
         4.64693511e-04,  1.50713904e-01,  8.89446773e-03,
         2.87665177e-02, -1.26641828e-01,  1.22482764e-02],
       [ 2.50474174e-03,  2.47614228e-04, -7.03172847e-01,
        -7.06950124e-01,  1.36082414e-03, -5.36403712e-02,
        -1.91735577e-02, -3.51437240e-02, -3.57950504e-02],
       [ 2.34122969e-03, -2.16714902e-04, -7.01042996e-01,
         7.05384710e-01,  3.33687865e-02, -6.99962109e-02,
         7.64938225e-03,  1.16871713e-02, -6.89750084e-02],
       [-2.43225805e-02, -8.95513198e-06,  9.93926778e-02,
        -3.88614592e-02,  3.47124899e-01, -3.63288427e-01,
         3.74775191e-02,  1.89808616e-01, -8.35531713e-01],
       [ 2.54328513e-01, -1.08616539e-04, -3.01477754e-02,
        -1.82769872e-02, -4.13390523e-01,  1.00041479e-01,
        -2.96856471e-02,  8.66836273e-01, -2.97938771e-02],
       [ 5.63548937e-02, -1.97977523e-04,  3.30320169e-02,
        -1.90259303e-03, -4.74515670e-01, -5.814093

### 4) Taking top 3 Principle Components

In [14]:
pc = eigen_vectors[0:3]
pc

array([[ 6.78375185e-01, -7.07077464e-01,  6.23492637e-03,
         4.64693511e-04,  1.50713904e-01,  8.89446773e-03,
         2.87665177e-02, -1.26641828e-01,  1.22482764e-02],
       [ 2.50474174e-03,  2.47614228e-04, -7.03172847e-01,
        -7.06950124e-01,  1.36082414e-03, -5.36403712e-02,
        -1.91735577e-02, -3.51437240e-02, -3.57950504e-02],
       [ 2.34122969e-03, -2.16714902e-04, -7.01042996e-01,
         7.05384710e-01,  3.33687865e-02, -6.99962109e-02,
         7.64938225e-03,  1.16871713e-02, -6.89750084e-02]])

In [15]:
transformed_df = np.dot(df.iloc[:,1:], pc.T)


In [16]:
new_df = pd.DataFrame(transformed_df,columns=['PC1','PC2', 'PC3'])

In [17]:
new_df['target'] = df['number_people'].values

### New DataFrame after Dimensionality Reduction

In [19]:
new_df.head()

Unnamed: 0,PC1,PC2,PC3,target
0,0.328007,0.493421,0.438396,37
1,0.361714,0.493545,0.438513,45
2,0.378553,0.493607,0.438571,40
3,0.395393,0.493669,0.438629,44
4,0.412232,0.493731,0.438687,45
