### Re-defining TfL’s Strategic Neighbourhoods from the spatial and social perspectives
#### Code written by Xinlei Yan

# Notebook for principal component analysis of variables


## 1. Load packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## 2. Read in data

In [2]:
# Load the factor table; the path is relative to the notebook's working directory.
data = pd.read_csv("data/factors/factors.csv")

In [3]:
# Keep only the columns from position 10 onward (the 23 variables listed by
# info() below); the first 10 columns of the table are left out of the PCA.
data_sub = data.iloc[:, 10:]
data_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162152 entries, 0 to 162151
Data columns (total 23 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   d_retail    162152 non-null  float64
 1   d_transpor  162152 non-null  float64
 2   D_public    162152 non-null  float64
 3   D_attracti  162152 non-null  float64
 4   bedroom     162152 non-null  float64
 5   house_type  162152 non-null  float64
 6   house_age   162152 non-null  float64
 7   house_coun  162152 non-null  float64
 8   crime       162152 non-null  float64
 9   barriers    162152 non-null  float64
 10  education   162152 non-null  float64
 11  health      162152 non-null  float64
 12  employment  162152 non-null  float64
 13  income      162152 non-null  float64
 14  IMD         162152 non-null  float64
 15  BAME        162152 non-null  float64
 16  persons_he  162152 non-null  float64
 17  older       162152 non-null  float64
 18  working_ag  162152 non-null  float64
 19  ho

In [4]:
# Hand the values to scikit-learn as a single NumPy array rather than a nested
# Python list: the scaler converts its input to an array anyway, so building
# ~162k per-row lists first only costs time and memory.
# The name `data_list` is kept so the following cells run unchanged.
data_list = data_sub.to_numpy()

## 3. Z-score data standardization

In [5]:
# Z-score every variable (column-wise zero mean, unit variance) with scikit-learn.
# fit_transform learns the per-column statistics and applies them in one call;
# the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
trans_list = scaler.fit_transform(data_list)

In [6]:
# Preview the standardized values; NumPy abbreviates the large array with "...".
trans_list

array([[ 6.38318694,  9.56720421, 20.54863685, ...,  0.06348218,
         1.15061659, -0.7786892 ],
       [ 9.73627129,  6.44267938,  8.29870314, ...,  0.24390143,
         1.15456654, -0.7786892 ],
       [ 2.66691478,  3.08450291, 12.31138408, ..., -0.04280803,
         1.08022689,  1.44961273],
       ...,
       [11.54618504, 12.17227284, 13.41754478, ..., -0.44974105,
         2.1796841 , -0.7786892 ],
       [ 8.70938106, 10.58042179, 18.55903516, ..., -0.25661396,
         2.152776  , -0.7786892 ],
       [ 8.00607777, 10.78291009, 16.15595274, ..., -0.26135079,
         2.12802833, -0.7786892 ]])

## 4. Apply PCA

In [7]:
# Fit a PCA on the standardized matrix. n_components='mle' lets scikit-learn
# choose how many components to keep instead of using a fixed number.
# fit() returns the estimator itself, not transformed data, so the result is
# bound to `pca` only (no data-like alias); the projected data is computed
# in a later cell.
pca = PCA(n_components='mle').fit(trans_list)

Print out the proportion of the total variance explained by each principal component

In [8]:
# Share of the total variance captured by each retained component.
explained_ratio = pca.explained_variance_ratio_
print(explained_ratio)


[0.3672947  0.22117153 0.06312338 0.05770227 0.03721374 0.03632611
 0.03260527 0.02712179 0.02654243 0.02386373 0.0206917  0.01658567
 0.01295079 0.01205113 0.01112748 0.00997025 0.00767049 0.00490432
 0.00467446 0.00284469 0.00218758 0.00109877]


Print out the component weights, i.e. the contribution of each variable to each component (one row per component, one value per variable)

In [9]:
# Principal axes: one row per retained component, one coefficient per
# standardized input variable.
component_weights = pca.components_
print(component_weights)


[[ 1.57837560e-01  9.64690677e-02  1.14750267e-01  1.67095442e-01
   2.86428281e-01  2.69511419e-01  1.75791109e-02 -1.43814239e-01
  -2.23179720e-01 -2.42406030e-01 -1.22722234e-01 -2.67747879e-01
  -2.59196720e-01 -2.62095415e-01 -3.00403308e-01 -1.85227318e-01
  -2.77260831e-01  2.97472574e-01 -2.22280080e-01  1.18805368e-02
  -3.48751069e-03  2.58417434e-01  1.07567540e-01]
 [-1.23899959e-01 -7.65285383e-02 -1.22437691e-01 -1.70161557e-01
  -1.74964441e-01 -1.87635223e-01 -1.93299148e-01  3.15817051e-01
  -2.33541841e-01  1.18952174e-03 -2.98474048e-01 -1.95525623e-01
  -2.54349974e-01 -2.63450081e-01 -1.96983801e-01 -1.87542165e-01
   1.17096866e-01  8.09587480e-03  2.68964000e-01  2.99643334e-01
   3.35683213e-01 -1.90765185e-01 -1.30412469e-01]
 [ 4.50795652e-01  5.81578989e-01  4.51550534e-01  1.06945705e-01
  -8.97710824e-02 -7.26469201e-02 -1.01135712e-01  9.82832050e-02
   1.46031779e-02  1.88912515e-01 -6.59391689e-02  1.26376393e-02
   1.48934180e-03  4.56804941e-03  5.068

Fit the model and project the standardized data onto the principal components to get the component scores

In [10]:
# fit_transform fits the PCA on trans_list and returns the data projected onto
# the retained components (one row per observation, one column per component).
# NOTE(review): pca was already fitted on this same matrix in the earlier cell,
# so pca.transform(trans_list) would avoid the second fit — confirm that
# last-digit floating-point differences in the scores are acceptable before
# changing it.
newX = pca.fit_transform(trans_list)

In [11]:
# Preview the component scores; NumPy abbreviates the large array with "...".
newX

array([[ 9.00401508e+00, -5.52637142e+00,  1.78746751e+01, ...,
         8.11554172e-02,  2.66779164e-01,  1.58799921e-02],
       [ 8.45532826e+00, -5.08836580e+00,  1.20297664e+01, ...,
        -1.44832680e-02,  2.84883199e-01, -2.58953004e-02],
       [ 6.16612792e+00, -3.09888543e+00,  8.89175053e+00, ...,
         5.28516717e-02,  1.18702710e-01,  2.87739877e-02],
       ...,
       [ 9.88137071e+00, -7.38540234e+00,  1.80064810e+01, ...,
        -3.41196814e-01, -9.74376065e-02,  6.86791378e-02],
       [ 9.75354036e+00, -6.07672708e+00,  1.75012062e+01, ...,
        -7.83306741e-02, -1.83318713e-01,  1.38294061e-01],
       [ 9.30121192e+00, -5.67153351e+00,  1.61981492e+01, ...,
        -8.59793769e-02, -1.81668050e-01,  1.53506557e-01]])

In [12]:
# Persist the component scores as CSV in the working directory. The columns
# carry the default integer labels (0, 1, ...) and the row index is written
# as the first column of the file.
df = pd.DataFrame(data=newX)
df.to_csv(path_or_buf="PCA.csv")