# PCA Analysis

**Abstract:**
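Fit a PCA model (85% explained variance) on province-mean features, register a VIF-based feature filter, and project the merged dataset onto the fitted principal components.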

**Description:**
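In the following cells, I load the merged dataset and compute province-level means, register the PCA-fit, VIF-filter, and PCA-transform methods using the feature list read from `model/87_all_wci.yaml`, pair each method with its dataset, and finally run the PCA fit and apply the fitted model to the merged data.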


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import sys
import os

# Add the parent directory to the module search path. This must run before the
# `from config import ...` line below (presumably `config` lives there — confirm).
sys.path.append("..")

# Render matplotlib figures inline in the notebook, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# plt.style.use('ggplot')
# Use a white axes background for every figure created in this notebook.
plt.rcParams['axes.facecolor'] = 'white'

# NOTE(review): SCHEME_87 is imported but not used in the cells visible here.
from config import SCHEME_87, ROOT
# Switch the working directory to ROOT so that relative paths used afterwards
# are resolved against it.
os.chdir(ROOT)

  return warn(


In [2]:
from core.datasets import Datasets

dataset = Datasets(unit_base='data', name='filter_features')

# Register the merged CSV as the `merged` item; its first column is used as the index.
dataset.add_item_from_file(
    rel_path="processed/merged_data.csv",
    name='merged',
    description='Merged: Zhou WU, GDP data, and Natural data',
    index_col=0,
)

# Province-level means of the merged data.
# `numeric_only=True` is explicit because pandas >= 2.0 no longer drops
# non-numeric columns silently in GroupBy.mean() and raises TypeError instead.
# `dropna(how='any')` then discards every province with at least one missing mean.
province_mean = (
    dataset.merged.obj
    .groupby('Province')
    .mean(numeric_only=True)
    .dropna(how='any')
)

# Register the aggregated frame as `province_mean`; save=True is passed so the
# item is persisted under the "processed" folder.
dataset.add_item_from_dataframe(
    data=province_mean,
    name='province_mean',
    category='assets',
    rel_path_folder="processed",
    save=True,
    description='Mean, dropna=any.'
)

dataset.report()

<core.datasets.DataItem at 0x7fac509dd640>

<core.datasets.DataItem at 0x7fac509ddeb0>

+---------------+--------------------------------+--------------------------------+----------+
|      Name     |          Description           |              Path              | Category |
+---------------+--------------------------------+--------------------------------+----------+
|     merged    | Merged: Zhou WU, GDP data, and |   data/processed/merged.csv    |  assets  |
|               |          Natural data          |                                |          |
| province_mean |       Mean, dropna=any.        | data/processed/province_mean.c |  assets  |
|               |                                |               sv               |          |
+---------------+--------------------------------+--------------------------------+----------+


In [3]:
from core.model import Method
from core.src.filter_features import fit_pca, filter_features_by_vif, transform_features
import yaml

# Resolve the relative YAML path below against the project root.
os.chdir(ROOT)
YAML_PATH = "model/87_all_wci.yaml"

# NOTE(review): yaml.load with FullLoader is kept to preserve behavior. If the
# file only contains plain mappings/lists/scalars, yaml.safe_load is the
# stricter choice — confirm before switching.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(YAML_PATH, "r", encoding="utf-8") as file:
    params = yaml.load(file, Loader=yaml.FullLoader)

# Identifier columns that are listed among the model inputs but are not features.
NON_FEATURE_COLUMNS = ("Province", "Year")

# Build a new list instead of calling .remove() on the list stored inside
# `params`: the loaded configuration stays untouched, and a missing identifier
# column no longer raises ValueError. A missing "parameters"/"X_inputs" key
# fails with a clear KeyError.
features = [
    column
    for column in params["parameters"]["X_inputs"]
    if column not in NON_FEATURE_COLUMNS
]
# Passed to fit_pca as `n_components`; the description below states it as 85%.
n_components = 0.85

# Register the three feature-processing functions together with the keyword
# parameters each one is given.
methods = Method(unit_base='model', name='filter_features')
methods.add_function_item(
    function=fit_pca,
    description="all features without GDP, n_components=85%, normalize=True.",
    parameters={
        'features': features,
        'n_components': n_components,
        'normalize': True
    }
)

methods.add_function_item(
    function=filter_features_by_vif,
    description="all features without GDP, threshold_VIF=10",
    parameters={
        'features': features,
        'threshold': 10,
        'normalize': True
    }
)

methods.add_function_item(
    function=transform_features,
    description="all features without GDP, normalize=False",
    parameters={
        'features': features,
        'normalize': False
    }
)

# Bare expression kept on purpose: the notebook displays the resolved path.
methods.path
methods.report()

'/Users/songshgeo/Documents/Pycharm/WAInstitution_YRB_2021/model'

+------------------------+--------------------------------+------------------------+--------------------------------+
|          Name          |          Description           |          Func          |             Params             |
+------------------------+--------------------------------+------------------------+--------------------------------+
|        fit_pca         |   all features without GDP,    |        fit_pca         |        (1) features.(2)        |
|                        |       n_components=85%,        |                        |  n_components.(3) normalize.   |
|                        |        normalize=True.         |                        |                                |
| filter_features_by_vif |   all features without GDP,    | filter_features_by_vif | (1) features.(2) threshold.(3) |
|                        |        threshold_VIF=10        |                        |           normalize.           |
|   transform_features   |   all features without GDP,  

In [4]:
from core.analysis import Analyst

analyst = Analyst(unit_base="content", name='filter_features')

# One entry per analysis item, in registration order. Each entry pairs a
# registered data item with the registered method that is applied to it.
analysis_specs = [
    {
        'name': 'pca_fit',
        'description': 'Fit a PCA model by mean province merged data.',
        'data_item': dataset.province_mean,
        'method_item': methods.fit_pca,
    },
    {
        'name': 'vif_filter',
        'description': 'Filter features VIF < 10 by mean province merged data',
        'data_item': dataset.province_mean,
        'method_item': methods.filter_features_by_vif,
    },
    {
        'name': 'pca_transform',
        'description': 'Transform dataset by a fitted PCA model.',
        'data_item': dataset.merged,
        'method_item': methods.transform_features,
    },
]

for spec in analysis_specs:
    analyst.add_analyst_item(**spec)

analyst.report()

+---------------+--------------------------------+---------------+------------------------+--------+
|      Name     |          Description           |      Data     |         Method         | Check? |
+---------------+--------------------------------+---------------+------------------------+--------+
|    pca_fit    |    Fit a PCA model by mean     | province_mean |        fit_pca         |  None  |
|               |     province merged data.      |               |                        |        |
|   vif_filter  |  Filter features VIF < 10 by   | province_mean | filter_features_by_vif |  None  |
|               |   mean province merged data    |               |                        |        |
| pca_transform | Transform dataset by a fitted  |     merged    |   transform_features   |  None  |
|               |           PCA model.           |               |                        |        |
+---------------+--------------------------------+---------------+------------------------+

In [5]:
# Run the PCA fit registered as `pca_fit`.
result = analyst.pca_fit.do_analysis()

# The first element of the fit result is handed to the transform step as the
# fitted model (presumably the fitted PCA object — confirm against fit_pca).
fitted_pca = result[0]
transformed_data = analyst.pca_transform.do_analysis(fitted_model=fitted_pca)

# Preview the first rows of the transformed data.
transformed_data.head()

[pca] >Normalizing input data per feature (zero mean and unit variance)..
[pca] >The PCA reduction is performed to capture [85.0%] explained variance using the [25] columns of the input data.
[pca] >Fit using PCA.
[pca] >Compute loadings and PCs.
[pca] >Compute explained variance.
[pca] >Number of components is [5] that covers the [85.00%] explained variance.
[pca] >The PCA reduction is performed on the [25] columns of the input dataframe.
[pca] >Fit using PCA.
[pca] >Compute loadings and PCs.
[pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[5]
[pca] >Outlier detection using SPE/DmodX with n_std=[2]
[pca] >Column labels are auto-completed.
[pca] >Row labels are auto-completed.
[pca] >Normalizing input data per feature (zero mean and unit variance)..


Unnamed: 0,Province,Year,IRR,Irrigated area: Total,IND,Industrial gross value added (GVA): Total,URB,RUR,area,PIRR,...,lrad,srad,wind,pres,shum,PC1,PC2,PC3,PC4,PC5
0,Anhui,1979,19.352828,2507.85708,1.307743,6.957575,0.412628,0.91149,140116.916198,622.801535,...,343.6105,164.30405,2.557015,99895.77,0.009364,-2.249835,0.932923,2.518116,2.461679,1.561266
1,Anhui,1980,18.7335,2425.5884,1.570926,8.414425,0.411466,0.804387,140116.916198,442.283199,...,342.25723,148.69427,2.520655,99892.71,0.009192,-2.188632,1.13066,2.647293,2.201552,1.512992
2,Anhui,1981,18.365685,2382.902444,1.428027,9.407631,0.453977,0.911853,140116.916198,514.166045,...,344.76782,154.12779,2.645633,99894.54,0.009203,-2.185968,0.919818,2.505952,2.164458,1.45557
3,Anhui,1982,17.734384,2301.342919,1.497168,10.697606,0.46891,0.917998,140116.916198,484.141343,...,348.07114,145.7305,2.613635,99894.89,0.009487,-2.18203,0.8838,2.457941,2.014865,1.403741
4,Anhui,1983,17.054947,2234.93566,1.564192,12.03689,0.488977,0.9497,140116.916198,515.032473,...,344.742,155.4465,2.580364,99895.39,0.009535,-2.137451,1.054929,2.552039,1.706661,1.446822
