# PCA Analysis

**Abstract:**
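Principal component analysis (PCA) of province-averaged water-use, socio-economic and climate indicators, used to find the dominant components and the features that load most strongly on them.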

**Description:**
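In the following cells, I load the merged dataset, average it by province, fit a PCA on the selected features, draw the elbow plot and the biplot, and highlight the loadings that contribute most to each principal component.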


In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd


from src import fit_pca

数据处理过程：

按省进行分组平均，丢掉有任何缺失值的省份

In [None]:
# Load the merged dataset, average every column within each province, and
# discard provinces that still contain at least one missing value.
merged_data = pd.read_csv(r"../data/processed/merged_data.csv", index_col=0)
merged_mean = (
    merged_data
    .groupby("Province")
    .mean()
    .dropna(how="any")
)
merged_mean.head()

## Features selection and fit

In [None]:
# Name of the target column; a later cell uses it to size the biplot markers.
Y_input = "Total water use"

# Columns that may be listed in X_inputs but are never passed to the PCA.
not_features = ["Province", "Year", "Total water use"]
# Candidate columns. Commented-out entries are kept as a record of columns
# that are currently left out of the analysis.
X_inputs = [
    "Province",
    "Year",
    #         'IRR',
    #         'Irrigated area: Total',
    "Irrigated area: Rice",
    "Irrigated area: Wheat",
    "Irrigated area: Maize",
    "Irrigated area: Vegetables and fruits",
    "Irrigated area: Others",
    #         'IND',
    #         'Industrial gross value added (GVA): Total',
    "Industrial gross value added (GVA): Textile",
    "Industrial gross value added (GVA): Papermaking",
    "Industrial gross value added (GVA): Petrochemicals",
    "Industrial gross value added (GVA): Metallurgy",
    "Industrial gross value added (GVA): Mining",
    "Industrial gross value added (GVA): Food",
    "Industrial gross value added (GVA): Cements",
    "Industrial gross value added (GVA): Machinery",
    "Industrial gross value added (GVA): Electronics",
    # NOTE(review): "electrivity" looks like a typo, but the string presumably
    # has to match the CSV header exactly -- rename the column before fixing it.
    "Industrial gross value added (GVA): Thermal electrivity",
    "Industrial gross value added (GVA): Others",
    #         'URB',
    "Urban population",
    "Service GVA",
    #         'RUR',
    "Rural population",
    "Livestock population",
    "Total water use",
    #         'area',
    #         'PIRR',
    #         'AIRR',
    "WCI",
    "Ratio of industrial water recycling",
    # #         'Ratio of industrial water evaporated',
    # #         'gdp',
    # #         'gdp-1',
    # #         'gdp-2',
    # #         'gdp-3',
    # #         'gdp-avg',
    "prec",
    "temp",
    #  'wind',
    # PC1,
    # PC2,
    # PC3,
    # PC4,
    # PC5
]
# Recorded result of an earlier run: 5 principal components, 89.63% of the
# variance explained.
# Keep the candidate columns that are not in the exclusion list (order preserved).
features = [f for f in X_inputs if f not in not_features]

# NOTE(review): n_components=0.85 presumably asks for enough components to
# explain 85% of the variance -- confirm against src.fit_pca.
model, results = fit_pca(merged_mean, features=features, n_components=0.85)
fig, ax = model.plot(figsize=(4, 3))
# Save the figure returned by model.plot() as the chapter-5 "elbow" image.
fig.savefig("../../PhD_Thesis/img/ch5/ch5_elbow.png", dpi=300)

我们进行了主成分分析，使用了24个特征。结果显示，前5个主成分能够解释89.63%的方差变化。其中，第一个主成分解释了方差变化的51.6%，第二个主成分解释了16.9%的方差变化。这表明，前两个主成分是最重要的，能够代表大部分原始特征的变异程度。

## Biplot of the Components

In [None]:
# Font settings handed to the biplot through its ``fontdict`` argument.
fontdict = {
    "weight": "normal",
    "size": 9,
    "ha": "center",
    "va": "center",
    "c": "black",
}
# Draw the biplot of the fitted model; ``fig`` and ``ax`` are rebound to it.
# Options that were tried and are currently disabled: SPE=True,
# alpha_transparency=0.6, and s=merged_mean[Y_input].values instead of s=0.
fig, ax = model.biplot(
    n_feat=8,
    figsize=(5, 4),
    s=0,
    jitter=0.01,
    hotellingt2=True,
    label=False,
    legend=False,
    fontdict=fontdict,
    title="",
)

In [None]:
from mksci_font import mksci_font
from pca.pca import _get_coordinates


@mksci_font(xlabel="主成分1", ylabel="主成分2")
def better_biplot(fig, ax):
    """Overlay a bubble scatter of the first two components on an existing plot.

    Reads the notebook globals ``model``, ``merged_mean`` and ``Y_input``.

    Args:
        fig: Figure passed through to ``pca.pca._get_coordinates``.
        ax: Axes passed through to ``pca.pca._get_coordinates``; the axes
            returned by that helper are the ones drawn on.

    Returns:
        The axes that received the scatter.
    """
    pc_scores = model.results["PC"]
    # NOTE(review): _get_coordinates is a private helper of the pca package;
    # presumably it returns the coordinates for components 0 and 1 plus the
    # axes to draw on -- confirm against the installed version.
    xs, ys, _unused_zs, ax = _get_coordinates(pc_scores, [0, 1], fig, ax, False)

    # Marker size is the target column scaled by a constant factor of 30.
    bubble_sizes = merged_mean[Y_input].values * 30
    ax.scatter(
        xs,
        ys,
        s=bubble_sizes,
        color="white",
        edgecolors="white",
        alpha=0.4,
    )
    ax.grid(False)
    return ax


# Add the bubble overlay to the biplot created in the previous cell.
better_biplot(fig, ax)

# Save into the figure folder of the PhD thesis.
fig.savefig(r"../../PhD_Thesis/img/ch5/ch5_biplot.png", dpi=300)

# NOTE(review): unfinished placeholder text; nothing in the visible code reads it.
description = """
这里的
"""
# Bare expression so the notebook displays the figure as the cell output.
fig

## Find Significant Features

In order to test the **significance of the PCA loadings**, we used a combination of three methods:
1) the bootstrapped eigenvector method [3];
2) the threshold method, in which loadings are significant when their absolute value and contribution are larger than a specific threshold that depends on the number of dimensions (ndim, i.e. the number of variables); and
3) a fixed threshold set according to Richman et al.

In practice, the loadings are significant, and considered as “high relevance”, if
1) the p-value from method 1 is below 0.01;
2) their contribution is above 1/ndim (i.e. above 8.3%, which corresponds to ndim = 12; for the 24 features used in this notebook the threshold is 1/24 ≈ 4.2%);
3) the absolute value of the loadings is above 0.34.


The results are summarized in Table S3

@migliavacca2021

In [None]:
from matplotlib import pyplot as plt


def sig_loadings(model, pc=1, method="contribution", color="c", threshold=0.3, ax=None):
    """Plot one principal component's loadings as horizontal bars.

    Bars of variables that pass the significance rule are drawn in ``color``;
    all other bars are drawn in light gray.

    Args:
        model: Object exposing ``model.results["loadings"]``, a DataFrame
            whose rows are labelled ``"PC1"``, ``"PC2"``, ... The columns are
            assumed to be the input variables (TODO confirm against the PCA
            library in use).
        pc: 1-based number of the component to draw.
        method: With ``"contribution"`` a variable is highlighted when its
            contribution (its share of the component's summed squared
            loadings) is at least ``1 / ndim``, where ``ndim`` is the number
            of variables; ``threshold`` is ignored. With any other value a
            variable is highlighted when its absolute loading is at least
            ``threshold``.
        color: A single color name used for every highlighted bar, or an
            iterable giving one color per variable.
        threshold: Cut-off on the absolute loading, used only when ``method``
            is not ``"contribution"``.
        ax: Axes to draw on. A new figure of size (2.5, 6) is created when
            omitted.

    Returns:
        The axes the bars were drawn on.

    Raises:
        TypeError: If ``color`` is neither a string nor an iterable.
    """
    loadings = model.results["loadings"]
    data = loadings.loc[f"PC{pc}"]

    # Validate the color argument before any figure is created.
    if isinstance(color, str):
        palette = [color] * len(data)
    elif hasattr(color, "__iter__"):
        # Materialize so generators and other non-indexable iterables work.
        palette = list(color)
    else:
        raise TypeError(
            f"color must be a string or an iterable of colors, got {type(color).__name__}"
        )

    if ax is None:
        _, ax = plt.subplots(figsize=(2.5, 6))

    if method == "contribution":
        # The 1/ndim rule applies to the contribution (share of squared
        # loadings), and ndim counts variables, not retained components.
        squared = data**2
        total = squared.sum()
        scores = squared / total if total else squared
        threshold = 1 / len(data)
    else:
        scores = data.abs()

    colors = [
        "lightgray" if score < threshold else palette[i]
        for i, score in enumerate(scores)
    ]
    ax.barh(width=data.values, y=data.index, color=colors)

    # Cosmetics: keep only the bottom spine and hide the variable tick labels.
    ax.spines[["top", "left", "right"]].set_visible(False)
    ax.set_yticks([])
    ax.set_xlabel(f"PC{pc}")
    return ax


# Preview with the defaults: loadings of PC1, contribution-based highlighting.
sig_loadings(model)

In [None]:
# Show which entries the fitted model's results mapping provides.
model.results.keys()

In [None]:
# One panel per retained component. The count is read from the loadings table
# (whose rows are labelled "PC1", "PC2", ...) instead of being hard-coded to 5,
# so the cell keeps working when the fit retains a different number of
# components. The figure stays 10 wide for five panels (2 per panel).
n_pcs = len(model.results["loadings"])
# squeeze=False always yields a 2-D array of axes, even for a single panel.
fig, axs = plt.subplots(1, n_pcs, figsize=(2 * n_pcs, 6), squeeze=False)
axs = axs[0]  # back to a 1-D array of axes, one per component

for i, ax in enumerate(axs):
    sig_loadings(model, i + 1, ax=ax)