In [1]:
# Use PCA (principal component analysis) to reduce the number of features.
# Load the 2017 data on junior high school students.
import pandas as pd
df = pd.read_csv('pref_2017.csv')
df.head(5)

Unnamed: 0,2017,f_rate,gaku,life,sports,morals,jison
0,Hokkaido,3.39,65.0,5.1,40.7,4.9,5.1
1,Aomori,2.99,66.3,5.25,42.28,5.3,5.4
2,Iwate,2.56,63.8,5.2,44.22,5.5,5.1
3,Miyagi,4.34,65.0,5.35,42.01,4.8,5.0
4,Akita,2.45,70.0,6.15,44.22,5.9,5.9


In [2]:
# Count the missing values in each column.
df.isna().sum()

2017      0
f_rate    0
gaku      0
life      0
sports    0
morals    0
jison     0
dtype: int64

In [3]:
# Standardize the data.
# The '2017' column holds the prefecture names (non-numeric), so it is
# dropped before scaling.
from sklearn.preprocessing import StandardScaler
df1 = df.drop(columns=['2017'])
sc = StandardScaler()
sc_df = sc.fit_transform(df1)

In [5]:
# To compute the explained variance ratios, first produce as many new
# feature columns (components) as the original data has.
from sklearn.decomposition import PCA
model = PCA(n_components=None)
# Fit on the standardized data and project it onto the new columns.
tmp = model.fit_transform(sc_df)
# Explained variance ratio of each component.
model.explained_variance_ratio_  # the cumulative ratio exceeds 0.8 at the third component

array([0.53529033, 0.17308258, 0.11934835, 0.10558211, 0.04392937,
       0.02276726])

In [6]:
# Run PCA again, this time keeping three components.
model = PCA(n_components=3)

# Fit on the standardized sc_df and project it onto the three new columns.
new = model.fit_transform(sc_df)


In [7]:
# Wrap the principal component scores in a DataFrame and put the prefecture
# column back.
# Column names are derived from the width of `new`, so they stay in sync with
# whatever n_components produced it (PC1..PC3 for the current three columns).
pc_names = [f'PC{i}' for i in range(1, new.shape[1] + 1)]
new_df = pd.DataFrame(new, columns=pc_names)
# Attach the labels by position: `new` keeps the row order of df, and going
# through to_numpy() avoids index alignment silently producing NaN should df
# ever carry a non-default index.
new_df['2017'] = df['2017'].to_numpy()
new_df.head()

Unnamed: 0,PC1,PC2,PC3,2017
0,-0.924456,0.850052,-0.379663,Hokkaido
1,1.255022,0.310658,-0.081111,Aomori
2,1.348875,-0.434147,1.956934,Iwate
3,-1.166385,2.120688,-1.262144,Miyagi
4,5.704351,-0.049635,-0.571783,Akita


In [8]:
# Place the standardized original features and the principal component
# scores side by side so that correlations between them can be computed.
df2 = pd.DataFrame(data=sc_df, columns=df1.columns)
df3 = pd.concat((df2, new_df), axis='columns')
df3.head(5)

Unnamed: 0,f_rate,gaku,life,sports,morals,jison,PC1,PC2,PC3,2017
0,0.596326,-0.367854,0.032233,-1.243218,-0.596623,0.116904,-0.924456,0.850052,-0.379663,Hokkaido
1,-0.37769,0.297113,0.486717,-0.122949,0.622563,1.147117,1.255022,0.310658,-0.081111,Aomori
2,-1.424758,-0.981668,0.335222,1.252571,1.232157,0.116904,1.348875,-0.434147,1.956934,Iwate
3,2.909614,-0.367854,0.789707,-0.314387,-0.90142,-0.226501,-1.166385,2.120688,-1.262144,Miyagi
4,-1.692612,2.189708,3.213622,1.252571,2.451343,2.864138,5.704351,-0.049635,-0.571783,Akita


In [9]:
# Correlation coefficients between the original features and the components.
# df3 still carries the string column '2017' (prefecture names). Calling
# corr() on it emits a FutureWarning on pandas 1.5 and raises on pandas >= 2.0,
# so restrict the frame to its numeric columns first.
df_corr = df3.select_dtypes(include='number').corr()
# Rows: original features (up to 'jison'); columns: PC1 onwards.
df_corr.loc[:'jison', 'PC1':]
# PC1: high physical fitness, norm awareness, lifestyle habits and self-esteem
# PC2: high school non-attendance rate
# PC3: low academic performance, and a low non-attendance rate as well

  df_corr = df3.corr()


Unnamed: 0,PC1,PC2,PC3
f_rate,-0.471174,0.673042,-0.398254
gaku,0.533246,-0.466249,-0.704171
life,0.853416,0.30482,-0.015647
sports,0.630981,-0.32859,0.222016
morals,0.883525,0.269652,0.104319
jison,0.893484,0.307444,-0.034756


In [9]:
# Export the principal component scores (with the prefecture column) to CSV.
new_df.to_csv(path_or_buf='PCA_2017.csv')