In [1]:
import xml.etree.ElementTree as et
import os
import pandas as pd
import numpy as np
from la02_ConvertXml import xmlInFolder, merge_xml, xmlInFolder
from bs4 import BeautifulSoup

## 資料整合

1. 讀取下載回來的 xml 檔，然後把數據（議員名稱、議題、投票）整合成一個 pandas data frame。
2. 整理議員的政黨。

In [2]:
# Collect the downloaded XML files under ./cm and merge them into one table.
# NOTE(review): xmlInFolder / merge_xml come from la02_ConvertXml and are not
# visible here — presumably they return a list of files and a pandas
# DataFrame respectively (``.head()`` is called on the result); confirm.
xmList = xmlInFolder('./cm')
legco_pd = merge_xml(xmList)
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
legco_pd.head()

There are 59 votes in 20190515 meeting


ValueError: Unable to coerce to Series, length must be 60: given 0

In [None]:
# Build a lookup frame from the saved party table: the index (column 0) holds
# the first text field of each table row and column 1 the last one, which the
# manual corrections below treat as member name -> party.
# The page contains Chinese text, so the encoding is stated explicitly rather
# than left to the platform default.
# NOTE(review): assumes party.html was saved as UTF-8 — confirm.
with open('./party.html', 'r', encoding='utf-8') as f:
    party = f.read()
party_soup = BeautifulSoup(party, 'html.parser')
tr = party_soup.find_all('tr')

tr_text = []
for row in tr:
    # Split the row text on blank lines; keep the last line of the first
    # chunk and the first line of the last chunk.
    chunks = row.text.strip().split('\n\n')
    first_field = chunks[0].split('\n')[-1]
    last_field = chunks[-1].split('\n')[0]
    tr_text.append([first_field, last_field])

# Drop the rows that are not members: the header row and the vacant-seat rows.
# list.remove raises ValueError if a row is missing, which flags a page whose
# layout differs from what this cell expects.
tr_text.remove(['備註', '席位'])
tr_text.remove(['懸空', ''])
for _ in range(3):
    tr_text.remove(['懸空', '懸空'])

party_pd = pd.DataFrame(tr_text).set_index(0)
# Manual corrections. A single .loc[row, column] call writes into the frame
# itself; the chained form .loc[row][1] = ... may assign to a temporary copy
# and leave party_pd unchanged.
party_pd.loc['吳永嘉', 1] = '經民聯'
party_pd.loc['邵家輝', 1] = '自由黨'
party_pd

先把建制和非建制政黨分開...

In [None]:
# Party names forming one side of the establishment / non-establishment split
# described in the preceding markdown cell.
# NOTE(review): presumably this is the pro-establishment side — confirm.
chi = [
    '民建聯',
    '工聯會',
    '經民聯',
    '自由黨',
    '新民黨',
    '實政圓桌',
    '新論壇',
    '勞聯',
]

### 準備分析用的工具模組

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.cluster import KMeans

因為文字不能直接放到 sklearn 分析，我們要先把投票狀態 yes, no, present, absent, abstain（還有 NaN）轉化成數字。

In [None]:
# Encode every vote as a number: Yes -> 1, No -> -1, and everything else
# (Present, Absent, Abstain, missing) -> 0.
voteMap = {'Yes': 1, 'Present': 0, 'Absent': 0, 'No': -1, np.nan: 0, 'Abstain': 0}


def _encode_vote(vote):
    """Return the numeric code for one vote cell.

    Missing values are detected with ``pd.isna`` instead of being looked up
    directly: NaN never compares equal to itself, so ``voteMap[vote]`` only
    succeeds when ``vote`` is the very same object as ``np.nan`` and raises
    KeyError for any other NaN instance.

    Raises:
        KeyError: if ``vote`` is a non-missing value absent from ``voteMap``.
    """
    if pd.isna(vote):
        return voteMap[np.nan]
    return voteMap[vote]


# Column-wise Series.map gives the same element-wise result as
# DataFrame.applymap without relying on that method, which is deprecated in
# recent pandas releases.
legco_pdMap = legco_pd.set_index('index').apply(lambda col: col.map(_encode_vote))
legco_pdMap.head()

以 sklearn 的 k-means clustering 來試著把議員們分類。先來看一下可以分多少個 cluster，

In [None]:
# Elbow plot: fit k-means for k = 1..14 and record the within-cluster sum of
# squares (inertia) for each k.
cluster_range = range(1, 15)
wss = []
for num_cluster in cluster_range:
    # n_init is pinned because its default changed between scikit-learn
    # releases; 10 restarts keeps results comparable across versions.
    kmeans = KMeans(n_clusters=num_cluster, n_init=10, random_state=0).fit(legco_pdMap)
    wss.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(cluster_range, wss, marker='o')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Within-cluster sum of squares')
ax.set_title('k-means elbow plot')
plt.show()

看來 4 - 8 個 clusters 會是比較合理的選擇...每個都試一下看看...

In [None]:
# Show every row when the grouped frames are printed below.
pd.set_option('display.max_rows', None)

# One row per entry of legco_pdMap's index, plus one column of cluster labels
# for each tested k (columns are named by the integer k itself).
df_clusters = pd.DataFrame(legco_pdMap.index)
for num_cluster in range(4, 8):
    # n_init is pinned because its default changed between scikit-learn
    # releases; 10 restarts keeps the labelling comparable across versions.
    kmeans = KMeans(n_clusters=num_cluster, n_init=10, random_state=0)
    # fit_predict is the documented shorthand for fit(X) followed by predict(X).
    clusters = kmeans.fit_predict(legco_pdMap)
    df_clusters[num_cluster] = clusters

# Print the rows that share the same label combination across k = 4..7.
for _group_key, members in df_clusters.groupby([4, 5, 6, 7]):
    print(members)

分成 4 類的話就是建制，非建制，DQ，補選...好像意義不大。分成 5 類的話，非建制派（而又不是 DQ 或是補上）分成兩組，再細分的話感覺意義不大。值得一提的是，建制派的向心力非常強，到 7 clusters 時才分成兩邊。

Alternative clustering method: the k-POD algorithm, to deal with the missing data,
i.e. we won't set np.nan to a fixed value; instead, missing entries are filled in from the centroids of the clusters.

接下來試一下用 Principal Component Analysis 來把議員們放在2維平面。

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Project the encoded votes onto their first two principal components.
pca = PCA(n_components=2)
# fit() hands back the fitted estimator, so legco_pca is the same object as pca.
legco_pca = pca.fit(legco_pdMap)
legco_pcaTrans = legco_pca.transform(legco_pdMap)
# Sanity check: the second dimension should be 2 (the requested components).
print(np.shape(legco_pcaTrans))

In [None]:
# Scatter the rows of the 2-component PCA projection.
# TODO: colour the points by cluster label — an earlier draft sketched this
# with one colour per cluster but referenced names that are not defined in
# this notebook.
fig, ax = plt.subplots()
ax.scatter(legco_pcaTrans[:, 0], legco_pcaTrans[:, 1])
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('PCA projection of voting records')
# plt.show has to be called; a bare ``plt.show`` only evaluates to the
# function object and shows its repr as the cell output.
plt.show()

In [None]:
# Min-max rescale each column before running a second PCA.
scale = MinMaxScaler()
legco_rescale = scale.fit_transform(legco_pdMap)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (70% here), so the number of columns
# in `reduced` is decided by the data.
pca_2 = PCA(n_components=0.70).fit(legco_rescale)
reduced = pca_2.transform(legco_rescale)
reduced.shape

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3-D scatter of the first three components of the rescaled PCA.
# `reduced` comes from a PCA with n_components=0.70, which keeps however many
# components reach 70% explained variance — possibly fewer than three — so
# check before indexing column 2.
assert reduced.shape[1] >= 3, (
    f"3-D plot needs at least 3 PCA components, got {reduced.shape[1]}"
)

fig = plt.figure()
# Create the 3-D axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure on current matplotlib, which leaves
# the plot empty. (The mpl_toolkits.mplot3d import in the preceding cell keeps
# the '3d' projection registered on older releases.)
ax = fig.add_subplot(projection='3d')

ax.scatter(reduced[:, 0], reduced[:, 1], reduced[:, 2])
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
plt.show()