# 모듈 다운로드

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import seaborn as sns

In [None]:
!pip install factor_analyzer

In [None]:
from sklearn.decomposition import PCA
from factor_analyzer import Rotator

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
import matplotlib.pyplot as plt

# 데이터 불러오기

In [None]:
# Location of the input CSV on the macOS machine; this is the path the read call uses.
mac_path = "/Users/sunjaelee/Library/CloudStorage/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/final_data.csv"
# Location of the same CSV on the Windows machine, kept as an alternative.
window_path = "C:/Users/cbskust/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/final_data.csv"

In [None]:
# Load the source table from the macOS path (use window_path instead when running on Windows).
data = pd.read_csv(mac_path)

## 변수명 변경

In [None]:
# Shorten four verbose source column names; later cells refer to the short names.
column_aliases = {
    "EDUCATION_EFFECT": "SCHOOL_EFFECT",
    "BUILDING_PCA_SCORE": "BUILDING",
    "ECONOMY_PCA_SCORE": "ECONOMY",
    "POP_PCA_SCORE": "POP",
}
data.rename(columns=column_aliases, inplace=True)

In [None]:
# Show the column labels to confirm the rename took effect.
data.columns

## 주성분분석에 쓸 변수 선택

In [None]:
# Variables that go into the principal component analysis (14 in total).
selected_columns = [
    "OLD_IDX",
    "BIGROAD_EFFECT",
    "ROAD_EFFECT",
    "BUS_EFFECT",
    "ACADEMY",
    "HOSPITAL",
    "BANK",
    "MART_DEPARTMENT",
    "SECURITY",
    "HARMFUL_FACILITY_EFFECT",
    "SCHOOL_EFFECT",
    "POP",
    "ECONOMY",
    "BUILDING",
]

In [None]:
# Keep only the PCA input variables, in the order listed in selected_columns.
selected_data = data[selected_columns]

## 스케일링

In [None]:
# Scaler with default settings, used to standardize the PCA inputs.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the selected variables and transform them in one step.
data_scaled = scaler.fit_transform(selected_data)

# 주성분분석

In [None]:
# Run PCA.
# No n_components is given here; this fit is only used below to decide how many
# components to retain.
pca = PCA()
pca_result = pca.fit_transform(data_scaled)

## 주성분 개수 선정

In [None]:
# Show the variance explained by each fitted component.
pca.explained_variance_

In [None]:
# Share of variance explained by each component, and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# One figure, two panels: scree plot on the left, cumulative curve on the right.
scree_fig, (scree_ax, cumulative_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: scree plot.
scree_ticks = range(1, len(explained_variance_ratio) + 1)
scree_ax.plot(scree_ticks, explained_variance_ratio, marker='o', linestyle='--')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Variance Explained')
scree_ax.set_xticks(scree_ticks)
scree_ax.grid(True)

# Right panel: cumulative explained variance.
cumulative_ticks = range(1, len(cumulative_explained_variance) + 1)
cumulative_ax.plot(cumulative_ticks, cumulative_explained_variance,
                   marker='o', linestyle='--', color='r')
cumulative_ax.set_title('Cumulative Explained Variance')
cumulative_ax.set_xlabel('Number of Principal Components')
cumulative_ax.set_ylabel('Cumulative Variance Explained')
cumulative_ax.set_xticks(cumulative_ticks)
cumulative_ax.grid(True)

plt.tight_layout()
plt.show()

# Print the same numbers that the two panels display.
print("Explained Variance Ratio:", explained_variance_ratio)
print("Cumulative Explained Variance:", cumulative_explained_variance)

## 6개의 주성분으로 주성분분석(rotate 결과 포함)

In [None]:
# Refit PCA keeping six components, then varimax-rotate the component matrix.
n_components = 6
pc_labels = [f'PC{i+1}' for i in range(n_components)]

pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(data_scaled)

# One row per input variable, one column per component.
# NOTE(review): pca.components_ holds the principal axes, not variance-scaled
# loadings — confirm that rotating the raw axes is what is intended.
loadings = pca.components_.T
loading_matrix = pd.DataFrame(loadings, index=selected_columns, columns=pc_labels)

# Apply the varimax rotation and wrap the result with the same labels.
rotator = Rotator(method='varimax')
rotated_loadings = rotator.fit_transform(loading_matrix.to_numpy())
rotated_loading_matrix = pd.DataFrame(rotated_loadings, index=selected_columns, columns=pc_labels)

# Print both matrices for comparison.
print("Original Loadings:")
print(loading_matrix)
print("\nRotated Loadings (Varimax):")
print(rotated_loading_matrix)

In [None]:
# Show the row labels (variable names) of the rotated loading matrix.
rotated_loading_matrix.index

In [None]:
# Row order for the rotated-loadings table, written as one group of variables per
# line and flattened into a single list.
variable_groups = [
    ("HOSPITAL", "BANK", "ECONOMY"),
    ("OLD_IDX", "BUILDING"),
    ("ROAD_EFFECT", "BUS_EFFECT"),
    ("ACADEMY", "POP"),
    ("BIGROAD_EFFECT", "MART_DEPARTMENT", "HARMFUL_FACILITY_EFFECT"),
    ("SCHOOL_EFFECT", "SECURITY"),
]
new_order = [variable for group in variable_groups for variable in group]

In [None]:
# Reorder the rows of the rotated loadings to follow new_order.
my_loadings = rotated_loading_matrix.loc[new_order]

In [None]:
# Display the reordered rotated loadings.
my_loadings

## 주성분 loadings 시각화

In [None]:
# Heatmap of the unrotated loading matrix, annotated to two decimals and centred at 0.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    loading_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    annot_kws={"size": 12},
    ax=heatmap_ax,
)
heatmap_ax.set_title('PCA Loadings Heatmap')
heatmap_ax.set_xlabel('Principal Components')
heatmap_ax.set_ylabel('Features')

# Save before showing the figure.
heatmap_fig.savefig('pca_loadings_heatmap.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Heatmap of the varimax-rotated loadings (rows reordered via new_order).
plt.figure(figsize=(15, 15))
sns.heatmap(my_loadings, annot=True, fmt=".2f", cmap='coolwarm', center=0,
    annot_kws={"size": 12})
# The title says "rotated" so this figure is distinguishable from the unrotated heatmap.
plt.title('Varimax-Rotated PCA Loadings Heatmap')
plt.xlabel('Principal Components')
plt.ylabel('Features')
# Written to its own file: the unrotated heatmap already uses
# 'pca_loadings_heatmap.png', and reusing that name would overwrite it.
plt.savefig('pca_rotated_loadings_heatmap.png', dpi=300, bbox_inches='tight')

plt.show()

## 주성분점수 붙인 데이터 만들기

In [None]:
# Display the component scores returned by the six-component fit.
pca_result

In [None]:
# Convert the component scores to a DataFrame.
# The names given to PC1..PC6 further down are read off the varimax-rotated
# loadings, so the scores are projected onto the same rotated axes
# (scores @ rotation matrix). Using the unrotated scores would leave each named
# column describing a different axis than the loading table it was named from.
rotated_scores = pca_result @ rotator.rotation_
principal_df = pd.DataFrame(
    data=rotated_scores,
    columns=[f'PC{i+1}' for i in range(6)],
    index=data.index,  # keep rows aligned with data regardless of its index
)

# Append the component scores to the original DataFrame.
new_data = pd.concat([data, principal_df], axis=1)

In [None]:
# Display the original columns together with the appended component scores.
new_data

In [None]:
# Remove the PCA input variables and three additional count columns, in place.
columns_to_drop = [
    'OLD_IDX', 'BIGROAD_EFFECT', 'ROAD_EFFECT', 'BUS_EFFECT',
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT',
    'SECURITY', 'HARMFUL_FACILITY_EFFECT', 'SCHOOL_EFFECT',
    'POP', 'ECONOMY', 'BUILDING',
    'BIGROAD_N', 'ROAD_N', 'BUS_N',
]
new_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Show which columns remain after the drop.
new_data.columns

In [None]:
# Display the reduced table.
new_data

## 주성분 이름 지어주기

In [None]:
# Give each component column a descriptive name; position i in the list names PC(i+1).
component_names = [
    "MEDICAL_ECONOMY",
    "BUILDING_SILVER",
    "TRANS",
    "POP_ACADEMY",
    "INFRA",
    "SCHOOL_SECURITY",
]
new_data.rename(
    columns={f"PC{rank}": name for rank, name in enumerate(component_names, start=1)},
    inplace=True,
)

## 데이터 저장 

In [None]:
# Write the table with named component scores to CSV (no index column; UTF-8 with BOM).
new_data.to_csv("/Users/sunjaelee/Library/CloudStorage/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/pca_data.csv", index = False, encoding = 'utf-8-sig')

# 군집분석

In [None]:
from sklearn.cluster import KMeans

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Clustering uses only the six named component-score columns.
score_columns = [
    'MEDICAL_ECONOMY',
    'BUILDING_SILVER',
    'TRANS',
    'POP_ACADEMY',
    'INFRA',
    'SCHOOL_SECURITY',
]
clust_data = new_data[score_columns]

## 엘보우 방법을 통한 적절한 군집 개수 설정

In [None]:
# Fit K-means for k = 1..10 and record the inertia of each fit.
K = range(1, 11)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k, n_init = 10, random_state=715)
    inertia.append(kmeans.fit(clust_data).inertia_)

# Elbow plot: inertia against the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K, inertia, 'bo-')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Final row-level clustering with three clusters.
k = 3
# n_init is set explicitly to match every other KMeans call in this notebook;
# without it the number of restarts depends on the installed scikit-learn
# version's default, so this fit could differ from the elbow-search fits.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=715)
clusters = kmeans.fit_predict(clust_data)

# Add the cluster label of each row to the DataFrame.
new_data['CLUSTER'] = clusters

# Visualize the clusters on the first two component-score columns.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=new_data, x='MEDICAL_ECONOMY', y='BUILDING_SILVER', hue='CLUSTER', palette='viridis', s=50)
# Centre columns 0 and 1 correspond to MEDICAL_ECONOMY and BUILDING_SILVER,
# the first two columns of clust_data.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, c='red', marker='X')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('K-means Clustering on PCA-transformed Data')
plt.show()

## 법정동 -> 행정동 변환

In [None]:
# Display the table, now including the CLUSTER column.
new_data

In [None]:
# Paths to the KIKmix Excel code table on each machine.
# NOTE: this rebinds mac_path / window_path, which previously pointed at final_data.csv.
mac_path = "/Users/sunjaelee/Library/CloudStorage/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/KIKmix.20210401.xlsx"
window_path = "C:/Users/cbskust/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/KIKmix.20210401.xlsx"
# Load the code table from the macOS path.
change = pd.read_excel(mac_path)

In [None]:
# Discard every code-table row that has a missing value in any column.
change = change.dropna()

In [None]:
# Display the cleaned code table.
change

In [None]:
# Attach the code-table columns to each row by matching DONG_CODE against 법정동코드.
# how='left' keeps every new_data row; a code matching several table rows produces
# several output rows.
# NOTE(review): both key columns must share a dtype for the match to work — confirm.
merged_data = pd.merge(new_data, change, left_on='DONG_CODE', right_on='법정동코드', how='left')

In [None]:
# Remove the code-table columns that are not used afterwards, in place.
merged_data.drop(columns = ["시도명", "시군구명", "법정동코드", "동리명", "생성일자"], inplace = True)

In [None]:
# Display the merged table.
merged_data

In [None]:
# Build the full district name: fixed "서울특별시 " prefix + SIGUNGU_NAME + 읍면동명,
# separated by single spaces.
merged_data["FULL_DONG_NAME"] = "서울특별시 " + merged_data["SIGUNGU_NAME"] + " " + merged_data["읍면동명"]

In [None]:
# Display the merged table with the new FULL_DONG_NAME column.
merged_data

In [None]:
# Remove '제' only where it directly precedes a digit.
# A blanket removal of every '제' also strips the character from names that
# genuinely contain it (the notebook later has to patch 홍제… and 제기동 back by
# hand), so the pattern is restricted with a lookahead. regex=True is passed
# explicitly because the default of str.replace differs between pandas versions.
merged_data['FULL_DONG_NAME'] = merged_data['FULL_DONG_NAME'].str.replace(r'제(?=\d)', '', regex=True)

In [None]:
# Average the six component scores per district name; the name comes back as a
# regular column rather than as the index.
district_score_columns = [
    'MEDICAL_ECONOMY', 'BUILDING_SILVER', 'TRANS',
    'POP_ACADEMY', 'INFRA', 'SCHOOL_SECURITY',
]
district_means = merged_data.groupby('FULL_DONG_NAME')[district_score_columns].mean()
grouped_df = district_means.reset_index()

In [None]:
# Display the per-district mean scores.
grouped_df

In [None]:
# Search for a suitable number of clusters (elbow method and silhouette score).
K = range(2, 11)
inertia = []
silhouette_avg = []

# The same six score columns feed both the fit and the silhouette computation.
district_features = grouped_df[['MEDICAL_ECONOMY', 'BUILDING_SILVER', 'TRANS',
                                'POP_ACADEMY', 'INFRA', 'SCHOOL_SECURITY']]

for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=715)
    cluster_labels = kmeans.fit_predict(district_features)
    inertia.append(kmeans.inertia_)
    silhouette_avg.append(silhouette_score(district_features, cluster_labels))

# Elbow plot: inertia against the number of clusters.
district_elbow_fig, district_elbow_ax = plt.subplots(figsize=(10, 6))
district_elbow_ax.plot(K, inertia, 'bo-')
district_elbow_ax.set_xlabel('Number of clusters')
district_elbow_ax.set_ylabel('Inertia')
district_elbow_ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Silhouette score against the number of clusters, from the search loop above.
silhouette_fig, silhouette_ax = plt.subplots(figsize=(10, 6))
silhouette_ax.plot(K, silhouette_avg, 'bo-')
silhouette_ax.set_xlabel('Number of clusters')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Analysis For Optimal k')
plt.show()

In [None]:
# Final district-level clustering with four clusters.
k = 4
final_features = grouped_df[['MEDICAL_ECONOMY', 'BUILDING_SILVER', 'TRANS',
                             'POP_ACADEMY', 'INFRA', 'SCHOOL_SECURITY']]
kmeans = KMeans(n_clusters=k, n_init=10, random_state=715)
clusters = kmeans.fit_predict(final_features)

# Store each district's cluster label on the grouped DataFrame.
grouped_df['Cluster'] = clusters

# Scatter of the first two score columns, with the fitted centres as red crosses.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=grouped_df,
    x='MEDICAL_ECONOMY',
    y='BUILDING_SILVER',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=cluster_ax,
)
centers = kmeans.cluster_centers_
cluster_ax.scatter(centers[:, 0], centers[:, 1], s=300, c='red', marker='X')
cluster_ax.set_xlabel('PC1')
cluster_ax.set_ylabel('PC2')
cluster_ax.set_title('K-means Clustering of Administrative Districts')
cluster_ax.legend(title='Cluster')
plt.show()

# Show the first rows of the result.
grouped_df.head()

## 군집화 결과 시각화

In [None]:
import json 
import requests

In [None]:
# Paths to the Seoul district-boundary GeoJSON on each machine.
geo_path_mac = "/Users/sunjaelee/Library/CloudStorage/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/hangjeongdong_서울특별시.geojson"
geo_path_window = "C:/Users/cbskust/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/hangjeongdong_서울특별시.geojson"
# Read through a context manager so the file handle is closed once parsing is
# done, instead of staying open for the rest of the kernel session.
with open(geo_path_mac, encoding='utf-8') as geo_file:
    geo_data = json.load(geo_file)

In [None]:
# Display the parsed GeoJSON object.
geo_data

In [None]:
# Exact-match corrections applied to FULL_DONG_NAME: three 홍… names and 기동 get
# their '제' back, and '.' separators become '·'. Values not listed are left as they are.
district_name_fixes = {
    '서울특별시 서대문구 홍1동': '서울특별시 서대문구 홍제1동',
    '서울특별시 서대문구 홍2동': '서울특별시 서대문구 홍제2동',
    '서울특별시 서대문구 홍3동': '서울특별시 서대문구 홍제3동',
    '서울특별시 종로구 종로1.2.3.4가동': '서울특별시 종로구 종로1·2·3·4가동',
    '서울특별시 종로구 종로5.6가동': '서울특별시 종로구 종로5·6가동',
    '서울특별시 성동구 금호2.3가동': '서울특별시 성동구 금호2·3가동',
    '서울특별시 노원구 중계2.3동': '서울특별시 노원구 중계2·3동',
    '서울특별시 노원구 상계6.7동': '서울특별시 노원구 상계6·7동',
    '서울특별시 중랑구 면목3.8동': '서울특별시 중랑구 면목3·8동',
    '서울특별시 노원구 상계3.4동': '서울특별시 노원구 상계3·4동',
    '서울특별시 동대문구 기동': '서울특별시 동대문구 제기동',
}
grouped_df['FULL_DONG_NAME'] = grouped_df['FULL_DONG_NAME'].replace(district_name_fixes)

In [None]:
import plotly.express as px

In [None]:
# One sub-table per cluster label 0..3.
grouped_0, grouped_1, grouped_2, grouped_3 = (
    grouped_df[grouped_df["Cluster"] == label] for label in range(4)
)

In [None]:
# Display the district table with corrected names and cluster labels.
grouped_df

In [None]:
# Cluster highlighted on this map.
target_cluster = 0

# Two-valued colour key: the target label itself, or 'Other' for any other cluster.
grouped_df['Color'] = [
    target_cluster if label == target_cluster else 'Other'
    for label in grouped_df['Cluster']
]

# Colour assigned to each key.
color_discrete_map = {
    target_cluster: 'rgba(255, 0, 0, 0.8)',  # highlighted cluster
    'Other': 'rgba(169, 169, 169, 1)',  # gray with alpha 1
}

# Draw the map; districts are matched to GeoJSON features through properties.adm_nm.
map_options = {
    'geojson': geo_data,
    'locations': 'FULL_DONG_NAME',
    'color': 'Color',
    'color_discrete_map': color_discrete_map,
    'featureidkey': 'properties.adm_nm',
    'mapbox_style': 'carto-positron',
    'zoom': 10,
    'center': {"lat": 37.563383, "lon": 126.996039},
    'opacity': 0.5,
}
fig_0 = px.choropleth_mapbox(grouped_df, **map_options)

fig_0.show()

In [None]:
# Cluster highlighted on this map.
target_cluster = 1

# Two-valued colour key: the target label itself, or 'Other' for any other cluster.
grouped_df['Color'] = [
    target_cluster if label == target_cluster else 'Other'
    for label in grouped_df['Cluster']
]

# Colour assigned to each key.
color_discrete_map = {
    target_cluster: 'rgba(255, 0, 0, 0.8)',  # highlighted cluster
    'Other': 'rgba(169, 169, 169, 1)',  # gray with alpha 1
}

# Draw the map; districts are matched to GeoJSON features through properties.adm_nm.
map_options = {
    'geojson': geo_data,
    'locations': 'FULL_DONG_NAME',
    'color': 'Color',
    'color_discrete_map': color_discrete_map,
    'featureidkey': 'properties.adm_nm',
    'mapbox_style': 'carto-positron',
    'zoom': 10,
    'center': {"lat": 37.563383, "lon": 126.996039},
    'opacity': 0.5,
}
fig_1 = px.choropleth_mapbox(grouped_df, **map_options)

fig_1.show()

In [None]:
# Cluster highlighted on this map.
target_cluster = 2

# Two-valued colour key: the target label itself, or 'Other' for any other cluster.
grouped_df['Color'] = [
    target_cluster if label == target_cluster else 'Other'
    for label in grouped_df['Cluster']
]

# Colour assigned to each key.
color_discrete_map = {
    target_cluster: 'rgba(255, 0, 0, 0.8)',  # highlighted cluster
    'Other': 'rgba(169, 169, 169, 1)',  # gray with alpha 1
}

# Draw the map; districts are matched to GeoJSON features through properties.adm_nm.
map_options = {
    'geojson': geo_data,
    'locations': 'FULL_DONG_NAME',
    'color': 'Color',
    'color_discrete_map': color_discrete_map,
    'featureidkey': 'properties.adm_nm',
    'mapbox_style': 'carto-positron',
    'zoom': 10,
    'center': {"lat": 37.563383, "lon": 126.996039},
    'opacity': 0.5,
}
fig_2 = px.choropleth_mapbox(grouped_df, **map_options)

fig_2.show()

In [None]:
# Cluster highlighted on this map.
target_cluster = 3

# Two-valued colour key: the target label itself, or 'Other' for any other cluster.
grouped_df['Color'] = [
    target_cluster if label == target_cluster else 'Other'
    for label in grouped_df['Cluster']
]

# Colour assigned to each key.
color_discrete_map = {
    target_cluster: 'rgba(255, 0, 0, 0.8)',  # highlighted cluster
    'Other': 'rgba(169, 169, 169, 1)',  # gray with alpha 1
}

# Draw the map; districts are matched to GeoJSON features through properties.adm_nm.
map_options = {
    'geojson': geo_data,
    'locations': 'FULL_DONG_NAME',
    'color': 'Color',
    'color_discrete_map': color_discrete_map,
    'featureidkey': 'properties.adm_nm',
    'mapbox_style': 'carto-positron',
    'zoom': 10,
    'center': {"lat": 37.563383, "lon": 126.996039},
    'opacity': 0.5,
}
fig_3 = px.choropleth_mapbox(grouped_df, **map_options)

fig_3.show()

In [None]:
# Single map of all districts coloured by the Cluster column.
# NOTE(review): a continuous viridis scale is requested for what are cluster
# labels — consider a discrete colouring (e.g. string labels) if the gradient
# is not intended.
fig = px.choropleth_mapbox(grouped_df,
                           geojson=geo_data,
                           locations='FULL_DONG_NAME',
                           color='Cluster',
                           color_continuous_scale='viridis', featureidkey = 'properties.adm_nm',
                           mapbox_style='carto-positron',
                           zoom=10,
                           center = {"lat": 37.563383, "lon": 126.996039},
                           opacity=0.5
                          )

# Bare expression so the notebook renders the figure.
fig

In [None]:
# Set all four figure margins to zero.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

## 클러스터별 특징 파악

In [None]:
# Write the per-district table to CSV (no index column; UTF-8 with BOM).
grouped_df.to_csv("/Users/sunjaelee/Library/CloudStorage/Dropbox/교과/다차원자료분석PBL/기말고사/데이터/grouped.csv", index = False, encoding = 'utf-8-sig')