모듈 다운로드

In [None]:
import pandas as pd   

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

In [None]:
import seaborn as sns

In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.decomposition import FactorAnalysis

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
!pip install plotly pandas

In [None]:
import plotly.express as px

In [None]:
!pip install kaleido

# 교수님 피드백

## 데이터 불러오기(이상치 제거된 데이터)

In [None]:
# Location of the outlier-filtered input CSV.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another computer.
path = "C:/Users/leese/Downloads/data_outlier_eliminated_csv.csv"

In [None]:
# Load the input CSV into the working DataFrame used by the analysis cells.
data = pd.read_csv(path)

In [None]:
# Display the column names available in the loaded data.
data.columns

## 헥사곤 플롯

In [None]:
# Hexbin density plot of PRICE against TOTAL_WORKER, saved as a PNG.
x, y = data["TOTAL_WORKER"], data["PRICE"]

hexes = plt.hexbin(x, y, gridsize=30, cmap='Blues')
plt.colorbar(hexes, label='count')  # colour encodes the number of rows per hexagon
plt.xlabel("Total Worker")
plt.ylabel("Price")
# Save before show() so the figure is written while it is still current.
plt.savefig('hexagon_totalworker_price.png', dpi=300)
plt.show()


In [None]:
# Hexbin density plot of TOTAL_POP against BUILDING, saved as a PNG.
x, y = data["BUILDING"], data["TOTAL_POP"]

hexes = plt.hexbin(x, y, gridsize=30, cmap='Blues')
plt.colorbar(hexes, label='count')  # colour encodes the number of rows per hexagon
plt.xlabel("Building")
plt.ylabel("Total_Pop")
# Save before show() so the figure is written while it is still current.
plt.savefig('hexagon_building_total_pop.png', dpi=300)
plt.show()

## 전체 상관행렬

In [None]:
# Columns included in the full correlation matrix, one thematic group per row.
selected_columns = [
    'PRICE',
    # population
    'TOTAL_POP', 'OLD_IDX', 'WORKABLE_POP', 'HOUSEHOLD_N',
    # economic activity
    'TOTAL_BUISNESS', 'TOTAL_WORKER',
    # buildings
    'BUILDING', 'HOUSE', 'OLD_BUILDING', 'OLD_HOUSE',
    'BUILDING_DENSITY', 'BUILDING_COMPLEXITY',
    # roads and buses
    'BIGROAD_N', 'BIGROAD_EFFECT', 'ROAD_N', 'ROAD_EFFECT', 'BUS_N', 'BUS_EFFECT',
    # schools and other facilities
    'ELEMENT_EFFECT', 'MIDDLE_EFFECT', 'HIGH_EFFECT',
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT', 'SECURITY', 'CONVENIENT',
    # harmful facilities
    'CREMATORIUM_EFFECT', 'COLUMBARIUM_EFFECT', 'WASTE_EFFECT',
]
selected_data = data[selected_columns]

# Compute the pairwise correlation matrix.
corr_matrix = selected_data.corr()

# Draw the annotated heatmap on a fixed [-1, 1] colour scale and save it.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
ax.set_title('Correlation Matrix of Selected Variables')
plt.savefig('heatmap.png', dpi=300)
plt.show()

# 변수 개수 줄이기

## 유해시설 변수 = 봉안시설, 화장시설, 폐기물시설 평균

In [None]:
# Working copy that will receive the derived/composite columns.
# It is taken from the already-loaded `data` instead of re-reading the CSV:
# this avoids a second disk read and guarantees that `new_data` has exactly
# the same rows, in the same order, as `data` -- the PCA scores computed from
# `data` further down are assigned to `new_data` by position.
new_data = data.copy()

In [None]:
# Harmful-facility score: simple average of the crematorium, columbarium and
# waste-facility effect columns.
harmful_total = (
    new_data["CREMATORIUM_EFFECT"]
    + new_data["COLUMBARIUM_EFFECT"]
    + new_data["WASTE_EFFECT"]
)
new_data["HARMFUL_FACILITY_EFFECT"] = harmful_total / 3

In [None]:
# Interactive histogram of the combined harmful-facility score.
fig = px.histogram(
    new_data,
    x='HARMFUL_FACILITY_EFFECT',
    nbins=500,
    title='Histogram',
)
fig.show()

## 학교 변수 = 초등학교, 중학교, 고등학교 평균

In [None]:
# School score: simple average of the elementary, middle and high school
# effect columns.
education_total = (
    new_data["ELEMENT_EFFECT"]
    + new_data["MIDDLE_EFFECT"]
    + new_data["HIGH_EFFECT"]
)
new_data["EDUCATION_EFFECT"] = education_total / 3

In [None]:
# Interactive histogram of the combined school score.
fig = px.histogram(
    new_data,
    x='EDUCATION_EFFECT',
    nbins=500,
    title='Histogram',
)
fig.show()

# 주성분분석

In [None]:
# Display the column names again before choosing the PCA inputs.
data.columns

In [None]:
# Input columns for the principal component analysis (PRICE is not included).
selected_columns = [
    # population
    'TOTAL_POP', 'OLD_IDX', 'WORKABLE_POP', 'HOUSEHOLD_N',
    # economic activity
    'TOTAL_BUISNESS', 'TOTAL_WORKER',
    # buildings
    'BUILDING', 'HOUSE', 'OLD_BUILDING', 'OLD_HOUSE',
    'BUILDING_DENSITY', 'BUILDING_COMPLEXITY',
    # roads and buses
    'BIGROAD_N', 'BIGROAD_EFFECT', 'ROAD_N', 'ROAD_EFFECT', 'BUS_N', 'BUS_EFFECT',
    # schools
    'ELEMENT_EFFECT', 'MIDDLE_EFFECT', 'HIGH_EFFECT',
    # other facilities
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT', 'SECURITY', 'CONVENIENT',
    # harmful facilities
    'CREMATORIUM_EFFECT', 'COLUMBARIUM_EFFECT', 'WASTE_EFFECT',
]

In [None]:
# Subset `data` to the columns listed in `selected_columns`.
selected_data = data[selected_columns]

In [None]:
# Standardise every selected column before PCA so that columns measured on
# larger scales do not dominate the components.
scaler = StandardScaler().fit(selected_data)
scaled_data = scaler.transform(selected_data)

In [None]:
# Fit a 10-component PCA on the standardised data and keep the per-row
# component scores in `pca_result`.
pca = PCA(n_components = 10)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Report, per component, the eigenvalue and the share of total variance.
eigenvalues = pca.explained_variance_
explained_variance = pca.explained_variance_ratio_
print('Explained variance ratio by each component:', explained_variance)
print('Eigenvalues (Variance) for each component:', eigenvalues)

In [None]:
# Cumulative share of variance explained as components are added; saved as PNG.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.xticks(component_numbers)  # one tick per component
plt.savefig('pca_var.png', dpi=300)
plt.show()

In [None]:
# Scree plot: eigenvalue of each principal component, in component order.
component_numbers = range(1, len(eigenvalues) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, eigenvalues, marker='o', linestyle='--')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.grid(True)
plt.show()

In [None]:
# Table of component weights: one row per input variable, one column per PC.
# NOTE: `pca.components_` holds the principal axes (one per row), so it is
# transposed to put variables on the rows.
component_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=selected_columns,
    columns=component_labels,
)
print(loadings)

In [None]:
# Annotated heatmap of the PCA weights, colour scale centred on zero.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(loadings, annot=True, cmap='coolwarm', center=0)
ax.set_title('PCA Loadings Heatmap')
plt.show()

# 인자분석

In [None]:
# Fit a factor-analysis model on the standardised data and keep the per-row
# factor scores in `fa_result`.
fa = FactorAnalysis(n_components=4, random_state=0)  # choose the desired number of factors
fa_result = fa.fit_transform(scaled_data)

In [None]:
# Table of factor weights: one row per input variable, one column per factor.
factor_labels = [f'Factor{k}' for k in range(1, len(fa.components_) + 1)]
factor_loadings = pd.DataFrame(
    fa.components_.T,
    index=selected_columns,
    columns=factor_labels,
)


In [None]:
# Annotated heatmap of the factor weights (two decimals), saved as a PNG.
plt.figure(figsize=(12, 10))
ax = sns.heatmap(
    factor_loadings,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},
    cmap='coolwarm',
    center=0,
)
ax.set_title('Factor Analysis Loadings Heatmap', size=15)
plt.xticks(rotation=45, ha='right', size=12)
plt.yticks(size=12)
plt.savefig('fa_result.png', dpi=300)
plt.show()

# 다중공선성 확인 및 해결

In [None]:
# Explanatory columns checked for multicollinearity (PRICE is not included).
selected_columns = [
    # population
    'TOTAL_POP', 'OLD_IDX', 'WORKABLE_POP', 'HOUSEHOLD_N',
    # economic activity
    'TOTAL_BUISNESS', 'TOTAL_WORKER',
    # buildings
    'BUILDING', 'HOUSE', 'OLD_BUILDING', 'OLD_HOUSE',
    'BUILDING_DENSITY', 'BUILDING_COMPLEXITY',
    # roads and buses
    'BIGROAD_N', 'BIGROAD_EFFECT', 'ROAD_N', 'ROAD_EFFECT', 'BUS_N', 'BUS_EFFECT',
    # schools
    'ELEMENT_EFFECT', 'MIDDLE_EFFECT', 'HIGH_EFFECT',
    # other facilities
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT', 'SECURITY', 'CONVENIENT',
    # harmful facilities
    'CREMATORIUM_EFFECT', 'COLUMBARIUM_EFFECT', 'WASTE_EFFECT',
]

In [None]:
# Design matrix: the selected columns plus an intercept column, which the
# VIF regressions need.
X = add_constant(data[selected_columns])

# Compute the variance inflation factor of every column of the design matrix.
design_matrix = X.values
vif_values = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(design_matrix.shape[1])
]
vif_data = pd.DataFrame({'feature': X.columns, 'VIF': vif_values})

print(vif_data)

In [None]:
# Keep only the features whose VIF exceeds the threshold used here (7).
# The intercept row is excluded by its 'const' label rather than by dropping
# positional index 0 in place: the old in-place drop raised KeyError whenever
# the intercept did not pass the threshold filter, and it mutated a filtered
# slice of `vif_data`.
vif_threshold = 7
is_high = vif_data['VIF'] > vif_threshold
is_feature = vif_data['feature'] != 'const'
high_vif = vif_data[is_high & is_feature].copy()

# Horizontal bar chart of the high-VIF features, saved as a PNG.
plt.figure(figsize=(10, 6))
sns.barplot(x='VIF', y='feature', data=high_vif, palette='coolwarm')
plt.title('High Variance Inflation Factors (VIF)')
plt.xlabel('Variance Inflation Factor')
plt.ylabel('Features')
plt.grid(True)
plt.savefig('high_vif.png', dpi = 300)
plt.show()

## 인구관련 변수 주성분분석을 통해 묶기

In [None]:
# Population-related columns to be combined into a single PCA score.
selected = ["TOTAL_POP", "WORKABLE_POP", "HOUSEHOLD_N"]

In [None]:
# Subset `data` to the columns listed in `selected`.
selected_data = data[selected]

In [None]:
# Standardise the population columns before PCA.
scaler = StandardScaler().fit(selected_data)
scaled_data = scaler.transform(selected_data)

In [None]:
# Fit a PCA with as many components as input columns (3) and keep the per-row
# component scores in `pca_result`.
pca = PCA(n_components = 3)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Table of component weights: one row per population column, one column per PC.
component_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=selected,
    columns=component_labels,
)

In [None]:
# Annotated heatmap of the population PCA weights, centred on zero.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(loadings, annot=True, cmap='coolwarm', center=0)
ax.set_title('PCA Loadings Heatmap')
plt.show()

In [None]:
# Report, per component, the eigenvalue and the share of total variance.
eigenvalues = pca.explained_variance_
explained_variance = pca.explained_variance_ratio_
print('Explained variance ratio by each component:', explained_variance)
print('Eigenvalues (Variance) for each component:', eigenvalues)

In [None]:
# Cumulative share of variance explained by the population components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.xticks(component_numbers)  # one tick per component
plt.show()

In [None]:
# Store the first column of the PCA scores as the combined population variable.
# NOTE(review): assumes `pca_result` still holds the population PCA fitted in
# the cells above and that `new_data` has the same row order as `data`.
new_data["POP_PCA_SCORE"] = pca_result[:, 0]

In [None]:
# Display the frame to check the newly added column.
new_data

## 경제활동 관련 변수 하나로 묶기

In [None]:
# Economic-activity columns to be combined into a single PCA score.
selected = ["TOTAL_BUISNESS", "TOTAL_WORKER"]

In [None]:
# Subset `data` to the columns listed in `selected`.
selected_data = data[selected]

In [None]:
# Standardise the economic-activity columns before PCA.
scaler = StandardScaler().fit(selected_data)
scaled_data = scaler.transform(selected_data)

In [None]:
# Fit a PCA with as many components as input columns (2) and keep the per-row
# component scores in `pca_result`.
pca = PCA(n_components = 2)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Table of component weights: one row per economic column, one column per PC.
component_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=selected,
    columns=component_labels,
)

In [None]:
# Annotated heatmap of the economic-activity PCA weights, centred on zero.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(loadings, annot=True, cmap='coolwarm', center=0)
ax.set_title('PCA Loadings Heatmap')
plt.show()

In [None]:
# Report, per component, the eigenvalue and the share of total variance.
eigenvalues = pca.explained_variance_
explained_variance = pca.explained_variance_ratio_
print('Explained variance ratio by each component:', explained_variance)
print('Eigenvalues (Variance) for each component:', eigenvalues)

In [None]:
# Cumulative share of variance explained by the economic-activity components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.xticks(component_numbers)  # one tick per component
plt.show()

In [None]:
# Store the first column of the PCA scores as the combined economy variable.
# NOTE(review): assumes `pca_result` still holds the economic-activity PCA
# fitted in the cells above and that `new_data` has the same row order as `data`.
new_data["ECONOMY_PCA_SCORE"] = pca_result[:, 0]

In [None]:
# Display the frame to check the newly added column.
new_data

## 건물 관련 변수 하나로 묶기

In [None]:
# Building-related columns to be combined into a single PCA score.
selected = ["BUILDING", "HOUSE", "OLD_BUILDING", "OLD_HOUSE"]

In [None]:
# Subset `data` to the columns listed in `selected`.
selected_data = data[selected]

In [None]:
# Standardise the building columns before PCA.
scaler = StandardScaler().fit(selected_data)
scaled_data = scaler.transform(selected_data)

In [None]:
# Fit a PCA with as many components as input columns (4) and keep the per-row
# component scores in `pca_result`.
pca = PCA(n_components = 4)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Table of component weights: one row per building column, one column per PC.
component_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=selected,
    columns=component_labels,
)

In [None]:
# Annotated heatmap of the building PCA weights, centred on zero.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(loadings, annot=True, cmap='coolwarm', center=0)
ax.set_title('PCA Loadings Heatmap')
plt.show()

In [None]:
# Cumulative share of variance explained by the building components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, cumulative_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.xticks(component_numbers)  # one tick per component
plt.show()

In [None]:
# Report, per component, the eigenvalue and the share of total variance.
eigenvalues = pca.explained_variance_
explained_variance = pca.explained_variance_ratio_
print('Explained variance ratio by each component:', explained_variance)
print('Eigenvalues (Variance) for each component:', eigenvalues)

In [None]:
# Store the first column of the PCA scores as the combined building variable.
# NOTE(review): assumes `pca_result` still holds the building PCA fitted in
# the cells above and that `new_data` has the same row order as `data`.
new_data["BUILDING_PCA_SCORE"] = pca_result[:, 0]

In [None]:
# Display the frame to check the newly added column.
new_data

# 최종 데이터 만들기

In [None]:
# Display all columns, including the derived ones, before pruning.
new_data.columns

In [None]:
# Columns removed from the final data set, listed one group per row.
eliminate = [
    # population columns (combined into POP_PCA_SCORE)
    "TOTAL_POP", "WORKABLE_POP", "HOUSEHOLD_N",
    # economic-activity columns (combined into ECONOMY_PCA_SCORE)
    'TOTAL_BUISNESS', 'TOTAL_WORKER',
    # building columns (combined into BUILDING_PCA_SCORE)
    'BUILDING', 'HOUSE', 'OLD_BUILDING', 'OLD_HOUSE',
    # dropped with no composite replacement
    'BUILDING_DENSITY', 'BUILDING_COMPLEXITY',
    # school columns (averaged into EDUCATION_EFFECT)
    'ELEMENT_EFFECT', 'MIDDLE_EFFECT', 'HIGH_EFFECT',
    # harmful-facility columns (averaged into HARMFUL_FACILITY_EFFECT)
    'CREMATORIUM_EFFECT', 'COLUMBARIUM_EFFECT', 'WASTE_EFFECT',
    # dropped with no composite replacement
    "CONVENIENT",
]

In [None]:
# Build the final data set by removing the columns listed in `eliminate`;
# `new_data` itself is left unchanged (drop returns a new frame).
final_data = new_data.drop(columns = eliminate)

In [None]:
# Display the columns that remain in the final data set.
final_data.columns

## 다중공선성 확인

In [None]:
# Explanatory columns of the final data set to re-check for multicollinearity.
selected_columns = [
    'OLD_IDX',
    # roads and buses
    'BIGROAD_N', 'BIGROAD_EFFECT', 'ROAD_N', 'ROAD_EFFECT', 'BUS_N', 'BUS_EFFECT',
    # facilities
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT', 'SECURITY',
    # averaged composite columns
    'HARMFUL_FACILITY_EFFECT', 'EDUCATION_EFFECT',
    # PCA score columns
    'POP_PCA_SCORE', 'ECONOMY_PCA_SCORE', 'BUILDING_PCA_SCORE',
]

In [None]:
# Design matrix: the final explanatory columns plus an intercept column.
X = add_constant(final_data[selected_columns])

# Compute the variance inflation factor of every column of the design matrix.
design_matrix = X.values
vif_values = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(design_matrix.shape[1])
]
vif_data = pd.DataFrame({'feature': X.columns, 'VIF': vif_values})

print(vif_data)

In [None]:
# Exclude the intercept row by its 'const' label so that only real features
# are plotted. Filtering by label (instead of dropping positional index 0 in
# place) keeps this cell safe to re-run: a second in-place drop of index 0
# would raise KeyError because the row is already gone.
vif_data = vif_data[vif_data['feature'] != 'const']

# Horizontal bar chart of the VIF of every remaining feature.
plt.figure(figsize=(10, 6))
sns.barplot(x='VIF', y='feature', data=vif_data, palette='coolwarm')
plt.title('Variance Inflation Factors (VIF)')
plt.xlabel('Variance Inflation Factor')
plt.ylabel('Features')
plt.grid(True)
plt.show()

## 상관행렬 확인

In [None]:
# Display the final columns before selecting those for the correlation matrix.
final_data.columns

In [None]:
# Columns of the final data set included in the correlation matrix.
selected = [
    'OLD_IDX',
    # roads and buses
    'BIGROAD_N', 'BIGROAD_EFFECT', 'ROAD_N', 'ROAD_EFFECT', 'BUS_N', 'BUS_EFFECT',
    # facilities
    'ACADEMY', 'HOSPITAL', 'BANK', 'MART_DEPARTMENT', 'SECURITY',
    # averaged composite columns
    'HARMFUL_FACILITY_EFFECT', 'EDUCATION_EFFECT',
    # PCA score columns
    'POP_PCA_SCORE', 'ECONOMY_PCA_SCORE', 'BUILDING_PCA_SCORE',
]

In [None]:
# Subset `final_data` to the columns listed in `selected`.
selected_data = final_data[selected]

In [None]:
# Compute the pairwise correlation matrix of the final feature set.
corr_matrix = selected_data.corr()

# Draw the annotated heatmap on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Selected Variables')
# Save under a distinct file name: 'heatmap.png' is already written by the
# full-variable correlation heatmap earlier in this notebook, and reusing the
# name would silently overwrite that figure.
plt.savefig('heatmap_final.png', dpi = 300)
plt.show()

In [None]:
# Export the final data set without the index column, encoded as UTF-8 with BOM.
# NOTE(review): this is an absolute, machine-specific output path; adjust it
# when running the notebook on another computer.
final_data.to_csv('C:/Users/leese/Downloads/final_data.csv', index = False, encoding = 'utf-8-sig')

In [None]:
# Display the exported final data set.
final_data