In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_california_housing

# 데이터셋 불러오기
housing = fetch_california_housing()

# 데이터프레임으로 변환
df_housing = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df_housing['MedHouseVal'] = housing.target
df_housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [3]:
# MedInc (중위 소득): 특정 블록 그룹 내의 중위 소득을 나타냅니다. 즉, 해당 지역의 소득 분포에서 중간값에 해당하는 소득 수준입니다.
# HouseAge (중위 주택 연식): 특정 블록 그룹 내의 주택의 중위 연식을 의미합니다. 즉, 해당 지역의 주택들이 얼마나 오래되었는지를 나타냅니다.
# AveRooms (가구당 평균 방 개수): 한 가구당 평균적으로 몇 개의 방이 있는지를 나타냅니다. 이는 주택의 크기와 관련된 지표입니다.
# AveBedrms (가구당 평균 침실 개수): 한 가구당 평균적으로 몇 개의 침실이 있는지를 나타냅니다.
# Population (인구 수): 특정 블록 그룹 내의 총 인구 수를 의미합니다. 이는 해당 지역의 인구 밀도를 이해하는 데 도움이 됩니다.
# AveOccup (가구당 평균 거주자 수): 한 가구당 평균적으로 몇 명이 거주하는지를 나타냅니다.
# Latitude (위도): 주택이 북쪽으로 얼마나 떨어져 있는지를 측정한 값입니다. 지리적 위치를 나타내는 데 사용됩니다.
# Longitude (경도): 주택이 서쪽으로 얼마나 떨어져 있는지를 측정한 값입니다. 역시 지리적 위치를 나타냅니다.
# MedHouseVal (중위 주택 가치): 캘리포니아 지역의 주택에 대한 중위 가치를 의미합니다. 이는 주택 가격의 분포에서 중간값에 해당하는 가격 수준입니다.

In [4]:
df_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [5]:
df_housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [6]:
# MedInc (중위 소득) 오름차순 정렬
df_housing.sort_values(by = 'MedInc', ascending = True)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
4861,0.4999,29.0,2.373272,1.055300,2690.0,12.396313,34.02,-118.28,5.00001
7125,0.4999,36.0,2.500000,0.833333,15.0,2.500000,33.90,-118.04,1.62500
6688,0.4999,28.0,7.677419,1.870968,142.0,4.580645,34.15,-118.08,5.00001
19800,0.4999,15.0,11.596491,2.561404,131.0,2.298246,40.43,-123.32,0.56700
6343,0.4999,52.0,3.875000,0.562500,44.0,2.750000,34.06,-117.75,1.12500
...,...,...,...,...,...,...,...,...,...
4605,15.0001,52.0,8.483019,0.962264,813.0,3.067925,34.07,-118.33,5.00001
4606,15.0001,52.0,9.204969,1.062112,531.0,3.298137,34.07,-118.33,5.00001
4626,15.0001,52.0,7.958333,0.833333,457.0,3.808333,34.06,-118.32,5.00001
8848,15.0001,52.0,8.907583,1.099526,1407.0,3.334123,34.08,-118.39,5.00001


In [7]:
# MedInc (중위 소득) 내림차순 정렬
df_housing.sort_values(by = 'MedInc', ascending=False)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
4352,15.0001,37.0,8.659574,1.425532,100.0,2.127660,34.10,-118.37,5.00001
10673,15.0001,13.0,7.842900,0.993958,1865.0,2.817221,33.62,-117.85,5.00001
8849,15.0001,52.0,9.237288,1.062954,1266.0,3.065375,34.08,-118.40,5.00001
4606,15.0001,52.0,9.204969,1.062112,531.0,3.298137,34.07,-118.33,5.00001
5257,15.0001,42.0,9.229032,1.161290,829.0,2.674194,34.06,-118.49,5.00001
...,...,...,...,...,...,...,...,...,...
3249,0.4999,23.0,6.054545,1.672727,198.0,3.600000,36.09,-119.99,1.00000
3258,0.4999,16.0,21.631579,6.000000,26.0,1.368421,39.42,-122.89,0.73500
6343,0.4999,52.0,3.875000,0.562500,44.0,2.750000,34.06,-117.75,1.12500
5213,0.4999,52.0,2.600000,0.733333,74.0,1.644444,33.93,-118.28,0.90600


In [8]:
# MedInc 컬럼의 기술 통계 1

df_housing["MedInc"].describe()

count    20640.000000
mean         3.870671
std          1.899822
min          0.499900
25%          2.563400
50%          3.534800
75%          4.743250
max         15.000100
Name: MedInc, dtype: float64

In [9]:
# MedInc 컬럼의 기술 통계 2

#.2f : 소수점 이하 2자리까지 반올림

mean_medinc = df_housing["MedInc"].mean()  # 평균
print(f"평균: {mean_medinc:.2f}")

std_medinc = df_housing["MedInc"].std()    # 표준편차
print(f"표준편차: {std_medinc:.2f}")

min_medinc = df_housing["MedInc"].min()    # 최소값
print(f"최소값: {min_medinc:.2f}")

max_medinc = df_housing["MedInc"].max()    # 최대값
print(f"최대값: {max_medinc:.2f}")

median_medinc = df_housing["MedInc"].median()  # 중위수
print(f"중위수: {median_medinc:.2f}")

quantile_25 = df_housing["MedInc"].quantile(0.25)  # 25% 분위수
print(f"25% 분위수: {quantile_25:.2f}")

quantile_75 = df_housing["MedInc"].quantile(0.75)  # 75% 분위수
print(f"75% 분위수: {quantile_75:.2f}")

평균: 3.87
표준편차: 1.90
최소값: 0.50
최대값: 15.00
중위수: 3.53
25% 분위수: 2.56
75% 분위수: 4.74


In [10]:
# 평당 가격
df_housing['Value_per_Room'] = df_housing['MedHouseVal'] / df_housing['AveRooms']
df_housing['Value_per_Room']

0        0.648041
1        0.574691
2        0.424824
3        0.586693
4        0.544744
           ...   
20635    0.154793
20636    0.126103
20637    0.177311
20638    0.158926
20639    0.170133
Name: Value_per_Room, Length: 20640, dtype: float64

In [11]:
# 주택 연식 대비 평균 방 개수 비율 (HouseAge_to_AveRooms_Ratio)
df_housing['HouseAge_to_AveRooms_Ratio'] = df_housing['HouseAge'] / df_housing['AveRooms']
df_housing['HouseAge_to_AveRooms_Ratio']

0        5.870455
1        3.366390
2        6.274029
3        8.938776
4        8.277812
           ...   
20635    4.954955
20636    2.944046
20637    3.265750
20638    3.377419
20639    3.044883
Name: HouseAge_to_AveRooms_Ratio, Length: 20640, dtype: float64

In [12]:
# 방당 평균 인구 수 (Population_per_Room):
df_housing['Population_per_Room'] = df_housing['Population'] / df_housing['AveRooms']
df_housing['Population_per_Room']

0         46.104545
1        384.890548
2         59.844581
3         95.919937
4         89.941610
            ...    
20635    167.477477
20636     58.226686
20637    193.447649
20638    139.037097
20639    263.953321
Name: Population_per_Room, Length: 20640, dtype: float64

In [13]:
df_housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,Value_per_Room,HouseAge_to_AveRooms_Ratio,Population_per_Room
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526,0.648041,5.870455,46.104545
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585,0.574691,3.366390,384.890548
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521,0.424824,6.274029,59.844581
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,0.586693,8.938776,95.919937
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,0.544744,8.277812,89.941610
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0.154793,4.954955,167.477477
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0.126103,2.944046,58.226686
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0.177311,3.265750,193.447649
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0.158926,3.377419,139.037097
