In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
import pandas as pd

train = pd.read_csv('/content/drive/MyDrive/aistage/train.csv',low_memory=False)

In [38]:
train['rounded_X'] = train['좌표X'].round(3)
train['rounded_Y'] = train['좌표Y'].round(3)

In [39]:
library = pd.read_csv('/content/drive/MyDrive/aistage/library.csv', header=None)

library.columns = [
    '도서관일련번호', '도서관명', '구번호', '구', '주소', '도서관전화번호',
    '홈페이지URL', '운영시간', '정기휴관일', '도서관구분', '도서관구분명',
    '위도', '경도', '경찰서명', '경찰서부서명', '경찰서부서코드', '시도경찰청명'
]

library['rounded_X'] = library['경도'].round(3) #좌표X 경도
library['rounded_Y'] = library['위도'].round(3) #좌표Y 위도

# Merge based on rounded coordinates and count the occurrences
merged_df = pd.merge(train, library, how='left', left_on=['rounded_X', 'rounded_Y'], right_on=['rounded_X', 'rounded_Y'])
count_df = merged_df.groupby(['rounded_X', 'rounded_Y']).size().reset_index(name='좌표별_도서관_숫자')
train = train.merge(count_df, on=['rounded_X', 'rounded_Y'], how='left')


library_count = library['구'].value_counts().reset_index()
library_count.columns = ['구', '구_도서관_숫자']

# Extract '구' from '시군구' column in train_df
train['구'] = train['시군구'].apply(lambda x: x.split()[1])

# Merge the train_df with the library_summary_df on the '구' column
train = pd.merge(train, library_count, on='구', how='left')

In [42]:
train['구_도서관_숫자']

0          54
1          54
2          54
3          54
4          54
           ..
1118817    80
1118818    80
1118819    80
1118820    80
1118821    25
Name: 구_도서관_숫자, Length: 1118822, dtype: int64

In [41]:
train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,관리비 업로드,좌표X,좌표Y,단지신청일,target,rounded_X,rounded_Y,좌표별_도서관_숫자,구,구_도서관_숫자
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,N,127.05721,37.476763,2022-11-17 10:19:06.0,124000,127.057,37.477,101.0,강남구,54
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,N,127.05721,37.476763,2022-11-17 10:19:06.0,123500,127.057,37.477,101.0,강남구,54
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,N,127.05721,37.476763,2022-11-17 10:19:06.0,91500,127.057,37.477,101.0,강남구,54
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,N,127.05721,37.476763,2022-11-17 10:19:06.0,130000,127.057,37.477,101.0,강남구,54
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,N,127.05721,37.476763,2022-11-17 10:19:06.0,117000,127.057,37.477,101.0,강남구,54


In [43]:
facilities = pd.read_csv('/content/drive/MyDrive/aistage/50시설.csv', header=None)
facilities.columns = [
    '기준일', '위도', '경도', '구분', '센터명', '주소', '설립일자',
    '추후수정', '추후수정2', '중요업무내용', '경찰서명', '경찰서부서명',
    '경찰서부서코드', '시도경찰청명'
]

In [44]:
# Round the coordinates to 3 decimal places for comparison
facilities['rounded_Y'] = facilities['위도'].round(3)
facilities['rounded_X'] = facilities['경도'].round(3)
train['rounded_Y'] = train['좌표Y'].round(3)
train['rounded_X'] = train['좌표X'].round(3)

In [45]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# One-hot encode the 중요업무내용
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(facilities['중요업무내용'].str.split('/'))
encoded_df = pd.DataFrame(encoded_labels, columns=mlb.classes_)
facilities_df = pd.concat([facilities, encoded_df], axis=1)

# Initialize the result columns
result_columns = mlb.classes_
for col in result_columns:
    train[col] = 0

In [46]:
# Process each row in temp_df
for index, row in train.iterrows():
    nearby_facilities = facilities_df[
        (facilities['rounded_X'] == row['rounded_X']) &
        (facilities['rounded_Y'] == row['rounded_Y'])
    ]
    if not nearby_facilities.empty:
        summed_labels = nearby_facilities[result_columns].sum()
        train.loc[index, result_columns] = summed_labels

In [47]:
train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,구,구_도서관_숫자,50+세대 삶의 전환을 위한 교육,50+정책 및 사업총괄 기획,사회공헌 아카데미,운영,"인생재설계 교육, 상담, 사회참여활동",일자리 플랫폼,지원,커뮤니티 지원
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,강남구,54,0,0,0,0,0,0,0,0
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,강남구,54,0,0,0,0,0,0,0,0
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,강남구,54,0,0,0,0,0,0,0,0
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,강남구,54,0,0,0,0,0,0,0,0
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,강남구,54,0,0,0,0,0,0,0,0


In [48]:
bike = pd.read_csv('/content/drive/MyDrive/aistage/따릉이대여소.csv', header=None)
bike.columns = [
    '대여소번호', '대여소명', '구', '상세주소', '위도', '경도',
    '설치시기', '거치대수_LCD', '거치대수_QR', '운영방식명', '경찰서명',
    '경찰서부서명', '경찰서부서코드', '시도경찰청명'
]

# Calculate the total number of docking stations per district
bike['구별_전체거치대수'] = bike['거치대수_LCD'].fillna(0) + bike['거치대수_QR'].fillna(0)
total_docks_per_district = bike.groupby('구')['구별_전체거치대수'].sum().reset_index()

# Merge the total docking stations information with temp_df
train = train.merge(total_docks_per_district, on='구', how='left')

In [49]:
train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,구_도서관_숫자,50+세대 삶의 전환을 위한 교육,50+정책 및 사업총괄 기획,사회공헌 아카데미,운영,"인생재설계 교육, 상담, 사회참여활동",일자리 플랫폼,지원,커뮤니티 지원,구별_전체거치대수
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,54,0,0,0,0,0,0,0,0,1754.0
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,54,0,0,0,0,0,0,0,0,1754.0
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,54,0,0,0,0,0,0,0,0,1754.0
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,54,0,0,0,0,0,0,0,0,1754.0
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,54,0,0,0,0,0,0,0,0,1754.0


In [50]:
bike['rounded_Y'] = bike['위도'].round(3)
bike['rounded_X'] = bike['경도'].round(3)

# Calculate the total docking stations per rounded coordinate
bike['좌표별_전체거치대수'] = bike['거치대수_LCD'].fillna(0) + bike['거치대수_QR'].fillna(0)
total_docks_per_coord = bike.groupby(['rounded_Y', 'rounded_X'])['좌표별_전체거치대수'].sum().reset_index()

# Merge the total docking stations information with temp_df
train = train.merge(total_docks_per_coord, on=['rounded_Y', 'rounded_X'], how='left')

In [51]:
train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,50+세대 삶의 전환을 위한 교육,50+정책 및 사업총괄 기획,사회공헌 아카데미,운영,"인생재설계 교육, 상담, 사회참여활동",일자리 플랫폼,지원,커뮤니티 지원,구별_전체거치대수,좌표별_전체거치대수
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,0,0,0,0,0,0,0,0,1754.0,
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,0,0,0,0,0,0,0,0,1754.0,
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,0,0,0,0,0,0,0,0,1754.0,
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,0,0,0,0,0,0,0,0,1754.0,
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,0,0,0,0,0,0,0,0,1754.0,


In [52]:
movement = pd.read_csv('/content/drive/MyDrive/aistage/생활이동.csv',header=None)

movement.columns = [
    '년월', '구', '유입인구수', '유입인구증감률', '유출인구수', '유출인구증감률'
]

train['년월'] = train['계약년월'].astype(str).str[:6]

# Convert '년월' column in living_movement_df to string for merging
movement['년월'] = movement['년월'].astype(str)


# Merge the living movement information with temp_df based on '년월' and '자치구명'
train = train.merge(movement, on=['년월', '구'], how='left')

# Clean up the resulting DataFrame
train.drop(columns=['년월'], inplace=True)



In [53]:
train[['구','계약년월']]

Unnamed: 0,구,계약년월
0,강남구,201712
1,강남구,201712
2,강남구,201712
3,강남구,201801
4,강남구,201801
...,...,...
1118817,은평구,200707
1118818,은평구,200708
1118819,은평구,200708
1118820,은평구,200709


In [54]:
population = pd.read_csv('/content/drive/MyDrive/aistage/인구통계.csv',header=None)
population.columns = [
    '년도', '행정구역읍면동코드', '행정구역읍면동명', '노인인구수', '노인인구증감율',
    '미성년자인구수', '미성년자인구증감율', '여자인구수', '여자인구증감율'
]

population['년도'] = population['년도'].astype(str)
# Extract district and sub-district information from '시군구' in temp_df
train['년도'] = train['계약년월'].astype(str).str[:4]
train['동'] = train['시군구'].apply(lambda x: x.split()[2] if len(x.split()) > 2 else '')

# Extract district and sub-district information from '행정구역읍면동명' in population_stats_df
population['동'] = population['행정구역읍면동명']

# Merge the population statistics information with temp_df based on '구' and '동'
train = train.merge(population, on=['년도', '동'], how='left')

# Clean up the resulting DataFrame
train.drop(columns=['행정구역읍면동코드', '행정구역읍면동명', '년도'], inplace=True)


In [55]:
train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,유입인구증감률,유출인구수,유출인구증감률,동,노인인구수,노인인구증감율,미성년자인구수,미성년자인구증감율,여자인구수,여자인구증감율
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,,,,개포동,,,,,,
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,,,,개포동,,,,,,
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,,,,개포동,,,,,,
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,,,,개포동,,,,,,
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,,,,개포동,,,,,,


In [66]:
train.to_csv('/content/drive/MyDrive/aistage/train_final.csv', index=False)

In [51]:
train

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,유입인구증감률,유출인구수,유출인구증감률,동,노인인구수,노인인구증감율,미성년자인구수,미성년자인구증감율,여자인구수,여자인구증감율
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,,,,개포동,,,,,,
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,,,,개포동,,,,,,
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,,,,개포동,,,,,,
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,,,,개포동,,,,,,
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,,,,개포동,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122598,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200707,12,11,1998,...,,,,구산동,,,,,,
1122599,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200708,25,10,1998,...,,,,구산동,,,,,,
1122600,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200708,31,20,1998,...,,,,구산동,,,,,,
1122601,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200709,15,8,1998,...,,,,구산동,,,,,,


In [67]:
# EDA to check for NaN values
nan_counts = train.isna().sum()
nan_counts = nan_counts[nan_counts > 0]

# Display the columns with NaN values and their counts
print("Columns with NaN values and their counts:")
print(nan_counts)

# Display the percentage of NaN values in each column
nan_percentage = (train.isna().sum() / len(train)) * 100
nan_percentage = nan_percentage[nan_percentage > 0]

print("\nPercentage of NaN values in each column:")
print(nan_percentage)

Columns with NaN values and their counts:
번지                            225
본번                             75
부번                             75
아파트명                         2132
해제사유발생일                   1116602
k-단지분류(아파트,주상복합등등)         873920
k-전화번호                     873503
k-팩스번호                     875971
단지소개기존clob                1053922
k-세대타입(분양형태)               872792
k-관리방식                     872792
k-복도유형                     873119
k-난방방식                     872792
k-전체동수                     873859
k-전체세대수                    872792
k-건설사(시공사)                 874287
k-시행사                      874483
k-사용검사일-사용승인일              872925
k-연면적                      872792
k-주거전용면적                   872837
k-관리비부과면적                  872792
k-전용면적별세대현황(60㎡이하)         872837
k-전용면적별세대현황(60㎡~85㎡이하)     872837
k-85㎡~135㎡이하               872837
k-135㎡초과                  1122276
k-홈페이지                    1009210
k-등록일자                    1111460
k-수정일자                     872837
고용보험관리

test 데이터셋만들기

In [68]:
test = pd.read_csv('/content/drive/MyDrive/aistage/test.csv',low_memory=False)

In [69]:
test['rounded_X'] = test['좌표X'].round(3)
test['rounded_Y'] = test['좌표Y'].round(3)

In [70]:
library = pd.read_csv('/content/drive/MyDrive/aistage/library.csv', header=None)

library.columns = [
    '도서관일련번호', '도서관명', '구번호', '구', '주소', '도서관전화번호',
    '홈페이지URL', '운영시간', '정기휴관일', '도서관구분', '도서관구분명',
    '위도', '경도', '경찰서명', '경찰서부서명', '경찰서부서코드', '시도경찰청명'
]

library['rounded_X'] = library['경도'].round(3) #좌표X 경도
library['rounded_Y'] = library['위도'].round(3) #좌표Y 위도

# Merge based on rounded coordinates and count the occurrences
merged_df = pd.merge(test, library, how='left', left_on=['rounded_X', 'rounded_Y'], right_on=['rounded_X', 'rounded_Y'])
count_df = merged_df.groupby(['rounded_X', 'rounded_Y']).size().reset_index(name='좌표별_도서관_숫자')
test = test.merge(count_df, on=['rounded_X', 'rounded_Y'], how='left')


library_count = library['구'].value_counts().reset_index()
library_count.columns = ['구', '구_도서관_숫자']

# Extract '구' from '시군구' column in train_df
test['구'] = test['시군구'].apply(lambda x: x.split()[1])

# Merge the train_df with the library_summary_df on the '구' column
test = pd.merge(test, library_count, on='구', how='left')

In [71]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

facilities = pd.read_csv('/content/drive/MyDrive/aistage/50시설.csv', header=None)
facilities.columns = [
    '기준일', '위도', '경도', '구분', '센터명', '주소', '설립일자',
    '추후수정', '추후수정2', '중요업무내용', '경찰서명', '경찰서부서명',
    '경찰서부서코드', '시도경찰청명'
]

# Round the coordinates to 3 decimal places for comparison
facilities['rounded_Y'] = facilities['위도'].round(3)
facilities['rounded_X'] = facilities['경도'].round(3)
test['rounded_Y'] = test['좌표Y'].round(3)
test['rounded_X'] = test['좌표X'].round(3)


# One-hot encode the 중요업무내용
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(facilities['중요업무내용'].str.split('/'))
encoded_df = pd.DataFrame(encoded_labels, columns=mlb.classes_)
facilities_df = pd.concat([facilities, encoded_df], axis=1)

# Initialize the result columns
result_columns = mlb.classes_
for col in result_columns:
    test[col] = 0

# Process each row in temp_df
for index, row in test.iterrows():
    nearby_facilities = facilities_df[
        (facilities['rounded_X'] == row['rounded_X']) &
        (facilities['rounded_Y'] == row['rounded_Y'])
    ]
    if not nearby_facilities.empty:
        summed_labels = nearby_facilities[result_columns].sum()
        test.loc[index, result_columns] = summed_labels

In [72]:
bike = pd.read_csv('/content/drive/MyDrive/aistage/따릉이대여소.csv', header=None)
bike.columns = [
    '대여소번호', '대여소명', '구', '상세주소', '위도', '경도',
    '설치시기', '거치대수_LCD', '거치대수_QR', '운영방식명', '경찰서명',
    '경찰서부서명', '경찰서부서코드', '시도경찰청명'
]

# Calculate the total number of docking stations per district
bike['구별_전체거치대수'] = bike['거치대수_LCD'].fillna(0) + bike['거치대수_QR'].fillna(0)
total_docks_per_district = bike.groupby('구')['구별_전체거치대수'].sum().reset_index()

# Merge the total docking stations information with temp_df
test = test.merge(total_docks_per_district, on='구', how='left')

In [73]:
bike['rounded_Y'] = bike['위도'].round(3)
bike['rounded_X'] = bike['경도'].round(3)

# Calculate the total docking stations per rounded coordinate
bike['좌표별_전체거치대수'] = bike['거치대수_LCD'].fillna(0) + bike['거치대수_QR'].fillna(0)
total_docks_per_coord = bike.groupby(['rounded_Y', 'rounded_X'])['좌표별_전체거치대수'].sum().reset_index()

# Merge the total docking stations information with temp_df
test = test.merge(total_docks_per_coord, on=['rounded_Y', 'rounded_X'], how='left')

In [74]:
movement = pd.read_csv('/content/drive/MyDrive/aistage/생활이동.csv',header=None)

movement.columns = [
    '년월', '구', '유입인구수', '유입인구증감률', '유출인구수', '유출인구증감률'
]

test['년월'] = test['계약년월'].astype(str).str[:6]

# Convert '년월' column in living_movement_df to string for merging
movement['년월'] = movement['년월'].astype(str)


# Merge the living movement information with temp_df based on '년월' and '자치구명'
test = test.merge(movement, on=['년월', '구'], how='left')

# Clean up the resulting DataFrame
test.drop(columns=['년월'], inplace=True)

In [75]:
population = pd.read_csv('/content/drive/MyDrive/aistage/인구통계.csv',header=None)
population.columns = [
    '년도', '행정구역읍면동코드', '행정구역읍면동명', '노인인구수', '노인인구증감율',
    '미성년자인구수', '미성년자인구증감율', '여자인구수', '여자인구증감율'
]

population['년도'] = population['년도'].astype(str)
# Extract district and sub-district information from '시군구' in temp_df
test['년도'] = test['계약년월'].astype(str).str[:4]
test['동'] = test['시군구'].apply(lambda x: x.split()[2] if len(x.split()) > 2 else '')

# Extract district and sub-district information from '행정구역읍면동명' in population_stats_df
population['동'] = population['행정구역읍면동명']

# Merge the population statistics information with temp_df based on '구' and '동'
test = test.merge(population, on=['년도', '동'], how='left')

# Clean up the resulting DataFrame
test.drop(columns=['행정구역읍면동코드', '행정구역읍면동명', '년도'], inplace=True)

In [76]:
test.to_csv('/content/drive/MyDrive/aistage/test_final.csv', index=False)

In [77]:
test

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,유입인구증감률,유출인구수,유출인구증감률,동,노인인구수,노인인구증감율,미성년자인구수,미성년자인구증감율,여자인구수,여자인구증감율
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.9700,202307,26,5,1987,...,,,,개포동,,,,,,
1,서울특별시 강남구 개포동,651-1,651.0,1.0,개포더샵트리에,108.2017,202308,15,10,2021,...,,,,개포동,,,,,,
2,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,161.0000,202307,28,15,1984,...,,,,개포동,,,,,,
3,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,133.4600,202308,10,14,1984,...,,,,개포동,,,,,,
4,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,104.4300,202308,18,6,1984,...,,,,개포동,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.6500,202307,19,13,2014,...,,,,신내동,,,,,,
9268,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.6200,202307,25,12,2014,...,,,,신내동,,,,,,
9269,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.6500,202308,27,12,2014,...,,,,신내동,,,,,,
9270,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.9400,202309,2,18,2014,...,,,,신내동,,,,,,


In [78]:
# EDA to check for NaN values
nan_counts = test.isna().sum()
nan_counts = nan_counts[nan_counts > 0]

# Display the columns with NaN values and their counts
print("Columns with NaN values and their counts:")
print(nan_counts)

# Display the percentage of NaN values in each column
nan_percentage = (test.isna().sum() / len(test)) * 100
nan_percentage = nan_percentage[nan_percentage > 0]

print("\nPercentage of NaN values in each column:")
print(nan_percentage)

Columns with NaN values and their counts:
번지                           2
아파트명                        10
해제사유발생일                   9060
k-단지분류(아파트,주상복합등등)        6582
k-전화번호                    6576
k-팩스번호                    6606
단지소개기존clob                8718
k-세대타입(분양형태)              6562
k-관리방식                    6562
k-복도유형                    6564
k-난방방식                    6562
k-전체동수                    6577
k-전체세대수                   6562
k-건설사(시공사)                6579
k-시행사                     6580
k-사용검사일-사용승인일             6563
k-연면적                     6562
k-주거전용면적                  6562
k-관리비부과면적                 6562
k-전용면적별세대현황(60㎡이하)        6562
k-전용면적별세대현황(60㎡~85㎡이하)    6562
k-85㎡~135㎡이하              6562
k-135㎡초과                  9270
k-홈페이지                    7876
k-등록일자                    8554
k-수정일자                    6562
고용보험관리번호                  7453
경비비관리형태                   6573
세대전기계약방법                  6642
청소비관리형태                   6573
건축면적                      65