In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import Window
from pyspark.sql.functions import col, lead, when, sum as Fsum, round
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#  PySpark에서 데이터 불러오기
df_silver = spark.table("postgres_team5_catalog.silver.silver_district_traffic_stats")

# '전체' 값 제거
df_silver = df_silver.filter(df_silver["gugun_nm"] != "전체")
df_silver = df_silver.drop("casualty_rate")
display(df_silver)

searchyear,gugun_code,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate
2005,110,종로구,1502,191.3,24,13.8,2185,1256.8,1.6,145.47
2005,140,중구,1333,210.2,13,9.7,1857,1381.5,0.98,139.31
2005,170,용산구,1360,158.1,20,8.3,2111,879.3,1.47,155.22
2005,200,성동구,1146,107.0,13,3.8,1596,465.7,1.13,139.27
2005,215,광진구,1564,146.4,21,5.5,2282,599.8,1.34,145.91
2005,230,동대문구,1700,155.4,21,5.4,2489,644.4,1.24,146.41
2005,260,중랑구,1850,153.6,11,2.6,2739,637.1,0.59,148.05
2005,290,성북구,1550,123.0,19,4.1,2251,481.7,1.23,145.23
2005,305,강북구,1136,124.4,18,5.1,1606,452.0,1.58,141.37
2005,320,도봉구,928,87.7,8,2.1,1409,367.5,0.86,151.83


In [0]:
window_total = Window.partitionBy("gugun_nm").orderBy("searchyear")

# 연도별 총 발생건수(total.occrrnc_cnt), 발생률(occrrnc_rate) 컬럼제작 -> 기존의 silver 테이블에 없어서 추가
# ML 학습용 정답(발생건수, 사망자수, 부상자수, 발생률, 사망률, 부상률) 만들기
df_gugun = df_silver.withColumn(
    "occrrnc_cnt_next", lead("occrrnc_cnt", 1).over(window_total)
).withColumn(
    "dth_cnt_next", lead("dth_cnt", 1).over(window_total)
).withColumn(
    "injpsn_cnt_next", lead("injpsn_cnt", 1).over(window_total)
).withColumn(
    "total_occrrnc_cnt", Fsum("occrrnc_cnt").over(window_total)
).withColumn(
    "occrrnc_rate", round(col("occrrnc_cnt") * 100.0 / col("total_occrrnc_cnt"), 2) # 연도별/지역별 사고 발생률
).withColumn(
    "occrrnc_rate_next", lead("occrrnc_rate", 1).over(window_total)
).withColumn(
    "dth_rate_next", lead("dth_rate", 1).over(window_total)
).withColumn(
    "injpsn_rate_next", lead("injpsn_rate", 1).over(window_total))

# 결측치(null) 제거
df_gugun = df_gugun.na.drop(subset=[
    "occrrnc_cnt_next", "dth_cnt_next", "injpsn_cnt_next",
    "occrrnc_rate_next", "dth_rate_next", "injpsn_rate_next"
])

# Pandas로 변환
df_pd = df_gugun.select(df_gugun.columns).toPandas()

# 불필요한 컬럼 제거
df_pd = df_pd.drop(columns=["gugun_code"], errors='ignore')

# **연도별로 같은구가 나오게 정렬**

# 1. 컬럼 순서를 저장
original_columns = df_pd.columns.tolist()

# 2. gugun_nm → searchyear 순으로 정렬
df_pd_sorted = df_pd.sort_values(by=["gugun_nm", "searchyear"]).reset_index(drop=True)

# 3. 컬럼 순서 원래대로 유지
df_pd_sorted = df_pd_sorted[original_columns]

# 4. 결과 확인
display(df_pd_sorted)

searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22


In [0]:
# 다음 해의 발생건수 예측
X = df_pd_sorted[["searchyear", "gugun_nm", "occrrnc_cnt", "dth_cnt", "injpsn_cnt","occrrnc_rate", "dth_rate", "injpsn_rate"]]
y_cnt_occ = np.log1p(df_pd_sorted["occrrnc_cnt_next"]) # RMSE를 낮추기 위한 log변환

# gugun_nm은 문자열이므로 숫자로 인코딩
from sklearn.preprocessing import LabelEncoder

labeling = LabelEncoder()
X["gugun_nm"] = labeling.fit_transform(X["gugun_nm"])

# 학습용, 테스트용으로 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y_cnt_occ, test_size=0.2, random_state=42)
model_cnt_occ = RandomForestRegressor(n_estimators=100, random_state=42)
# 모델 학습
model_cnt_occ.fit(X_train, y_train)

y_pred_log = model_cnt_occ.predict(X)

#모델이 log1p()로 바꾼 값을 예측했으니, 예측 결과는 반드시 expm1()으로 복원해야 원래 스케일로 돌아옴.
df_pd_sorted["occrrnc_cnt_pred"] = np.expm1(y_pred_log).round(0).astype(int) 

print("발생건수 RMSE:", np.sqrt(mean_squared_error(y_test, model_cnt_occ.predict(X_test))).round(2))

display(df_pd_sorted)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["gugun_nm"] = labeling.fit_transform(X["gugun_nm"])


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run judicious-worm-55 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/65c50f9584054fc58dd2d1a237eec7f7
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
발생건수 RMSE: 0.09


searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754


In [0]:
# 2. 사망자수 예측
y_cnt_dth =  np.log1p(df_pd_sorted["dth_cnt_next"]) # RMSE를 낮추기 위한 log변환

X_train, X_test, y_train, y_test = train_test_split(X, y_cnt_dth, test_size=0.2, random_state=42)
model_cnt_dth = RandomForestRegressor(n_estimators=100, random_state=42)
model_cnt_dth.fit(X_train, y_train)

y_pred_log = model_cnt_dth.predict(X)
df_pd_sorted["dth_cnt_pred"] = np.expm1(y_pred_log).round(0).astype(int)
print("사망자수 RMSE:", np.sqrt(mean_squared_error(y_test, model_cnt_dth.predict(X_test))).round(2))


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run sincere-stoat-77 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/1262aaafd15e458c82c5e7a41b7d5609
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
사망자수 RMSE: 0.36


In [0]:
# 3. 부상자수 예측
y_cnt_inj =  np.log1p(df_pd_sorted["injpsn_cnt_next"]) # RMSE를 낮추기 위한 log변환

X_train, X_test, y_train, y_test = train_test_split(X, y_cnt_inj, test_size=0.2, random_state=42)
model_cnt_inj = RandomForestRegressor(n_estimators=100, random_state=42)
model_cnt_inj.fit(X_train, y_train)

y_pred_log = model_cnt_inj.predict(X)
df_pd_sorted["injpsn_cnt_pred"] = np.expm1(y_pred_log).round(0).astype(int)

print("부상자수 RMSE:", np.sqrt(mean_squared_error(y_test, model_cnt_inj.predict(X_test))).round(2))

display(df_pd_sorted)

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run gaudy-seal-120 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/85f2c1945d3248ce822fcd8b6c39fce9
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
부상자수 RMSE: 0.09


searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred,dth_cnt_pred,injpsn_cnt_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254,22,4754
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325,28,4850
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261,28,4725
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496,26,5037
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571,30,5220
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594,32,5211
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716,27,5387
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611,26,5296
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604,21,5236
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754,17,5237


In [0]:
# 다음 해의 발생률 예측
y_occ = np.log1p(df_pd_sorted["occrrnc_rate_next"])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y_occ, test_size=0.2, random_state=42)

# 모델 학습
model_occ = RandomForestRegressor(n_estimators=100, random_state=42)
model_occ.fit(X_train, y_train)

y_pred_log = model_occ.predict(X)
df_pd_sorted["occrrnc_rate_pred"] = np.expm1(y_pred_log).round(2).astype(float)

# 예측 및 성능 평가
print("발생률 RMSE:", np.sqrt(mean_squared_error(y_test, model_occ.predict(X_test))).round(2))

display(df_pd_sorted)


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run orderly-robin-873 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/9a5b20b6e5244d769a5468f3f1d4eb26
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
발생률 RMSE: 0.06


searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred,dth_cnt_pred,injpsn_cnt_pred,occrrnc_rate_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254,22,4754,49.31
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325,28,4850,33.98
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261,28,4725,25.32
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496,26,5037,21.96
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571,30,5220,17.38
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594,32,5211,14.8
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716,27,5387,13.78
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611,26,5296,11.5
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604,21,5236,10.23
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754,17,5237,9.41


In [0]:
#  다음 해의 사망률 예측
y_dth = np.log1p(df_pd_sorted["dth_rate_next"])

X_train, X_test, y_train, y_test = train_test_split(X, y_dth, test_size=0.2, random_state=42)
model_dth = RandomForestRegressor(n_estimators=100, random_state=42)
model_dth.fit(X_train, y_train)

y_pred_log = model_dth.predict(X)
df_pd_sorted["dth_rate_pred"] = np.expm1(y_pred_log).round(2).astype(float)

print("사망률 RMSE:", np.sqrt(mean_squared_error(y_test, model_dth.predict(X_test))).round(2))

display(df_pd_sorted)


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run polite-rat-150 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/5370e73c5bf642058fb79d26ac529168
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
사망률 RMSE: 0.17


searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred,dth_cnt_pred,injpsn_cnt_pred,occrrnc_rate_pred,dth_rate_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254,22,4754,49.31,0.85
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325,28,4850,33.98,0.89
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261,28,4725,25.32,0.83
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496,26,5037,21.96,0.79
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571,30,5220,17.38,0.87
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594,32,5211,14.8,0.92
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716,27,5387,13.78,0.76
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611,26,5296,11.5,0.77
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604,21,5236,10.23,0.6
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754,17,5237,9.41,0.74


In [0]:
# 다음 해의 부상률 예측
y_inj = np.log1p(df_pd_sorted["injpsn_rate_next"])

X_train, X_test, y_train, y_test = train_test_split(X, y_inj, test_size=0.2, random_state=42)
model_inj = RandomForestRegressor(n_estimators=100, random_state=42)
model_inj.fit(X_train, y_train)

y_pred_log = model_inj.predict(X)
df_pd_sorted["injpsn_rate_pred"] = np.expm1(y_pred_log).round(2).astype(float)

print("부상률 RMSE:", np.sqrt(mean_squared_error(y_test, model_inj.predict(X_test))).round(2))

display(df_pd_sorted)


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run salty-elk-729 at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437/runs/89a6761d3f0d4b78b19beea4253f9be5
🧪 View experiment at: https://adb-2553999575715580.0.azuredatabricks.net/ml/experiments/1683994105126437
부상률 RMSE: 0.03


searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred,dth_cnt_pred,injpsn_cnt_pred,occrrnc_rate_pred,dth_rate_pred,injpsn_rate_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254,22,4754,49.31,0.85,146.53
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325,28,4850,33.98,0.89,145.68
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261,28,4725,25.32,0.83,144.3
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496,26,5037,21.96,0.79,144.65
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571,30,5220,17.38,0.87,145.63
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594,32,5211,14.8,0.92,145.46
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716,27,5387,13.78,0.76,146.75
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611,26,5296,11.5,0.77,147.04
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604,21,5236,10.23,0.6,145.94
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754,17,5237,9.41,0.74,143.65


In [0]:
display(df_pd_sorted)

searchyear,gugun_nm,occrrnc_cnt,occrrnc_per10kcar_cnt,dth_cnt,dth_per100kpop_cnt,injpsn_cnt,injpsn_per100kpop_cnt,dth_rate,injpsn_rate,occrrnc_cnt_next,dth_cnt_next,injpsn_cnt_next,total_occrrnc_cnt,occrrnc_rate,occrrnc_rate_next,dth_rate_next,injpsn_rate_next,occrrnc_cnt_pred,dth_cnt_pred,injpsn_cnt_pred,occrrnc_rate_pred,dth_rate_pred,injpsn_rate_pred
2005,강남구,3126,128.9,28,5.1,4682,854.7,0.9,149.78,3321,37,4869,3126,100.0,51.51,1.11,146.61,3254,22,4754,49.31,0.85,146.53
2006,강남구,3321,132.3,37,6.6,4869,862.3,1.11,146.61,3311,30,4836,6447,51.51,33.93,0.91,146.06,3325,28,4850,33.98,0.89,145.68
2007,강남구,3311,128.6,30,5.3,4836,849.6,0.91,146.06,3131,25,4469,9758,33.93,24.29,0.8,142.73,3261,28,4725,25.32,0.83,144.3
2008,강남구,3131,120.8,25,4.4,4469,790.1,0.8,142.73,3723,27,5455,12889,24.29,22.41,0.73,146.52,3496,26,5037,21.96,0.79,144.65
2009,강남구,3723,143.1,27,4.7,5455,957.9,0.73,146.52,3551,31,5176,16612,22.41,17.61,0.87,145.76,3571,30,5220,17.38,0.87,145.63
2010,강남구,3551,131.9,31,5.4,5176,896.9,0.87,145.76,3562,36,5184,20163,17.61,15.01,1.01,145.54,3594,32,5211,14.8,0.92,145.46
2011,강남구,3562,136.1,36,6.3,5184,904.7,1.01,145.54,3843,25,5666,23725,15.01,13.94,0.65,147.44,3716,27,5387,13.78,0.76,146.75
2012,강남구,3843,151.4,25,4.4,5666,994.0,0.65,147.44,3610,31,5326,27568,13.94,11.58,0.86,147.53,3611,26,5296,11.5,0.77,147.04
2013,강남구,3610,143.2,31,5.4,5326,935.8,0.86,147.53,3624,17,5269,31178,11.58,10.41,0.47,145.39,3604,21,5236,10.23,0.6,145.94
2014,강남구,3624,140.3,17,2.9,5269,903.1,0.47,145.39,3970,20,5686,34802,10.41,10.24,0.5,143.22,3754,17,5237,9.41,0.74,143.65


In [0]:

df_spark = spark.createDataFrame(df_pd_sorted)

In [0]:

df_spark.write.mode("overwrite").saveAsTable("1dt_team5_databricks_traffic.gold_managed.gold_silver_district_traffic_stats")

In [0]:
df_a = spark.table("1dt_team5_databricks_traffic.gold_managed.gold_district_traffic_stats")
df_b = spark.table("postgres_team5_catalog.silver.silver_gugun_latitude_longitude")

df_joined = df_a.join(df_b, on="gugun_nm", how="inner")

df_joined.write.mode("overwrite").saveAsTable("1dt_team5_databricks_traffic.gold_managed.joined_district_traffic_stats")
