In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# === [0] 테이블 불러오기 ===
monthly_df = spark.table("postgres_team5_catalog.bronze.monthly_acid_stats")
hourly_df = spark.table("postgres_team5_catalog.bronze.hourly_accident_stats")
age_df = spark.table("postgres_team5_catalog.tra.traffic_acid_stats_age")

# === [1] stat_type별로 pivot 후 prefix 붙여서 병합 ===
def pivot_by_stat_type(df, stat_type, group_cols, pivot_col, prefix):
    return (
        df.filter(col("stat_type") == stat_type)
          .groupBy(group_cols)
          .pivot(pivot_col)
          .agg(spark_sum("data"))
          .na.fill(0)
          .toDF(*(group_cols + [f"{prefix}_{col}" for col in df.filter(col("stat_type") == stat_type).select(pivot_col).distinct().rdd.flatMap(lambda x: x).collect()]))
    )

# === [2] 각 테이블에 대해 stat_type별 피벗 생성 ===
# 월별

monthly_death = pivot_by_stat_type(monthly_df, "사망자수 (명)", ["searchyear", "gugun_nm"], "searchmonth", "월_사망")
monthly_injury = pivot_by_stat_type(monthly_df, "부상자수 (명)", ["searchyear", "gugun_nm"], "searchmonth", "월_부상")
monthly_total = pivot_by_stat_type(monthly_df, "발생건수 (건)", ["searchyear", "gugun_nm"], "searchmonth", "월_발생")

# 시간대별
hourly_death = pivot_by_stat_type(hourly_df, "사망자수 (명)", ["searchyear", "gugun_nm"], "hour", "시간_사망")
hourly_injury = pivot_by_stat_type(hourly_df, "부상자수 (명)", ["searchyear", "gugun_nm"], "hour", "시간_부상")
hourly_total = pivot_by_stat_type(hourly_df, "발생건수 (건)", ["searchyear", "gugun_nm"], "hour", "시간_발생")

# 연령대별
age_death = pivot_by_stat_type(age_df, "사망자수", ["searchyear", "gugun_nm"], "ages", "연령_사망")
age_injury = pivot_by_stat_type(age_df, "부상자수", ["searchyear", "gugun_nm"], "ages", "연령_부상")

# === [3] 전체 병합 ===
feature_df = monthly_death \
    .join(monthly_injury, ["searchyear", "gugun_nm"], "inner") \
    .join(monthly_total, ["searchyear", "gugun_nm"], "inner") \
    .join(hourly_death, ["searchyear", "gugun_nm"], "inner") \
    .join(hourly_injury, ["searchyear", "gugun_nm"], "inner") \
    .join(hourly_total, ["searchyear", "gugun_nm"], "inner") \
    .join(age_death, ["searchyear", "gugun_nm"], "inner") \
    .join(age_injury, ["searchyear", "gugun_nm"], "inner") \
    .na.fill(0)

# 결과 미리 보기
display(feature_df)

searchyear,gugun_nm,월_사망_1,월_사망_9,월_사망_4,월_사망_8,월_사망_12,월_사망_6,월_사망_3,월_사망_10,월_사망_11,월_사망_2,월_사망_7,월_사망_5,월_부상_1,월_부상_9,월_부상_4,월_부상_8,월_부상_12,월_부상_6,월_부상_3,월_부상_10,월_부상_11,월_부상_2,월_부상_7,월_부상_5,월_발생_1,월_발생_9,월_발생_4,월_발생_8,월_발생_12,월_발생_6,월_발생_3,월_발생_10,월_발생_11,월_발생_2,월_발생_7,월_발생_5,시간_사망_08시-10시,시간_사망_02시-04시,시간_사망_00시-02시,시간_사망_20시-22시,시간_사망_22시-24시,시간_사망_14시-16시,시간_사망_04시-06시,시간_사망_16시-18시,시간_사망_12시-14시,시간_사망_전체,시간_사망_10시-12시,시간_사망_18시-20시,시간_사망_06시-08시,시간_부상_08시-10시,시간_부상_02시-04시,시간_부상_00시-02시,시간_부상_20시-22시,시간_부상_22시-24시,시간_부상_14시-16시,시간_부상_04시-06시,시간_부상_16시-18시,시간_부상_12시-14시,시간_부상_전체,시간_부상_10시-12시,시간_부상_18시-20시,시간_부상_06시-08시,시간_발생_08시-10시,시간_발생_02시-04시,시간_발생_00시-02시,시간_발생_20시-22시,시간_발생_22시-24시,시간_발생_14시-16시,시간_발생_04시-06시,시간_발생_16시-18시,시간_발생_12시-14시,시간_발생_전체,시간_발생_10시-12시,시간_발생_18시-20시,시간_발생_06시-08시,연령_사망_13~20세,연령_사망_41~50세,연령_사망_21~30세,연령_사망_12세이하,연령_사망_불명,연령_사망_71세이상,연령_사망_51~60세,연령_사망_15~20세,연령_사망_65~70세,연령_사망_31~40세,연령_사망_14세 이하,연령_사망_전체,연령_사망_61~64세,연령_부상_13~20세,연령_부상_41~50세,연령_부상_21~30세,연령_부상_12세이하,연령_부상_71세이상,연령_부상_불명,연령_부상_51~60세,연령_부상_15~20세,연령_부상_65~70세,연령_부상_31~40세,연령_부상_14세 이하,연령_부상_전체,연령_부상_61~64세
2017,용산구,0,1,2,1,0,0,0,1,4,2,0,1,169,216,157,200,122,135,153,161,135,157,155,189,109,150,102,120,89,89,111,117,103,101,112,134,2,2,3,0,0,1,0,1,0,0,1,2,12,176,120,100,117,167,163,142,216,171,216,139,222,1949,105,67,63,76,113,121,105,154,134,148,108,143,1337,0,0,0,0,6,2,0,2,0,0,2,0,12,0,0,24,76,464,450,343,295,92,86,84,35,1949
2006,동대문구,2,2,3,1,1,1,2,3,0,3,5,2,169,208,202,203,171,191,202,204,219,226,212,208,119,149,140,137,124,139,149,156,160,160,164,151,4,2,3,1,3,2,0,3,2,1,1,3,25,263,189,153,106,202,178,156,210,191,226,262,279,2415,171,112,96,85,155,137,132,165,152,178,176,189,1748,0,0,0,0,3,5,4,4,0,1,8,0,25,0,0,90,169,531,516,554,343,77,77,58,0,2415
2014,중구,0,1,0,1,2,0,0,2,0,0,0,1,121,171,156,169,100,163,123,147,146,162,167,160,93,124,109,119,74,126,95,112,107,119,115,112,2,2,1,0,0,0,0,2,0,0,0,0,7,151,85,90,70,179,179,186,174,168,189,151,163,1785,99,58,58,56,129,138,132,140,127,143,113,112,1305,0,0,0,0,0,2,1,1,2,0,1,0,7,0,0,35,69,324,424,343,326,98,106,60,0,1785
2007,성북구,1,0,4,0,0,1,0,1,2,0,0,0,143,152,192,158,117,129,140,196,176,173,157,210,102,112,141,114,79,97,114,141,123,115,113,140,2,1,0,1,0,1,1,1,1,0,0,1,9,187,146,125,105,154,145,130,176,162,179,191,243,1943,120,100,81,78,109,104,108,127,127,139,137,161,1391,0,0,1,2,1,0,1,1,1,0,2,0,9,0,0,107,148,415,411,371,295,74,68,54,0,1943
2006,전체,24,42,53,37,19,32,38,40,32,45,43,47,3865,4964,4978,4527,3815,4413,4780,5154,4759,5018,5163,5086,2513,3273,3467,3068,2560,3006,3263,3498,3262,3359,3505,3524,40,48,68,37,31,25,30,40,28,38,26,41,452,6152,4397,3538,3295,4224,3924,4231,4608,4837,5473,5289,6554,56522,3743,2687,2227,2229,3101,2739,2999,3285,3550,3906,3603,4229,38298,0,0,17,35,67,56,73,66,31,40,66,1,452,0,0,2961,3793,13214,12678,12549,7492,1578,1331,926,0,56522
2010,광진구,1,3,3,2,1,2,0,2,1,1,0,1,140,230,188,169,138,170,201,198,161,203,203,211,101,150,138,119,90,118,148,139,114,136,151,136,1,2,1,2,0,2,1,2,2,0,2,2,17,238,159,114,133,208,153,156,179,182,222,189,279,2212,138,106,78,81,152,110,113,138,138,161,137,188,1540,0,0,2,1,3,4,1,2,0,2,2,0,17,0,0,104,159,522,439,450,351,68,62,57,0,2212
2020,성동구,0,0,1,1,0,2,0,0,0,2,2,1,123,123,131,95,102,103,133,137,140,141,125,128,86,101,102,83,76,77,89,96,104,108,99,98,0,1,1,2,0,0,1,2,0,1,1,0,9,112,34,45,91,129,116,137,165,131,223,163,135,1481,71,23,30,71,108,96,105,129,107,161,119,99,1119,0,0,0,0,0,2,0,1,0,1,5,0,9,0,0,27,63,343,303,265,230,91,78,81,0,1481
2006,은평구,1,6,1,3,0,1,0,0,0,3,1,2,167,161,171,151,123,165,171,189,130,182,181,173,111,110,125,109,82,118,118,138,104,123,128,125,2,0,3,3,2,2,1,2,1,1,0,1,18,209,127,108,108,141,107,121,231,185,201,183,243,1964,120,82,70,82,113,76,95,143,142,163,142,163,1391,0,0,0,0,4,2,3,2,1,4,2,0,18,0,0,158,157,379,415,423,280,56,56,40,0,1964
2011,전체,36,25,40,42,32,20,44,36,37,36,50,37,4429,5244,5150,5000,3734,4513,4905,5036,4831,5067,4823,4893,3038,3734,3701,3514,2573,3126,3450,3603,3390,3529,3360,3433,60,31,70,33,26,41,21,23,24,26,40,40,435,5662,3907,3208,3300,4708,3963,4157,5077,5129,5876,5974,6664,57625,3618,2383,2097,2295,3471,2953,3129,3738,3846,4320,4188,4413,40451,0,0,4,33,72,40,67,69,27,38,85,0,435,0,0,2549,4767,11620,11500,11268,9815,2315,2133,1658,0,57625
2008,은평구,1,1,5,2,0,1,0,0,2,1,1,2,133,187,187,182,122,122,165,233,205,210,215,199,105,140,143,125,88,90,120,168,159,153,155,150,0,3,0,0,0,0,2,0,3,1,4,3,16,186,117,93,107,172,137,177,231,257,231,201,251,2160,120,80,74,88,122,99,144,156,194,175,169,175,1596,0,0,2,1,2,0,4,2,1,2,2,0,16,0,0,152,236,375,386,461,310,78,86,76,0,2160


['searchyear',
 'gugun_nm',
 '월_사망_1',
 '월_사망_9',
 '월_사망_4',
 '월_사망_8',
 '월_사망_12',
 '월_사망_6',
 '월_사망_3',
 '월_사망_10',
 '월_사망_11',
 '월_사망_2',
 '월_사망_7',
 '월_사망_5',
 '월_부상_1',
 '월_부상_9',
 '월_부상_4',
 '월_부상_8',
 '월_부상_12',
 '월_부상_6',
 '월_부상_3',
 '월_부상_10',
 '월_부상_11',
 '월_부상_2',
 '월_부상_7',
 '월_부상_5',
 '월_발생_1',
 '월_발생_9',
 '월_발생_4',
 '월_발생_8',
 '월_발생_12',
 '월_발생_6',
 '월_발생_3',
 '월_발생_10',
 '월_발생_11',
 '월_발생_2',
 '월_발생_7',
 '월_발생_5',
 '시간_사망_08시-10시',
 '시간_사망_02시-04시',
 '시간_사망_00시-02시',
 '시간_사망_20시-22시',
 '시간_사망_22시-24시',
 '시간_사망_14시-16시',
 '시간_사망_04시-06시',
 '시간_사망_16시-18시',
 '시간_사망_12시-14시',
 '시간_사망_전체',
 '시간_사망_10시-12시',
 '시간_사망_18시-20시',
 '시간_사망_06시-08시',
 '시간_부상_08시-10시',
 '시간_부상_02시-04시',
 '시간_부상_00시-02시',
 '시간_부상_20시-22시',
 '시간_부상_22시-24시',
 '시간_부상_14시-16시',
 '시간_부상_04시-06시',
 '시간_부상_16시-18시',
 '시간_부상_12시-14시',
 '시간_부상_전체',
 '시간_부상_10시-12시',
 '시간_부상_18시-20시',
 '시간_부상_06시-08시',
 '시간_발생_08시-10시',
 '시간_발생_02시-04시',
 '시간_발생_00시-02시',
 '시간_발생_20시-22시',
 '시간_발생_22시-24시',
 '시간_발생_14시-16시',
 

In [0]:
from pyspark.sql.functions import col, sum as spark_sum, when
from functools import reduce

# 월별 발생 총합
monthly_total_cols = [c for c in feature_df.columns if c.startswith("월_발생_")]
feature_df = feature_df.withColumn("total_monthly_accidents",
    reduce(lambda a, b: a + b, [col(c) for c in monthly_total_cols])
)

# 사망률 = 총 사망자수 / 총 발생건수
monthly_death_cols = [c for c in feature_df.columns if c.startswith("월_사망_")]
feature_df = feature_df.withColumn("total_monthly_deaths",
    reduce(lambda a, b: a + b, [col(c) for c in monthly_death_cols])
)

feature_df = feature_df.withColumn("monthly_death_ratio",
    when(col("total_monthly_accidents") == 0, 0)
    .otherwise(col("total_monthly_deaths") / col("total_monthly_accidents"))
)

# 연령대 취약층 비율 (예: 고령자 65세 이상 비중)
elderly_cols = [c for c in feature_df.columns if c.startswith("연령_부상_") and ("65~70세" in c or "71세이상" in c)]
all_injury_cols = [c for c in feature_df.columns if c.startswith("연령_부상_") and "전체" not in c]

feature_df = feature_df.withColumn("elderly_injuries", reduce(lambda a, b: a + b, [col(c) for c in elderly_cols]))
feature_df = feature_df.withColumn("total_injuries_by_age", reduce(lambda a, b: a + b, [col(c) for c in all_injury_cols]))
feature_df = feature_df.withColumn("elderly_ratio",
    when(col("total_injuries_by_age") == 0, 0).otherwise(col("elderly_injuries") / col("total_injuries_by_age"))
)

display(feature_df)
feature_df.columns

searchyear,gugun_nm,월_사망_1,월_사망_9,월_사망_4,월_사망_8,월_사망_12,월_사망_6,월_사망_3,월_사망_10,월_사망_11,월_사망_2,월_사망_7,월_사망_5,월_부상_1,월_부상_9,월_부상_4,월_부상_8,월_부상_12,월_부상_6,월_부상_3,월_부상_10,월_부상_11,월_부상_2,월_부상_7,월_부상_5,월_발생_1,월_발생_9,월_발생_4,월_발생_8,월_발생_12,월_발생_6,월_발생_3,월_발생_10,월_발생_11,월_발생_2,월_발생_7,월_발생_5,시간_사망_08시-10시,시간_사망_02시-04시,시간_사망_00시-02시,시간_사망_20시-22시,시간_사망_22시-24시,시간_사망_14시-16시,시간_사망_04시-06시,시간_사망_16시-18시,시간_사망_12시-14시,시간_사망_전체,시간_사망_10시-12시,시간_사망_18시-20시,시간_사망_06시-08시,시간_부상_08시-10시,시간_부상_02시-04시,시간_부상_00시-02시,시간_부상_20시-22시,시간_부상_22시-24시,시간_부상_14시-16시,시간_부상_04시-06시,시간_부상_16시-18시,시간_부상_12시-14시,시간_부상_전체,시간_부상_10시-12시,시간_부상_18시-20시,시간_부상_06시-08시,시간_발생_08시-10시,시간_발생_02시-04시,시간_발생_00시-02시,시간_발생_20시-22시,시간_발생_22시-24시,시간_발생_14시-16시,시간_발생_04시-06시,시간_발생_16시-18시,시간_발생_12시-14시,시간_발생_전체,시간_발생_10시-12시,시간_발생_18시-20시,시간_발생_06시-08시,연령_사망_13~20세,연령_사망_41~50세,연령_사망_21~30세,연령_사망_12세이하,연령_사망_불명,연령_사망_71세이상,연령_사망_51~60세,연령_사망_15~20세,연령_사망_65~70세,연령_사망_31~40세,연령_사망_14세 이하,연령_사망_전체,연령_사망_61~64세,연령_부상_13~20세,연령_부상_41~50세,연령_부상_21~30세,연령_부상_12세이하,연령_부상_71세이상,연령_부상_불명,연령_부상_51~60세,연령_부상_15~20세,연령_부상_65~70세,연령_부상_31~40세,연령_부상_14세 이하,연령_부상_전체,연령_부상_61~64세,total_monthly_accidents,total_monthly_deaths,monthly_death_ratio,elderly_injuries,total_injuries_by_age,elderly_ratio
2017,용산구,0,1,2,1,0,0,0,1,4,2,0,1,169,216,157,200,122,135,153,161,135,157,155,189,109,150,102,120,89,89,111,117,103,101,112,134,2,2,3,0,0,1,0,1,0,0,1,2,12,176,120,100,117,167,163,142,216,171,216,139,222,1949,105,67,63,76,113,121,105,154,134,148,108,143,1337,0,0,0,0,6,2,0,2,0,0,2,0,12,0,0,24,76,464,450,343,295,92,86,84,35,1949,1337,12,0.0089753178758414,556,3863,0.1439295884027957
2006,동대문구,2,2,3,1,1,1,2,3,0,3,5,2,169,208,202,203,171,191,202,204,219,226,212,208,119,149,140,137,124,139,149,156,160,160,164,151,4,2,3,1,3,2,0,3,2,1,1,3,25,263,189,153,106,202,178,156,210,191,226,262,279,2415,171,112,96,85,155,137,132,165,152,178,176,189,1748,0,0,0,0,3,5,4,4,0,1,8,0,25,0,0,90,169,531,516,554,343,77,77,58,0,2415,1748,25,0.0143020594965675,608,4830,0.125879917184265
2014,중구,0,1,0,1,2,0,0,2,0,0,0,1,121,171,156,169,100,163,123,147,146,162,167,160,93,124,109,119,74,126,95,112,107,119,115,112,2,2,1,0,0,0,0,2,0,0,0,0,7,151,85,90,70,179,179,186,174,168,189,151,163,1785,99,58,58,56,129,138,132,140,127,143,113,112,1305,0,0,0,0,0,2,1,1,2,0,1,0,7,0,0,35,69,324,424,343,326,98,106,60,0,1785,1305,7,0.0053639846743295,422,3570,0.1182072829131652
2007,성북구,1,0,4,0,0,1,0,1,2,0,0,0,143,152,192,158,117,129,140,196,176,173,157,210,102,112,141,114,79,97,114,141,123,115,113,140,2,1,0,1,0,1,1,1,1,0,0,1,9,187,146,125,105,154,145,130,176,162,179,191,243,1943,120,100,81,78,109,104,108,127,127,139,137,161,1391,0,0,1,2,1,0,1,1,1,0,2,0,9,0,0,107,148,415,411,371,295,74,68,54,0,1943,1391,9,0.00647016534867,489,3886,0.1258363355635615
2006,전체,24,42,53,37,19,32,38,40,32,45,43,47,3865,4964,4978,4527,3815,4413,4780,5154,4759,5018,5163,5086,2513,3273,3467,3068,2560,3006,3263,3498,3262,3359,3505,3524,40,48,68,37,31,25,30,40,28,38,26,41,452,6152,4397,3538,3295,4224,3924,4231,4608,4837,5473,5289,6554,56522,3743,2687,2227,2229,3101,2739,2999,3285,3550,3906,3603,4229,38298,0,0,17,35,67,56,73,66,31,40,66,1,452,0,0,2961,3793,13214,12678,12549,7492,1578,1331,926,0,56522,38298,452,0.0118021828816126,14792,113044,0.1308517037613672
2010,광진구,1,3,3,2,1,2,0,2,1,1,0,1,140,230,188,169,138,170,201,198,161,203,203,211,101,150,138,119,90,118,148,139,114,136,151,136,1,2,1,2,0,2,1,2,2,0,2,2,17,238,159,114,133,208,153,156,179,182,222,189,279,2212,138,106,78,81,152,110,113,138,138,161,137,188,1540,0,0,2,1,3,4,1,2,0,2,2,0,17,0,0,104,159,522,439,450,351,68,62,57,0,2212,1540,17,0.011038961038961,590,4424,0.1333634719710669
2020,성동구,0,0,1,1,0,2,0,0,0,2,2,1,123,123,131,95,102,103,133,137,140,141,125,128,86,101,102,83,76,77,89,96,104,108,99,98,0,1,1,2,0,0,1,2,0,1,1,0,9,112,34,45,91,129,116,137,165,131,223,163,135,1481,71,23,30,71,108,96,105,129,107,161,119,99,1119,0,0,0,0,0,2,0,1,0,1,5,0,9,0,0,27,63,343,303,265,230,91,78,81,0,1481,1119,9,0.0080428954423592,434,2962,0.1465226198514517
2006,은평구,1,6,1,3,0,1,0,0,0,3,1,2,167,161,171,151,123,165,171,189,130,182,181,173,111,110,125,109,82,118,118,138,104,123,128,125,2,0,3,3,2,2,1,2,1,1,0,1,18,209,127,108,108,141,107,121,231,185,201,183,243,1964,120,82,70,82,113,76,95,143,142,163,142,163,1391,0,0,0,0,4,2,3,2,1,4,2,0,18,0,0,158,157,379,415,423,280,56,56,40,0,1964,1391,18,0.01294033069734,435,3928,0.1107433808553971
2011,전체,36,25,40,42,32,20,44,36,37,36,50,37,4429,5244,5150,5000,3734,4513,4905,5036,4831,5067,4823,4893,3038,3734,3701,3514,2573,3126,3450,3603,3390,3529,3360,3433,60,31,70,33,26,41,21,23,24,26,40,40,435,5662,3907,3208,3300,4708,3963,4157,5077,5129,5876,5974,6664,57625,3618,2383,2097,2295,3471,2953,3129,3738,3846,4320,4188,4413,40451,0,0,4,33,72,40,67,69,27,38,85,0,435,0,0,2549,4767,11620,11500,11268,9815,2315,2133,1658,0,57625,40451,435,0.0107537514523744,13935,115250,0.1209110629067245
2008,은평구,1,1,5,2,0,1,0,0,2,1,1,2,133,187,187,182,122,122,165,233,205,210,215,199,105,140,143,125,88,90,120,168,159,153,155,150,0,3,0,0,0,0,2,0,3,1,4,3,16,186,117,93,107,172,137,177,231,257,231,201,251,2160,120,80,74,88,122,99,144,156,194,175,169,175,1596,0,0,2,1,2,0,4,2,1,2,2,0,16,0,0,152,236,375,386,461,310,78,86,76,0,2160,1596,16,0.0100250626566416,453,4320,0.1048611111111111


['searchyear',
 'gugun_nm',
 '월_사망_1',
 '월_사망_9',
 '월_사망_4',
 '월_사망_8',
 '월_사망_12',
 '월_사망_6',
 '월_사망_3',
 '월_사망_10',
 '월_사망_11',
 '월_사망_2',
 '월_사망_7',
 '월_사망_5',
 '월_부상_1',
 '월_부상_9',
 '월_부상_4',
 '월_부상_8',
 '월_부상_12',
 '월_부상_6',
 '월_부상_3',
 '월_부상_10',
 '월_부상_11',
 '월_부상_2',
 '월_부상_7',
 '월_부상_5',
 '월_발생_1',
 '월_발생_9',
 '월_발생_4',
 '월_발생_8',
 '월_발생_12',
 '월_발생_6',
 '월_발생_3',
 '월_발생_10',
 '월_발생_11',
 '월_발생_2',
 '월_발생_7',
 '월_발생_5',
 '시간_사망_08시-10시',
 '시간_사망_02시-04시',
 '시간_사망_00시-02시',
 '시간_사망_20시-22시',
 '시간_사망_22시-24시',
 '시간_사망_14시-16시',
 '시간_사망_04시-06시',
 '시간_사망_16시-18시',
 '시간_사망_12시-14시',
 '시간_사망_전체',
 '시간_사망_10시-12시',
 '시간_사망_18시-20시',
 '시간_사망_06시-08시',
 '시간_부상_08시-10시',
 '시간_부상_02시-04시',
 '시간_부상_00시-02시',
 '시간_부상_20시-22시',
 '시간_부상_22시-24시',
 '시간_부상_14시-16시',
 '시간_부상_04시-06시',
 '시간_부상_16시-18시',
 '시간_부상_12시-14시',
 '시간_부상_전체',
 '시간_부상_10시-12시',
 '시간_부상_18시-20시',
 '시간_부상_06시-08시',
 '시간_발생_08시-10시',
 '시간_발생_02시-04시',
 '시간_발생_00시-02시',
 '시간_발생_20시-22시',
 '시간_발생_22시-24시',
 '시간_발생_14시-16시',
 

In [0]:
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# 1. 사고 심각도 등급 파생 컬럼 생성
feature_df = feature_df.withColumn("severity_level",
    when(col("monthly_death_ratio") < 0.01, "Low")
    .when(col("monthly_death_ratio") < 0.03, "Medium")
    .otherwise("High")
)

# 2. feature 컬럼 추출
exclude_cols = {"searchyear", "gugun_nm", "severity_level"}
feature_cols = [c for c in feature_df.columns if c not in exclude_cols]

# 3. feature vector 생성
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# 4. target label 인코딩 # 'severity_level' 컬럼이 "Low", "Medium", "High"일 경우 0, 1, 2로 변환
label_indexer = StringIndexer(inputCol="severity_level", outputCol="label")

# 5. 분류 모델 정의 (랜덤 포레스트)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

# 6. 파이프라인 구성
pipeline = Pipeline(stages=[assembler, label_indexer, rf])

# 7. 모델 훈련
model = pipeline.fit(feature_df)

# 8. 예측 결과 생성
predictions = model.transform(feature_df)

# 9. 예측 결과 확인 -> 사고 심각도 등급 파생 컬럼
predictions.select("searchyear", "gugun_nm", "severity_level", "prediction").show(20, truncate=False)

Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

+----------+--------+--------------+----------+
|searchyear|gugun_nm|severity_level|prediction|
+----------+--------+--------------+----------+
|2017      |용산구  |Low           |0.0       |
|2006      |동대문구|Medium        |1.0       |
|2014      |중구    |Low           |0.0       |
|2007      |성북구  |Low           |0.0       |
|2006      |전체    |Medium        |1.0       |
|2010      |광진구  |Medium        |1.0       |
|2020      |성동구  |Low           |0.0       |
|2006      |은평구  |Medium        |1.0       |
|2011      |전체    |Medium        |1.0       |
|2008      |은평구  |Medium        |1.0       |
|2020      |관악구  |Low           |0.0       |
|2019      |노원구  |Low           |0.0       |
|2019      |서대문구|Medium        |1.0       |
|2011      |성동구  |Medium        |1.0       |
|2015      |중랑구  |Low           |0.0       |
|2008      |용산구  |Medium        |1.0       |
|2009      |서대문구|Medium        |1.0       |
|2009      |서초구  |Medium        |1.0       |
|2014      |동작구  |Low           |0.0       |
|

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

label_col = "total_monthly_deaths"
feature_cols = [col for col in feature_df.columns 
                if col not in ["searchyear", "gugun_nm", "severity_level", "total_monthly_deaths"]]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")


rf = RandomForestRegressor(labelCol=label_col, featuresCol="features", numTrees=100)
pipeline = Pipeline(stages=[assembler, rf])

model = pipeline.fit(feature_df)

predictions = model.transform(feature_df)
predictions.select("gugun_nm", "searchyear", label_col, "prediction").show(10, truncate=False)

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE:", rmse)


Downloading artifacts:   0%|          | 0/25 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

+--------+----------+--------------------+------------------+
|gugun_nm|searchyear|total_monthly_deaths|prediction        |
+--------+----------+--------------------+------------------+
|용산구  |2017      |12                  |12.39235095570002 |
|동대문구|2006      |25                  |24.90372767828693 |
|중구    |2014      |7                   |7.648144811426026 |
|성북구  |2007      |9                   |9.540060040823885 |
|전체    |2006      |452                 |441.944           |
|광진구  |2010      |17                  |17.24053072570582 |
|성동구  |2020      |9                   |8.901425041373315 |
|은평구  |2006      |18                  |20.16161762949308 |
|전체    |2011      |435                 |426.119           |
|은평구  |2008      |16                  |15.550804096533145|
+--------+----------+--------------------+------------------+
only showing top 10 rows
RMSE: 1.9842624815409942


In [0]:
# 아 귀찮다~ 해줘~~~ 응애!!