# 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# 1. 데이터 로드

In [2]:
# Read the Excel workbook into a DataFrame, then preview its first rows.
data_path = '화재 위험도 계산 데이터/최종본.xlsx'
df = pd.read_excel(data_path)
df.head()

Unnamed: 0,year,month,day,ws,ta,hm,rn,result
0,2019,1,1,3.0,4.1,36,0.0,0
1,2019,1,2,3.9,4.2,30,0.0,1
2,2019,1,3,5.1,5.3,20,0.0,1
3,2019,1,4,5.8,6.3,20,0.0,0
4,2019,1,5,4.4,7.0,40,0.0,1


In [3]:
# Share of each `result` class, expressed as a percentage of all rows.
result_counts = df['result'].value_counts(normalize=True).mul(100)
# One print call with a newline separator emits the heading and the
# series on separate lines.
print("result 0과 1의 비율:", result_counts, sep="\n")

result 0과 1의 비율:
result
0    72.109589
1    27.890411
Name: proportion, dtype: float64


# 2. 특징과 레이블 선택

In [5]:
# Features: the calendar fields plus the ws / ta / hm / rn measurement columns.
feature_columns = ['year', 'month', 'day', 'ws', 'ta', 'hm', 'rn']
X = df[feature_columns]
# Label: the binary `result` column.
y = df['result']

# 3. 데이터 분할
(훈련+검증/테스트 세트, 비율: 90:10)   
(훈련/검증 세트, 비율: 80:20) 

In [14]:
# Hold out 10% of the rows as the test set; the other 90% forms the
# train+validation pool. Stratifying on y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [15]:
# Carve a validation set (20%) out of the train+validation pool, again
# stratified on the labels and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=42,
)

# 4. 모델 생성

In [18]:
# Random forest with 100 trees; the fixed seed makes the fit repeatable.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
# Train on the training split only; validation and test rows stay unseen.
model.fit(X_train, y_train)

# 5. 예측 및 평가

In [21]:
# Hard class predictions and class-1 probabilities for the validation split.
y_val_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

# Compute the validation metrics up front, then report them.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_proba)  # AUC is scored on probabilities, not labels
val_report = classification_report(y_val, y_val_pred)

print("검증 세트 정확도:", val_accuracy)
print("검증 세트 ROC AUC 점수:", val_roc_auc)
print("검증 세트 보고서:\n", val_report)

검증 세트 정확도: 0.7112462006079028
검증 세트 ROC AUC 점수: 0.5944551458447991
검증 세트 보고서:
               precision    recall  f1-score   support

           0       0.74      0.93      0.82       237
           1       0.45      0.15      0.23        92

    accuracy                           0.71       329
   macro avg       0.59      0.54      0.53       329
weighted avg       0.66      0.71      0.66       329



In [26]:
# Predicted labels and class-1 probabilities for the held-out test split.
y_test_pred = model.predict(X_test)
y_test_pred_proba = model.predict_proba(X_test)[:, 1]

# Combine ground truth, predictions and the input measurements into one table.
# The year/month/day columns are merged into a single 'YYYY-MM-DD' string.
test_dates = pd.to_datetime(X_test[['year', 'month', 'day']]).dt.strftime('%Y-%m-%d')

# Source column -> display label, in the order the columns should appear.
measurement_labels = {
    'ws': '풍속 (ws)',
    'ta': '기온 (ta)',
    'hm': '습도 (hm)',
    'rn': '일일강수량 (rn)',
}

test_results = pd.DataFrame({
    '정답': y_test,  # Series: supplies the row index of the table
    '예측': y_test_pred,
    '예측 확률': y_test_pred_proba,
    '날짜': test_dates,
    **{label: X_test[column].values for column, label in measurement_labels.items()},
})


테스트 세트 결과:


Unnamed: 0,정답,예측,예측 확률,날짜,풍속 (ws),기온 (ta),습도 (hm),일일강수량 (rn)
813,1,0,0.19,2021-03-25,3.3,19.3,33,0.0
88,0,0,0.26,2019-03-30,4.9,15.8,52,1.1
1370,0,0,0.15,2022-10-03,8.8,24.2,91,2.9
540,0,0,0.46,2020-06-24,1.4,25.6,66,0.0
965,0,0,0.41,2021-08-24,12.0,25.9,96,35.8
1198,0,0,0.36,2022-04-14,5.3,12.1,78,0.6
1153,0,0,0.24,2022-02-28,4.5,14.8,21,0.0
815,1,0,0.18,2021-03-27,1.5,16.7,78,0.0
126,0,0,0.39,2019-05-07,7.1,18.9,39,0.0
161,0,0,0.18,2019-06-11,3.0,19.4,70,0.0


In [29]:
print("\n테스트 세트 결과:")
# Show up to 15 randomly chosen test rows. The draw is seeded like the
# splits and the model (random_state=42) so the displayed rows are the
# same on every run, and the size is capped at the table length so a
# small test set does not make sample() raise.
test_results.sample(n=min(15, len(test_results)), random_state=42)


테스트 세트 결과:


Unnamed: 0,정답,예측,예측 확률,날짜,풍속 (ws),기온 (ta),습도 (hm),일일강수량 (rn)
168,1,0,0.11,2019-06-18,2.6,24.2,69,0.0
752,1,0,0.37,2021-01-22,1.3,11.5,98,2.9
1485,0,1,0.53,2023-01-26,3.4,3.5,47,0.0
27,0,0,0.26,2019-01-28,5.4,9.5,36,0.0
1529,0,0,0.12,2023-03-11,2.9,19.8,56,0.0
1192,0,0,0.45,2022-04-08,4.0,16.6,57,0.0
801,0,0,0.34,2021-03-13,5.1,14.0,53,0.0
1111,0,0,0.27,2022-01-17,4.7,2.5,32,0.0
1448,0,1,0.78,2022-12-20,1.7,6.2,38,0.0
1515,0,0,0.29,2023-02-25,3.4,9.1,29,0.0
