In [8]:
import numpy as np
import pandas as pd
import re

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [9]:
# 데이터 부르기
dfa = pd.read_csv('.\dataset\침수데이터.csv')
dfb = pd.read_csv('.\dataset\호우데이터.csv')

# 중복 제거
dfa = dfa.drop_duplicates()
dfb = dfb.drop_duplicates()
dfb = dfb.drop(labels = "습도(%)", axis = 1)

dfa['침수'] = 1
dfb['침수'] = 0

def replace_rainfall_with_mean(dataframe, column):
    mean_value = dataframe[column][dataframe[column] != 0.0].mean()
    dataframe[column] = dataframe[column].replace(0.0, mean_value)
    return dataframe

# 강수량(mm) 특성의 0.0 값을 평균값으로 대체
column_to_replace = '강수량(mm)'
dfa = replace_rainfall_with_mean(dfa, column_to_replace)

df = pd.concat([dfa, dfb])
df = df.drop(labels = '일시', axis = 1)

df['기온(°C)'] = df['기온(°C)'].fillna(df['기온(°C)'].mean())
df['풍향(deg)'] = df['풍향(deg)'].fillna(df['풍향(deg)'].mean())
df['풍속(m/s)'] = df['풍속(m/s)'].fillna(df['풍속(m/s)'].mean())

# # 정규화할 강수량 데이터가 담긴 열을 선택
# rainfall_data = df['강수량(mm)']

# # Min-Max 정규화 객체 생성
# min_max_scaler = MinMaxScaler(feature_range=(-1, 1))

# # 강수량 데이터를 Min-Max 정규화
# normalized_data = min_max_scaler.fit_transform(rainfall_data.values.reshape(-1, 1))

# # 표준화 객체 생성
# standard_scaler = StandardScaler()

# # Min-Max 정규화된 데이터를 표준화
# standardized_data = standard_scaler.fit_transform(normalized_data)

# # 정규화된 데이터를 새로운 열로 추가
# df['강수량_정규화'] = normalized_data.flatten()
# df['강수량_표준화'] = standardized_data.flatten()

X = df.drop(labels = ['침수'], axis = 1)
y = df['침수']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [10]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg_y_pred = logreg.predict(X_test)
logreg_accuracy = logreg.score(X_train, y_train)
logreg_accuracy

0.8349378881987578

In [11]:
base_models = [
    ('svc', SVC()),
    ('gnb', GaussianNB()),
    ('mlp', MLPClassifier())
]

stacking_model = StackingClassifier(estimators=base_models)

# StackingClassifier 훈련
stacking_model.fit(X_train, y_train)

# StackingClassifier 예측
predictions = stacking_model.predict(X_test)
stacking_model.score(X_test, y_test)

0.8658593386120168

In [12]:
from sklearn.metrics import roc_auc_score

class_index = 1
y_pred_proba = stacking_model.predict_proba(X_val)[:, class_index]
print(f'Test AUC for class "{stacking_model.classes_[class_index]}":')
print(roc_auc_score(y_val, y_pred_proba)) # 범위는 0-1, 수치는 높을 수록 좋습니다

Test AUC for class "1":
0.9465864813030989


In [13]:
from sklearn.metrics import classification_report

y_test_pred = stacking_model.predict(X_val)
print(classification_report(y_val, y_test_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86      1065
           1       0.84      0.89      0.87      1082

    accuracy                           0.86      2147
   macro avg       0.86      0.86      0.86      2147
weighted avg       0.86      0.86      0.86      2147



In [14]:
from joblib import dump, load

# 훈련된 모델 저장
dump(stacking_model, 'stacking_model.joblib')

# 저장된 모델 로드
loaded_stacking = load('stacking_model.joblib')

data = {
    '기온(°C)': [22.6],
    '풍향(deg)': [43.0],
    '풍속(m/s)': [0.8],
    '강수량(mm)': [0.0]
}

new_data = pd.DataFrame(data)

# # 정규화할 강수량 데이터가 담긴 열을 선택
# rainfall_data = new_data['강수량(mm)']

# # Min-Max 정규화 객체 생성
# min_max_scaler = MinMaxScaler(feature_range=(-1, 1))

# # 강수량 데이터를 Min-Max 정규화
# normalized_data = min_max_scaler.fit_transform(rainfall_data.values.reshape(-1, 1))

# # 표준화 객체 생성
# standard_scaler = StandardScaler()

# # Min-Max 정규화된 데이터를 표준화
# standardized_data = standard_scaler.fit_transform(normalized_data)

# # 정규화된 데이터를 새로운 열로 추가
# new_data['강수량_정규화'] = normalized_data.flatten()
# new_data['강수량_표준화'] = standardized_data.flatten()

# new_data = new_data.drop(labels = '강수량(mm)', axis= 1)
predictions = loaded_stacking.predict(new_data)

probabilities = stacking_model.predict_proba(new_data)
print(predictions)
print(probabilities)

[0]
[[0.96945927 0.03054073]]
