In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

---
# Correlation

In [2]:
# Read the accident/weather table with the ms949 codec and discard the
# '발생지시도' column in the same expression.
data = pd.read_csv(
    "./concat_seoul_weather_ver2.csv", encoding='ms949'
).drop(columns='발생지시도')

# Correlation table consulted by search_corr(); read with the same codec.
corr = pd.read_csv("./weather_corr.csv", encoding='ms949')


def search_corr(feature, n, corr_df=None):
    """Return the names of the ``n`` columns most correlated with ``feature``.

    The correlation table is read positionally: the value in row ``i`` of
    column ``feature`` is taken to describe ``columns[i]``.

    Parameters
    ----------
    feature : str
        Column of the correlation table to rank against.
    n : int
        Maximum number of column names to return; values <= 0 give ``[]``.
    corr_df : pandas.DataFrame, optional
        Correlation table to use. Defaults to the notebook-level ``corr``.

    Returns
    -------
    list
        Column names ordered from highest to lowest correlation value.
        ``feature`` itself and rows whose value is NaN are left out, so
        fewer than ``n`` names may be returned.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of the table.
    """
    table = corr if corr_df is None else corr_df
    values = table[feature].to_numpy(dtype=float)

    # Exclude the feature's own row explicitly rather than assuming it is the
    # unique maximum of the column.
    own_rows = set(np.flatnonzero(table.columns == feature).tolist())

    # A stable sort keeps tied values in row order, so each row position is
    # used exactly once. Looking values up with list.index() would return the
    # first matching row for every tie and repeat the same column name.
    order = np.argsort(-values, kind="stable")
    ranked = [
        int(pos) for pos in order
        if int(pos) not in own_rows and not np.isnan(values[pos])
    ]

    return [table.columns[pos] for pos in ranked[:max(n, 0)]]

In [3]:
# Remove the rows labelled 200 and 1297 (the reason these two rows are
# excluded is not recorded in this notebook -- TODO document it).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the labels have already been dropped.
data = data.drop(index=[200, 1297], errors='ignore')

In [4]:
# Ten columns ranked highest against '사상자수' in the correlation table.
search_corr(feature='사상자수', n=10)

['경상자수',
 '중상자수',
 '부상신고자수',
 '사망자수',
 '사고유형_대분류',
 '당사자종별_2당_대분류',
 '발생분',
 '평균기온(℃)',
 '최대기온(℃)',
 '최저기온(℃)']

In [5]:
# Predictor columns and the target column used by every model below.
# NOTE(review): '사상자수' looks like a total-casualty count that may already
# include '사망자수' -- possible target leakage; confirm before trusting scores.
X = data[['사상자수','중상자수','경상자수', '부상신고자수', '사고유형_대분류']]
y = data['사망자수']

# Hold-out split: the Logistic Regression, Decision Tree, LDA and KNN cells
# further down read X_train / X_test / y_train / y_test, so the split must
# actually run or those cells fail with NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Support Vector Classification (SVC)

In [6]:
from sklearn.svm import SVC

# Support-vector classifier built with no arguments, i.e. whatever defaults
# the installed scikit-learn version provides.
svc = SVC()

In [7]:
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict

# Ten folds; no shuffle or seed argument is given to KFold.
kfold = KFold(n_splits=10)

# One score per fold for the SVC on the full X / y.
scores = cross_val_score(estimator=svc, X=X, y=y, cv=kfold)
print(scores)

# Cell output: average of the per-fold scores.
scores.mean()

[0.98984772 0.98477157 0.99492386 1.         0.98477157 0.98984772
 0.99492386 1.         1.         1.        ]


0.9939086294416242

In [8]:
# Out-of-fold predictions for every row of X, using the same folds as the
# scoring cell.
prediction_cv = cross_val_predict(estimator=svc, X=X, y=y, cv=kfold)

In [9]:
# Number of cross-validated predictions produced.
len(prediction_cv)

1970

In [10]:
# Display predictions at positions 1 through 999.
# NOTE(review): the slice starts at 1, so the prediction at position 0 is not
# shown -- confirm this is intended rather than [:1000].
prediction_cv[1:1000]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# Display the true target values at positions 1 through 999, the same slice
# used for the predictions, for a visual side-by-side comparison.
# NOTE(review): position 0 is skipped here as well -- confirm intent.
np.array(y)[1:1000]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [12]:
# NOTE(review): svc_pred is only assigned inside a commented-out cell further
# down (svc.predict(X_test)), so this cell raises NameError on a fresh kernel.
svc_pred

NameError: name 'svc_pred' is not defined

In [None]:
# Display the hold-out targets as a NumPy array. Requires y_test from a
# train/test split to have been defined earlier in the kernel.
np.array(y_test)

In [None]:
# svc.fit(X_train, y_train)
# svc_pred=svc.predict(X_test)

# svc_acc=accuracy_score(y_test, svc_pred )
# print(svc_acc*100)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic regression with no constructor arguments, trained on the training
# split; fit() hands back the fitted estimator, so it is bound directly.
LogR = LogisticRegression().fit(X_train, y_train)

# Class predictions for the hold-out rows.
LogR_predict = LogR.predict(X_test)

In [None]:
# Hold-out accuracy of the logistic regression, printed as a percentage.
LogR_acc = accuracy_score(y_true=y_test, y_pred=LogR_predict)
print(LogR_acc * 100)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the training split. random_state is fixed (same
# seed value as the train/test split) so the fitted tree, and therefore the
# reported accuracy, is the same on every run; an unseeded tree can differ
# between runs when candidate splits tie.
dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train,y_train)

# Class predictions for the hold-out rows.
dt_predict = dt.predict(X_test)

In [None]:
# Hold-out accuracy of the decision tree, printed as a percentage.
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_predict)
print(dt_acc * 100)

# LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with the 'svd' solver, trained on the training
# split; fit() hands back the fitted estimator, so it is bound directly.
lda = LinearDiscriminantAnalysis(solver='svd').fit(X_train, y_train)

# Class predictions for the hold-out rows.
lda_predict = lda.predict(X_test)

In [None]:
# Hold-out accuracy of the LDA model, printed as a percentage.
lda_acc = accuracy_score(y_true=y_test, y_pred=lda_predict)
print(lda_acc * 100)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with 13 neighbours, trained on the training
# split; fit() hands back the fitted estimator, so it is bound directly.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)

# Class predictions for the hold-out rows.
knn_predict = knn.predict(X_test)

In [None]:
# Hold-out accuracy of the KNN model, printed as a percentage.
knn_acc = accuracy_score(y_true=y_test, y_pred=knn_predict)
print(knn_acc * 100)