In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
import random
import platform
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from matplotlib import font_manager , rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Windows' :
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname = path).get_name()
    rc('font' , family = font_name)

In [2]:
roadData = pd.read_csv('경상북도_구미시_가로수길정보.csv' , encoding = 'euc-kr')
roadData.head()
roadData = roadData.loc[:,[ '가로수길시작위도','가로수길시작경도','가로수길종료위도','가로수길종료경도','도로명']]
roadData.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,도로명
0,36.086722,128.361282,36.123435,128.362869,구미대로
1,36.117772,128.362639,36.118492,128.373445,구미대로 30길
2,36.070873,128.348511,36.086722,128.361282,금오대로
3,36.083575,128.389847,36.138013,128.451643,산호대로
4,36.136446,128.398603,36.141185,128.395483,대학로


In [3]:
roadData.dtypes

가로수길시작위도    float64
가로수길시작경도    float64
가로수길종료위도    float64
가로수길종료경도    float64
도로명          object
dtype: object

In [4]:
object_col = []
for col in roadData.columns:
    if roadData[col].dtype == 'object':
        object_col.append(col)

In [5]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
#roadData['도로명']=label_encoder.fit_transform(roadData['도로명'])

In [6]:
roadData.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,도로명
0,36.086722,128.361282,36.123435,128.362869,구미대로
1,36.117772,128.362639,36.118492,128.373445,구미대로 30길
2,36.070873,128.348511,36.086722,128.361282,금오대로
3,36.083575,128.389847,36.138013,128.451643,산호대로
4,36.136446,128.398603,36.141185,128.395483,대학로


In [7]:
object_col

['도로명']

In [8]:
enc = OneHotEncoder()
enc.fit(roadData.loc[:,object_col])


roadData_onehot_df = pd.DataFrame(enc.transform(roadData.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))

roadData.drop(object_col, axis=1, inplace=True)
roadData_ = pd.concat([roadData, roadData_onehot_df], axis=1)

In [9]:
roadData_.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,도로명_1공단로,도로명_1공단로3길,도로명_1공단로4길,도로명_1공단로6길,도로명_1공단로9길,도로명_3공단1로,...,도로명_칠성로,도로명_칠성로2길,도로명_해마루공원로,도로명_형곡동로,도로명_형곡로,도로명_형곡서로,도로명_형곡중앙로,도로명_화신로,도로명_황들길,도로명_흥안로
0,36.086722,128.361282,36.123435,128.362869,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36.117772,128.362639,36.118492,128.373445,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,36.070873,128.348511,36.086722,128.361282,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,36.083575,128.389847,36.138013,128.451643,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36.136446,128.398603,36.141185,128.395483,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
roadData_.to_csv("roadData.csv" , index = False , encoding = 'euc-kr')

In [11]:
roadData_.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,도로명_1공단로,도로명_1공단로3길,도로명_1공단로4길,도로명_1공단로6길,도로명_1공단로9길,도로명_3공단1로,...,도로명_칠성로,도로명_칠성로2길,도로명_해마루공원로,도로명_형곡동로,도로명_형곡로,도로명_형곡서로,도로명_형곡중앙로,도로명_화신로,도로명_황들길,도로명_흥안로
0,36.086722,128.361282,36.123435,128.362869,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36.117772,128.362639,36.118492,128.373445,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,36.070873,128.348511,36.086722,128.361282,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,36.083575,128.389847,36.138013,128.451643,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36.136446,128.398603,36.141185,128.395483,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
AccidentData = pd.read_csv('AccidentInfo.csv', encoding = 'euc-kr')
AccidentData.head()
AccidentData = AccidentData.loc[:,['사고번호' ,'도로명','사고내용' ,'기상상태']]
AccidentData.head()

Unnamed: 0,사고번호,도로명,사고내용,기상상태
0,2019103100100662,구미중앙로,중상사고,맑음
1,2019051600100309,구미중앙로,경상사고,맑음
2,2019021800100048,경부고속도로,중상사고,맑음
3,2019111100100500,낙동강변로,경상사고,맑음
4,2019053100100646,구미중앙로,부상신고사고,맑음


In [13]:
AccidentData.loc[:,'사고번호'] = AccidentData.loc[:,'사고번호']//100000000

In [14]:
AccidentData.head()

Unnamed: 0,사고번호,도로명,사고내용,기상상태
0,20191031,구미중앙로,중상사고,맑음
1,20190516,구미중앙로,경상사고,맑음
2,20190218,경부고속도로,중상사고,맑음
3,20191111,낙동강변로,경상사고,맑음
4,20190531,구미중앙로,부상신고사고,맑음


In [15]:
AccidentData.dtypes

사고번호     int64
도로명     object
사고내용    object
기상상태    object
dtype: object

In [16]:
object_col = []
for col in AccidentData.columns:
    if AccidentData[col].dtype == 'object':
        object_col.append(col)
enc.fit(AccidentData.loc[:,object_col])

AccidentData_onehot_df = pd.DataFrame(enc.transform(AccidentData.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))

AccidentData.drop(object_col, axis=1, inplace=True)
AccidentData_ = pd.concat([AccidentData, AccidentData_onehot_df], axis=1)

In [17]:
AccidentData_.head()

Unnamed: 0,사고번호,도로명_1공단로,도로명_1공단로10길,도로명_1공단로3길,도로명_1공단로4길,도로명_1공단로5길,도로명_1공단로6길,도로명_1공단로7길,도로명_1공단로9길,도로명_3공단1로,...,사고내용_경상사고,사고내용_부상신고사고,사고내용_사망사고,사고내용_중상사고,기상상태_기타,기상상태_눈,기상상태_맑음,기상상태_비,기상상태_안개,기상상태_흐림
0,20191031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,20190516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,20190218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,20191111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,20190531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
AccidentData_.to_csv("Accident_info_T.csv" , encoding = 'euc-kr')

In [19]:
from urllib.request import urlopen
from urllib.parse import urlencode, unquote, quote_plus 
import urllib 
import requests 
import json 
import pandas as pd


In [20]:
url = 'http://apis.data.go.kr/1360000/AsosDalyInfoService/getWthrDataList'

In [21]:
params = '?' + urlencode({
    quote_plus("serviceKey"):"15crzzHwfhs3F6j4cXu4wyD%2F0gzkgLm9STKC2pBe9%2FnH%2FMrTBhBrwEJIwsQzXGTp4rTMQz53N72zPXl3uLg00Q%3D%3D",
    quote_plus("numOfRows"): "365",
    quote_plus("pageNo"): "1",
    quote_plus("dataType"): "JSON",
    quote_plus("dataCd") : "ASOS",
    quote_plus("dateCd") : "DAY",
    quote_plus("startDt"): "20190101",
    quote_plus("endDt"): "20191231",
    quote_plus("stnIds"): "279"
})

In [22]:
req = urllib.request.Request(url+unquote(params))

response_body = urlopen(req).read()
data = json.loads(response_body)

#print(data)

In [23]:
import json
import ujson

In [24]:
#df = pd.read_json(data , orient = 'index')
data.keys()

dict_keys(['response'])

In [25]:
data['response'].keys()

dict_keys(['header', 'body'])

In [26]:
data['response']['body'].keys()

dict_keys(['dataType', 'items', 'pageNo', 'numOfRows', 'totalCount'])

In [27]:
data['response']['body']['items'].keys()

dict_keys(['item'])

In [28]:
df = pd.DataFrame(data['response']['body']['items']['item'])

In [29]:
#df = pd.DataFrame(data)

In [30]:
df.head()

Unnamed: 0,stnId,stnNm,tm,avgTa,minTa,minTaHrmt,maxTa,maxTaHrmt,mi10MaxRn,mi10MaxRnHrmt,...,avgM05Te,avgM10Te,avgM15Te,avgM30Te,avgM50Te,sumLrgEv,sumSmlEv,n99Rn,iscs,sumFogDur
0,279,구미,2019-01-01,-0.8,-3.8,126,2.4,1237,,,...,,,,,,,,,,
1,279,구미,2019-01-02,-1.8,-6.3,625,3.7,1416,,,...,,,,,,,,,,
2,279,구미,2019-01-03,-1.2,-6.3,341,5.2,1503,,,...,,,,,,,,,,
3,279,구미,2019-01-04,-0.2,-7.5,735,6.7,1338,,,...,,,,,,,,,,
4,279,구미,2019-01-05,0.5,-4.8,2345,5.9,1434,,,...,,,,,,,,,,


In [31]:
df = df.loc[:,['tm' , 'avgTa' , 'sumRn' , 'avgWs' , 'avgTd'  , 'avgTca']]

In [32]:
df.fillna('NAN', inplace=True) 
df.head()

Unnamed: 0,tm,avgTa,sumRn,avgWs,avgTd,avgTca
0,2019-01-01,-0.8,0.0,1.8,-7.6,5.5
1,2019-01-02,-1.8,,1.1,-10.4,0.0
2,2019-01-03,-1.2,,1.1,-13.8,0.0
3,2019-01-04,-0.2,,0.6,-10.2,1.5
4,2019-01-05,0.5,,1.0,-9.9,2.6


In [33]:
df.dtypes

tm        object
avgTa     object
sumRn     object
avgWs     object
avgTd     object
avgTca    object
dtype: object

In [34]:
df["tm"] = df["tm"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
#df = df.astype(float)
#df = df.astype({'avgTa': np.float, 'sumRn': np.float, 'avgWs': np.float, 'avgTd': np.float, 'avgTca': np.float})

In [35]:
df.head()

Unnamed: 0,tm,avgTa,sumRn,avgWs,avgTd,avgTca
0,20190101,-0.8,0.0,1.8,-7.6,5.5
1,20190102,-1.8,,1.1,-10.4,0.0
2,20190103,-1.2,,1.1,-13.8,0.0
3,20190104,-0.2,,0.6,-10.2,1.5
4,20190105,0.5,,1.0,-9.9,2.6


In [36]:
df.dtypes
#df.to_numeric(errors='coerce')

tm        object
avgTa     object
sumRn     object
avgWs     object
avgTd     object
avgTca    object
dtype: object

In [37]:
df.rename(columns={'tm': '날짜', 'avgTa': '평균기온' , 'sumRn' : '일강수량' , 'avgWs' : '평균풍속' , 'avgTd' : '평균이슬점온도' , 'avgTca' : '평균전운량'}, inplace=True)
df.to_csv("2019weather.csv" , encoding = 'euc-kr')

In [38]:
print(url+unquote(params))

http://apis.data.go.kr/1360000/AsosDalyInfoService/getWthrDataList?serviceKey=15crzzHwfhs3F6j4cXu4wyD%2F0gzkgLm9STKC2pBe9%2FnH%2FMrTBhBrwEJIwsQzXGTp4rTMQz53N72zPXl3uLg00Q%3D%3D&numOfRows=365&pageNo=1&dataType=JSON&dataCd=ASOS&dateCd=DAY&startDt=20190101&endDt=20191231&stnIds=279


In [39]:
#http://apis.data.go.kr/1360000/AsosDalyInfoService/getWthrDataList?serviceKey=15crzzHwfhs3F6j4cXu4wyD%2F0gzkgLm9STKC2pBe9%2FnH%2FMrTBhBrwEJIwsQzXGTp4rTMQz53N72zPXl3uLg00Q%3D%3D&numOfRows=10&pageNo=1&dataType=JSON&dataCd=ASOS&dateCd=DAY&startDt=20100101&endDt=20100102&stnIds=279

In [40]:
roadData.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도
0,36.086722,128.361282,36.123435,128.362869
1,36.117772,128.362639,36.118492,128.373445
2,36.070873,128.348511,36.086722,128.361282
3,36.083575,128.389847,36.138013,128.451643
4,36.136446,128.398603,36.141185,128.395483


In [41]:
AccidentData = pd.read_csv('AccidentInfo.csv', encoding = 'euc-kr')
AccidentData.head()
AccidentData = AccidentData.loc[:,['사고번호' ,'도로명','사고내용' ,'기상상태']]
AccidentData.loc[:,'사고번호'] = AccidentData.loc[:,'사고번호']//100000000
AccidentData.head()

Unnamed: 0,사고번호,도로명,사고내용,기상상태
0,20191031,구미중앙로,중상사고,맑음
1,20190516,구미중앙로,경상사고,맑음
2,20190218,경부고속도로,중상사고,맑음
3,20191111,낙동강변로,경상사고,맑음
4,20190531,구미중앙로,부상신고사고,맑음


In [42]:
roadData = pd.read_csv('경상북도_구미시_가로수길정보.csv' , encoding = 'euc-kr')
roadData.head()
roadData = roadData.loc[:,[ '가로수길시작위도','가로수길시작경도','가로수길종료위도','가로수길종료경도','도로명']]
roadData.head()

Unnamed: 0,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,도로명
0,36.086722,128.361282,36.123435,128.362869,구미대로
1,36.117772,128.362639,36.118492,128.373445,구미대로 30길
2,36.070873,128.348511,36.086722,128.361282,금오대로
3,36.083575,128.389847,36.138013,128.451643,산호대로
4,36.136446,128.398603,36.141185,128.395483,대학로


In [43]:
AR_OUTER_JOIN = pd.merge(AccidentData, roadData, left_on='도로명', right_on='도로명', how='inner')

In [44]:
AR_OUTER_JOIN.head()

Unnamed: 0,사고번호,도로명,사고내용,기상상태,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도
0,20191031,구미중앙로,중상사고,맑음,36.132378,128.323464,36.118854,128.362618
1,20190516,구미중앙로,경상사고,맑음,36.132378,128.323464,36.118854,128.362618
2,20190531,구미중앙로,부상신고사고,맑음,36.132378,128.323464,36.118854,128.362618
3,20191106,구미중앙로,경상사고,맑음,36.132378,128.323464,36.118854,128.362618
4,20190816,구미중앙로,중상사고,맑음,36.132378,128.323464,36.118854,128.362618


In [45]:
#AR_OUTER_JOIN.to_csv("AR.csv" , encoding = 'euc-kr')

In [46]:
AR_OUTER_JOIN.dtypes

사고번호          int64
도로명          object
사고내용         object
기상상태         object
가로수길시작위도    float64
가로수길시작경도    float64
가로수길종료위도    float64
가로수길종료경도    float64
dtype: object

In [47]:
df.dtypes

날짜         object
평균기온       object
일강수량       object
평균풍속       object
평균이슬점온도    object
평균전운량      object
dtype: object

In [48]:
df.head()

Unnamed: 0,날짜,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20190101,-0.8,0.0,1.8,-7.6,5.5
1,20190102,-1.8,,1.1,-10.4,0.0
2,20190103,-1.2,,1.1,-13.8,0.0
3,20190104,-0.2,,0.6,-10.2,1.5
4,20190105,0.5,,1.0,-9.9,2.6


In [49]:
df = pd.read_csv('2019weather.csv' , encoding = 'euc-kr')

In [50]:
df.dtypes

Unnamed: 0      int64
날짜              int64
평균기온          float64
일강수량          float64
평균풍속          float64
평균이슬점온도       float64
평균전운량         float64
dtype: object

In [51]:
ARW_JOIN = pd.merge(AR_OUTER_JOIN, df, left_on='사고번호', right_on='날짜', how='inner')

In [52]:
ARW_JOIN.head()

Unnamed: 0.1,사고번호,도로명,사고내용,기상상태,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,Unnamed: 0,날짜,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20191031,구미중앙로,중상사고,맑음,36.132378,128.323464,36.118854,128.362618,303,20191031,12.7,,0.7,8.7,1.1
1,20191031,문장로,경상사고,맑음,36.135922,128.332311,36.153781,128.328704,303,20191031,12.7,,0.7,8.7,1.1
2,20191031,박정희로,경상사고,맑음,36.078578,128.355424,36.124974,128.337068,303,20191031,12.7,,0.7,8.7,1.1
3,20191031,옥계2공단로,중상사고,맑음,36.110903,128.417441,36.147214,128.418521,303,20191031,12.7,,0.7,8.7,1.1
4,20191031,해마루공원로,경상사고,맑음,36.133301,128.428911,36.154072,128.435086,303,20191031,12.7,,0.7,8.7,1.1


In [53]:
ARW_JOIN.dtypes

사고번호            int64
도로명            object
사고내용           object
기상상태           object
가로수길시작위도      float64
가로수길시작경도      float64
가로수길종료위도      float64
가로수길종료경도      float64
Unnamed: 0      int64
날짜              int64
평균기온          float64
일강수량          float64
평균풍속          float64
평균이슬점온도       float64
평균전운량         float64
dtype: object

In [54]:
ARW_JOIN.drop('Unnamed: 0', axis=1, inplace=True)

In [55]:
ARW_JOIN.drop('날짜' , axis = 1 , inplace = True)

In [56]:
ARW_JOIN.drop('도로명' , axis = 1 , inplace = True)

In [57]:
ARW_JOIN.head()

Unnamed: 0,사고번호,사고내용,기상상태,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20191031,중상사고,맑음,36.132378,128.323464,36.118854,128.362618,12.7,,0.7,8.7,1.1
1,20191031,경상사고,맑음,36.135922,128.332311,36.153781,128.328704,12.7,,0.7,8.7,1.1
2,20191031,경상사고,맑음,36.078578,128.355424,36.124974,128.337068,12.7,,0.7,8.7,1.1
3,20191031,중상사고,맑음,36.110903,128.417441,36.147214,128.418521,12.7,,0.7,8.7,1.1
4,20191031,경상사고,맑음,36.133301,128.428911,36.154072,128.435086,12.7,,0.7,8.7,1.1


In [58]:
label_encoder=preprocessing.LabelEncoder()
ARW_JOIN['사고내용']=label_encoder.fit_transform(ARW_JOIN['사고내용'])

In [59]:
ARW_JOIN.drop('기상상태' , axis = 1 , inplace = True)

In [60]:
ARW_JOIN.head()

Unnamed: 0,사고번호,사고내용,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20191031,3,36.132378,128.323464,36.118854,128.362618,12.7,,0.7,8.7,1.1
1,20191031,0,36.135922,128.332311,36.153781,128.328704,12.7,,0.7,8.7,1.1
2,20191031,0,36.078578,128.355424,36.124974,128.337068,12.7,,0.7,8.7,1.1
3,20191031,3,36.110903,128.417441,36.147214,128.418521,12.7,,0.7,8.7,1.1
4,20191031,0,36.133301,128.428911,36.154072,128.435086,12.7,,0.7,8.7,1.1


In [61]:
object_col = []
for col in ARW_JOIN.columns:
    if ARW_JOIN[col].dtype == 'object':
        object_col.append(col)
enc.fit(ARW_JOIN.loc[:,object_col])

ARW_ONE_HOT = pd.DataFrame(enc.transform(ARW_JOIN.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))

ARW_JOIN.drop(object_col, axis=1, inplace=True)
ARW_JOIN = pd.concat([ARW_JOIN, ARW_ONE_HOT], axis=1)

In [62]:
ARW_JOIN

Unnamed: 0,사고번호,사고내용,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20191031,3,36.132378,128.323464,36.118854,128.362618,12.7,,0.7,8.7,1.1
1,20191031,0,36.135922,128.332311,36.153781,128.328704,12.7,,0.7,8.7,1.1
2,20191031,0,36.078578,128.355424,36.124974,128.337068,12.7,,0.7,8.7,1.1
3,20191031,3,36.110903,128.417441,36.147214,128.418521,12.7,,0.7,8.7,1.1
4,20191031,0,36.133301,128.428911,36.154072,128.435086,12.7,,0.7,8.7,1.1
...,...,...,...,...,...,...,...,...,...,...,...
1388,20190603,3,36.242973,128.301209,36.242476,128.398108,23.6,,0.8,13.4,0.0
1389,20190603,0,36.140799,128.448072,36.141299,128.453656,23.6,,0.8,13.4,0.0
1390,20190306,2,36.239021,128.290724,36.241329,128.275534,11.1,0.0,1.1,0.9,8.0
1391,20191204,3,36.238461,128.303016,36.236381,128.309677,4.9,0.0,0.9,-3.8,2.3


In [63]:
ARW_JOIN.fillna(0)

Unnamed: 0,사고번호,사고내용,가로수길시작위도,가로수길시작경도,가로수길종료위도,가로수길종료경도,평균기온,일강수량,평균풍속,평균이슬점온도,평균전운량
0,20191031,3,36.132378,128.323464,36.118854,128.362618,12.7,0.0,0.7,8.7,1.1
1,20191031,0,36.135922,128.332311,36.153781,128.328704,12.7,0.0,0.7,8.7,1.1
2,20191031,0,36.078578,128.355424,36.124974,128.337068,12.7,0.0,0.7,8.7,1.1
3,20191031,3,36.110903,128.417441,36.147214,128.418521,12.7,0.0,0.7,8.7,1.1
4,20191031,0,36.133301,128.428911,36.154072,128.435086,12.7,0.0,0.7,8.7,1.1
...,...,...,...,...,...,...,...,...,...,...,...
1388,20190603,3,36.242973,128.301209,36.242476,128.398108,23.6,0.0,0.8,13.4,0.0
1389,20190603,0,36.140799,128.448072,36.141299,128.453656,23.6,0.0,0.8,13.4,0.0
1390,20190306,2,36.239021,128.290724,36.241329,128.275534,11.1,0.0,1.1,0.9,8.0
1391,20191204,3,36.238461,128.303016,36.236381,128.309677,4.9,0.0,0.9,-3.8,2.3


In [64]:
ARW_JOIN.dtypes

사고번호          int64
사고내용          int32
가로수길시작위도    float64
가로수길시작경도    float64
가로수길종료위도    float64
가로수길종료경도    float64
평균기온        float64
일강수량        float64
평균풍속        float64
평균이슬점온도     float64
평균전운량       float64
dtype: object

In [65]:
#ARW_JOIN.dropna()

In [66]:
ARW_JOIN.to_csv("final.csv" , encoding  ='euc-kr')

In [67]:
final = pd.read_csv('final.csv' , encoding = 'euc-kr').fillna(0).astype(np.float32)

In [68]:
final.dtypes

Unnamed: 0    float32
사고번호          float32
사고내용          float32
가로수길시작위도      float32
가로수길시작경도      float32
가로수길종료위도      float32
가로수길종료경도      float32
평균기온          float32
일강수량          float32
평균풍속          float32
평균이슬점온도       float32
평균전운량         float32
dtype: object

In [69]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(final.drop(['사고내용'],axis=1),final['사고내용'])

DecisionTreeClassifier(random_state=0)

In [70]:
clf.predict_proba(final.drop(['사고내용'],axis=1))

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [71]:
import pickle
import joblib

In [72]:
joblib.dump(clf, 'model.pkl') 

['model.pkl']

In [73]:
from sklearn.model_selection import train_test_split


# shuffle = False

X_train, X_test, y_train, y_test = train_test_split(final.drop(['사고내용'],axis=1),final['사고내용'],

                                                    test_size=0.1, 

                                                    shuffle=True, 

                                                    random_state=1004)



In [103]:
clf = DecisionTreeClassifier( random_state=0 , max_depth = 4)
clf.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=4, random_state=0)

In [104]:
#clf.predict_proba(X_test)

In [105]:
print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test,y_test)))

테스트 세트 정확도: 0.521


In [184]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=4, random_state=221,n_estimators =100 ,min_samples_split = 4096,min_samples_leaf = 4096 )

In [185]:
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=4, min_samples_leaf=4096,
                       min_samples_split=4096, random_state=221)

In [186]:
print("테스트 세트 정확도: {:.3f}".format(clf.score(X_test,y_test)))

테스트 세트 정확도: 0.550
