# Load Package

In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
# Plot styling: use the ggplot theme and the 'Noto Sans CJK TC' font family
# (presumably so the Chinese labels in this dataset render — confirm the font is installed).
plt.style.use("ggplot")
plt.rcParams['font.family'] = 'Noto Sans CJK TC'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus']=False

# Load Data

In [99]:
# Read the accident dataset with every column parsed as text (dtype='unicode');
# numeric columns are converted explicitly where they are needed.
data = pd.read_csv("道路交通事故原因傷亡統計107-110_有路燈.csv", dtype='unicode')
# Preview the first three rows.
data.head(3)

Unnamed: 0,案件編號,發生日期,發生時間,GPS經度,GPS緯度,案件類別,地址類型,發生地點,24小時內死亡人數,受傷人數,...,事故位置,號誌種類,事故類型及型態,肇因研判,區域,季節,路燈數量,道路型態_大類別,路名或街名,里
0,10701AC191A0013,20180119,6-9時,120.2212,22.963416,交通事故,一般地址,臺南市東區生產路68號,1,0,...,機車優先道,無號誌,撞護欄(樁),未注意車前狀態,東區,冬天,0,直路,生產路,仁和里
1,10701AC191B0003,20180101,0-3時,120.235561,22.987356,交通事故,交叉路口,臺南市東區富農街二段西側與富農街二段12巷處,0,1,...,交叉路口內,無號誌,側撞,酒醉(後)駕駛失控,東區,冬天,1,交岔路,富農街二段,東聖里
2,10701AC191B0023,20180101,18-21時,120.233903,22.991879,交通事故,一般地址,臺南市東區中華東路1段前0公尺,0,2,...,機車優先道,無號誌,追撞,未保持行車安全距離,東區,冬天,0,直路,中華東路一段,東光里


# 台南市所有里的資料
https://data.tainan.gov.tw/dataset/tainan-vil-code/resource/f222d100-e67f-42de-9072-130be35fa363

In [4]:
# Build a lookup of the villages (里) that belong to each district (區).
suburb_data = pd.read_csv("tainanvilcode.csv", dtype='unicode')

all_suburb = {}        # district name -> list of village names in that district
town, suburb = [], []  # parallel lists: district / village of every row in the file

# The per-row values get their own names so that `town` and `suburb` stay lists;
# the cells that follow (the 3-character length check and the optional export)
# iterate over them and would otherwise see only the last row's strings.
for area_name in suburb_data['AreaName']:
    # Presumably AreaName looks like "<3-char city><district>區<village>" — TODO confirm.
    # Everything up to and including the first "區" (minus the 3-char prefix) is the district.
    start = area_name.find("區") + 1
    town_name = area_name[3:start]
    # The final character is dropped and '里' appended in its place.
    suburb_name = area_name[start:-1] + '里'

    town.append(town_name)
    suburb.append(suburb_name)
    all_suburb.setdefault(town_name, []).append(suburb_name)

In [90]:
# Sanity check: list every entry of `suburb` whose length is not 3 characters.
# An empty result means all village names are exactly 3 characters long.
# NOTE(review): this expects `suburb` to be a list of village names.
[x for x in suburb if len(x)!=3]

[]

In [87]:
#d = pd.DataFrame({"區":town, "里":suburb})
#d.to_csv("台南市區里資料統整.csv", index=False, encoding='utf_8_sig')

# 找「里」出來
利用 python 套件 [geopy 2.2.0](https://pypi.org/project/geopy/)，透過資料中的經緯度轉成地址後，擷取里的資訊  
※因為是免費的找地址程式，相較google地圖並不會完全精準，因此只用來找尋"里"，沒有用在尋找路名

In [5]:
# Derive the village (里) label for each record by searching its address text
# for one of the village names of that record's district.
suburb = []
town_string = list(data['區域'])
spot_string = list(data['發生地點'])

for spot, town in zip(spot_string, town_string):
    candidates = all_suburb[town]  # every village belonging to this record's district

    label = '無註明'  # default when the address names no known village
    if "里" in spot:
        # The first village (in `candidates` order) that occurs in the address wins.
        label = next((name for name in candidates if name in spot), label)
    suburb.append(label)

In [6]:
# Frequency of each village label extracted from the address text
# ('無註明' marks records where no village name was found).
pd.Series(suburb).value_counts()

無註明    119856
新和里       736
光文里       716
社內里       701
仁德里       583
        ...  
漳洲里         1
國宅里         1
金華里         1
赤嵌里         1
溪心里         1
Length: 499, dtype: int64

In [7]:
# Read the coordinates as floats.
# NOTE(review): the names are swapped relative to their contents — `lat` is built from
# 'GPS經度' (longitude) and `lon` from 'GPS緯度' (latitude). The reverse-geocoding cell
# relies on this (it range-checks `lat` against 120–122 and passes "lon, lat" to the
# geocoder), so do not rename one side without the other.
lat = list(data['GPS經度'].astype('float'))
lon = list(data['GPS緯度'].astype('float'))

In [8]:
# Reload the geocoding checkpoint file (the same path the geocoding loop writes to)
# and take its first column as the working series of village labels.
# NOTE(review): this rebinds `suburb_data`, which earlier held the village-code table.
suburb_data = pd.read_csv("里_jupyter_1.csv")
neighbourhood = suburb_data.iloc[:, 0]

In [9]:
# The sum (56532) is the start index of the geocoding loop's range;
# presumably these are the row counts handled by earlier partial runs — TODO confirm.
7936 + 7770 + 493 + 6035 + 34298

56532

In [10]:
# Reverse-geocode coordinates into addresses and keep the village-level component.
from tqdm import tqdm, trange
from geopy.geocoders import Nominatim
geolocation = Nominatim(user_agent="geotest")

#neighbourhood = suburb.copy()

# Row range handled by this run (the author's note on the original loop line
# listed another range: 119000, data.shape[0]).
BATCH_START, BATCH_END = 56532, 67053
CHECKPOINT_EVERY = 1000  # rows between checkpoint writes
# Address components to try, in priority order.
ADDRESS_KEYS = ('neighbourhood', 'suburb', 'village')

# NOTE(review): requests are not throttled here; check the Nominatim usage policy
# before running larger batches.
for i in tqdm(range(BATCH_START, BATCH_END)):
    # Only rows whose village could not be read from the address text are geocoded.
    if suburb[i] in ('無註明', "經緯度錯誤"):
        # `lat` holds GPS經度 (longitude) and `lon` holds GPS緯度 (latitude); see the
        # cell that builds them. NaN coordinates fail both comparisons.
        if (120 <= lat[i] <= 122) and (22 <= lon[i] <= 25):
            # Nominatim expects "latitude, longitude", hence `lon` first.
            location = geolocation.reverse("{}, {}".format(lon[i], lat[i])) ## lon, lat

            # reverse() returns None when nothing is found; treat that (and a
            # response without an address) like "no village component available".
            address = location.raw.get('address', {}) if location is not None else {}
            neighbourhood[i] = next(
                (address[key] for key in ADDRESS_KEYS if key in address), "無"
            )
        else:
            neighbourhood[i] = "經緯度錯誤"

    # Periodically (and after the last row) write progress so a crash loses little work.
    if ((i+1) % CHECKPOINT_EVERY == 0) or (i+1 == BATCH_END):
        d = pd.DataFrame(neighbourhood)
        d.to_csv("里_jupyter_1.csv", index=False, encoding='utf_8_sig')

100%|██████████████████████████████████████████████████████████████████████████| 10521/10521 [2:05:26<00:00,  1.40it/s]


In [30]:
# Write the current village labels to the checkpoint CSV
# ('utf_8_sig' = UTF-8 with a byte-order mark).
d = pd.DataFrame(neighbourhood)
d.to_csv("里_jupyter_1.csv", index=False, encoding='utf_8_sig')

In [31]:
# Frequency of each label in `neighbourhood`
# ('經緯度錯誤' marks rows whose coordinates failed the range check).
pd.Series(neighbourhood).value_counts()

經緯度錯誤    41952
無註明       6271
永康里       1066
尚頂里       1021
塩行里       1003
         ...  
公正里          1
菜寮           1
新園           1
飯店里          1
東勢寮          1
Name: 0, Length: 740, dtype: int64

# 統整「里」資料

In [98]:
# Load the four partial result files (all columns as text).
suburb_data_1, suburb_data_2, suburb_data_3, suburb_data_4 = (
    pd.read_csv(path, dtype='unicode')
    for path in ("里_jupyter.csv", "里_jupyter_1.csv", "里_jupyter_2.csv", "里_jupyter_3.csv")
)

# Each file contributes one contiguous row range of its first column;
# the ranges are stitched together in row order.
segments = [
    # suburb_data_1.iloc[:15706, 0],   (disabled by the author)
    suburb_data_2.iloc[:67053, 0],
    suburb_data_4.iloc[67053:105531, 0],
    suburb_data_3.iloc[105531:118400, 0],
    suburb_data_1.iloc[118400:, 0],
]
a1 = pd.concat(segments, axis=0).reset_index(drop=True)

# Save the merged column.
a1.to_csv("里.csv", index=False, encoding='utf_8_sig')

In [100]:
# Store the merged village labels in the '里' column (replacing any existing column of
# that name); pandas aligns the Series to `data` by index label.
# NOTE(review): assumes `a1` has exactly one entry per row of `data` — TODO confirm,
# since unmatched index labels would silently become NaN.
data['里'] = a1
# Preview the first three rows.
data.head(3)

Unnamed: 0,案件編號,發生日期,發生時間,GPS經度,GPS緯度,案件類別,地址類型,發生地點,24小時內死亡人數,受傷人數,...,事故位置,號誌種類,事故類型及型態,肇因研判,區域,季節,路燈數量,道路型態_大類別,路名或街名,里
0,10701AC191A0013,20180119,6-9時,120.2212,22.963416,交通事故,一般地址,臺南市東區生產路68號,1,0,...,機車優先道,無號誌,撞護欄(樁),未注意車前狀態,東區,冬天,0,直路,生產路,東智里
1,10701AC191B0003,20180101,0-3時,120.235561,22.987356,交通事故,交叉路口,臺南市東區富農街二段西側與富農街二段12巷處,0,1,...,交叉路口內,無號誌,側撞,酒醉(後)駕駛失控,東區,冬天,1,交岔路,富農街二段,東聖里
2,10701AC191B0023,20180101,18-21時,120.233903,22.991879,交通事故,一般地址,臺南市東區中華東路1段前0公尺,0,2,...,機車優先道,無號誌,追撞,未保持行車安全距離,東區,冬天,0,直路,中華東路一段,東光里


In [101]:
# 存檔
# data.to_csv("道路交通事故原因傷亡統計107-110_有路燈.csv", index=False, encoding='utf_8_sig')