In [1]:
import os

import pandas as pd

In [2]:
# Archived copy (warp.da.ndl.go.jp) of www.stat.go.jp/data/chouki table 01-10.xls.
url = "https://warp.da.ndl.go.jp/info:ndljp/pid/11423429/www.stat.go.jp/data/chouki/zuhyou/01-10.xls"

In [3]:
# Load the sheet verbatim so that title, unit and header rows remain ordinary rows.
df_original = pd.read_excel(
    url,
    header=None,     # do not promote any sheet row to column names
    index_col=None,  # keep the default integer row index
)

In [4]:
# Inspect the raw sheet: title and unit rows first, then a three-row header, then the data.
df_original

Unnamed: 0,0,1,2,3,4,5,6
0,１－１０　市部・郡部別面積（大正９年～平成２２年）,,,,,,
1,1-10 Area by All Shi and All Gun (1920--2010),,,,,,
2,,,,,,,
3,(単位 平方キロメートル),,,,,,
4,(In square kilometers),,,,,,
5,年次,,全国,市部,,郡部,
6,,,,,# 人口集中地区面積,,# 人口集中地区面積
7,Year,,Japan,All shi,Population of Densely inhabited districts,All gun,Population of Densely inhabited districts
8,大正 9年,1920.0,381808.04,1375.36,-,380432.69,-
9,14,1925.0,381810.06,2181.5,-,379628.57,-


In [5]:
# Japanese table title: the cell at row label 0, column label 0 of the raw sheet.
stat_name_jpn = df_original.at[0, 0]

In [6]:
# English table title: the cell at row label 1, column label 0 of the raw sheet.
stat_name_eng = df_original.at[1, 0]

In [7]:
# Show both titles to confirm the expected cells were picked.
stat_name_jpn, stat_name_eng

('１－１０\u3000市部・郡部別面積（大正９年～平成２２年）',
 '1-10 Area by All Shi and All Gun (1920--2010)')

In [8]:
# Keep only the rows and the columns that hold more than two non-null cells;
# this discards the title/unit rows and any nearly empty rows or columns.
non_null_cells = df_original.notnull()
rows_to_keep = non_null_cells.sum(axis=1) > 2
cols_to_keep = non_null_cells.sum() > 2
df_dropped = df_original.loc[rows_to_keep, cols_to_keep].reset_index(drop=True)

In [12]:
# Cells holding the placeholder string '- ' are stored as 0.
# NOTE(review): presumably '- ' means "not available"; after this step such cells
# cannot be told apart from a genuine zero -- confirm this is intended.
df_dropped = df_dropped.replace(to_replace='- ', value=0)

In [13]:
# Spot-check one cell that showed the dash placeholder in the raw sheet.
df_dropped.iloc[3, 4]

0

In [14]:
# Review the trimmed table: two header rows (Japanese, English) followed by the data rows.
df_dropped

Unnamed: 0,0,1,2,3,4,5,6
0,年次,,全国,市部,,郡部,
1,Year,,Japan,All shi,Population of Densely inhabited districts,All gun,Population of Densely inhabited districts
2,大正 9年,1920.0,381808.04,1375.36,0,380432.69,0
3,14,1925.0,381810.06,2181.5,0,379628.57,0
4,昭和 5年,1930.0,382264.91,2950.65,0,379314.26,0
5,10,1935.0,382545.42,5094.53,0,377450.89,0
6,15,1940.0,382545.42,8852.01,0,373693.41,0
7,20,1945.0,377298.15,14547.89,0,362750.26,0
8,22,1947.0,377298.15,16110.4,0,361187.75,0
9,25,1950.0,377099.08,20031.26,0,356925.64,0


In [16]:
# Row 0 of the trimmed table carries the Japanese column headers.
columns_original_jpn = df_dropped.iloc[0]
columns_original_jpn

0     年次
1    NaN
2     全国
3     市部
4    NaN
5     郡部
6    NaN
Name: 0, dtype: object

In [17]:
# Row 1 of the trimmed table carries the English column headers.
columns_original_eng = df_dropped.iloc[1]
columns_original_eng

0                                          Year
1                                           NaN
2                                         Japan
3                                       All shi
4    Population of Densely inhabited districts 
5                                       All gun
6     Population of Densely inhabited districts
Name: 1, dtype: object

In [18]:
# Descriptive names for the seven columns, in sheet order.
# NOTE(review): the sheet unit is square kilometers and the Japanese sub-header reads
# "人口集中地区面積", so the two pop_* columns appear to hold areas rather than
# populations. The names are left unchanged here because they become the CSV headers.
columns = [
    'year_jpn',         # year label as printed in the sheet (e.g. '大正 9年')
    'year_wst',         # numeric year (e.g. 1920.0)
    'area_tot',         # 全国 / Japan
    'area_urban',       # 市部 / All shi
    'pop_urban_dense',  # densely inhabited districts, listed under All shi
    'area_rural',       # 郡部 / All gun
    'pop_rural_dense',  # densely inhabited districts, listed under All gun
]

In [19]:
# Replace the integer column labels with the descriptive names in `columns`.
df_dropped.columns = columns

In [22]:
# Discard the two header rows now that the columns are named.
# NOTE(review): this rebinds df_dropped from itself, so running the cell a second
# time removes two more rows; run it exactly once per kernel session.
df_dropped = df_dropped.iloc[2:]

In [23]:
# Check the result: named columns, header rows gone (row labels now start at 2).
df_dropped

Unnamed: 0,year_jpn,year_wst,area_tot,area_urban,pop_urban_dense,area_rural,pop_rural_dense
2,大正 9年,1920.0,381808.04,1375.36,0.0,380432.69,0.0
3,14,1925.0,381810.06,2181.5,0.0,379628.57,0.0
4,昭和 5年,1930.0,382264.91,2950.65,0.0,379314.26,0.0
5,10,1935.0,382545.42,5094.53,0.0,377450.89,0.0
6,15,1940.0,382545.42,8852.01,0.0,373693.41,0.0
7,20,1945.0,377298.15,14547.89,0.0,362750.26,0.0
8,22,1947.0,377298.15,16110.4,0.0,361187.75,0.0
9,25,1950.0,377099.08,20031.26,0.0,356925.64,0.0
10,30,1955.0,377151.09,67979.67,0.0,307870.86,0.0
11,35,1960.0,377151.09,82903.75,3555.7,292801.0,309.5


In [16]:
# Number of data rows in each prefecture's block; used both as the slice length
# in save_csv and as the stride between block start rows in pref_labels.
rows_data = 23

In [17]:
# Positional row of df_dropped at which the first block starts; base offset for pref_labels.
data_row_init = 4

In [18]:
# Check the year label at the first block's starting position.
# NOTE(review): the saved output below ('明治31年', i.e. 1898) lies outside the 1920--2010
# table loaded at the top of this notebook; this cell looks carried over from a
# different dataset -- confirm with Restart & Run All.
df_dropped.iloc[4, 0]

'明治31年'

In [19]:
# List the year labels of the first block (rows_data rows starting at position 4).
# NOTE(review): the saved output below starts at 明治31年, which does not appear in the
# 1920--2010 table loaded at the top of this notebook -- confirm with Restart & Run All.
df_dropped.iloc[4:4+rows_data, 0]

4     明治31年
5       36　
6       41　
7     大正 2年
8        7　
9        9　
10      14　
11    昭和 5年
12      10　
13      15　
14      22　
15      25　
16      30　
17      35　
18      40　
19      45　
20      50　
21      55　
22      60　
23    平成 2年
24       7　
25      12　
26      17　
Name: year_jpn, dtype: object

In [20]:
# Row stride in df_original between one prefecture's name cell and the next.
rows_label = 24

In [21]:
# Build a mapping from each block's name (read from column 0 of df_original) to the
# positional row in df_dropped where that block starts.
# Name cells sit rows_label rows apart in df_original, starting at pref_row_init;
# block starts sit rows_data rows apart in df_dropped, starting at data_row_init.
# NOTE(review): this indexes df_original up to row pref_row_init + 47 * rows_label;
# confirm the sheet loaded above is actually that long.
num_of_prefs = 48
pref_row_init = 9
pref_labels = {}
label_rows = range(pref_row_init, pref_row_init + num_of_prefs * rows_label, rows_label)
block_starts = range(data_row_init, data_row_init + num_of_prefs * rows_data, rows_data)
for label_row, block_start in zip(label_rows, block_starts):
    # strip() removes padding whitespace around the name in the sheet cell
    pref_labels[df_original.iloc[label_row, 0].strip()] = block_start

In [22]:
# Display the resulting name -> block start row mapping.
pref_labels

{'全国': 4,
 '北海道': 27,
 '青森県': 50,
 '岩手県': 73,
 '宮城県': 96,
 '秋田県': 119,
 '山形県': 142,
 '福島県': 165,
 '茨城県': 188,
 '栃木県': 211,
 '群馬県': 234,
 '埼玉県': 257,
 '千葉県': 280,
 '東京都': 303,
 '神奈川県': 326,
 '新潟県': 349,
 '富山県': 372,
 '石川県': 395,
 '福井県': 418,
 '山梨県': 441,
 '長野県': 464,
 '岐阜県': 487,
 '静岡県': 510,
 '愛知県': 533,
 '三重県': 556,
 '滋賀県': 579,
 '京都府': 602,
 '大阪府': 625,
 '兵庫県': 648,
 '奈良県': 671,
 '和歌山県': 694,
 '鳥取県': 717,
 '島根県': 740,
 '岡山県': 763,
 '広島県': 786,
 '山口県': 809,
 '徳島県': 832,
 '香川県': 855,
 '愛媛県': 878,
 '高知県': 901,
 '福岡県': 924,
 '佐賀県': 947,
 '長崎県': 970,
 '熊本県': 993,
 '大分県': 1016,
 '宮崎県': 1039,
 '鹿児島県': 1062,
 '沖縄県': 1085}

In [1]:
def save_csv(pref_name, csv_pref_name, out_dir='../../Data/Downloaded'):
    """Write one block of rows from ``df_dropped`` to its own CSV file.

    Relies on three notebook-level names that must be defined before the call:
    ``pref_labels`` (name -> starting row position), ``df_dropped`` (the cleaned
    table) and ``rows_data`` (number of rows per block).

    Parameters
    ----------
    pref_name : str
        Key of ``pref_labels`` selecting the block to export.
    csv_pref_name : str
        Suffix of the output file, which is named
        ``urban_rural_area_<csv_pref_name>.csv``.
    out_dir : str, optional
        Directory that receives the file. Defaults to the location that was
        previously hard-coded, so existing two-argument calls are unaffected.

    Raises
    ------
    KeyError
        If ``pref_name`` is not a key of ``pref_labels``.
    """
    if pref_name not in pref_labels:
        # Same exception type as the plain dict lookup, with a message that says what went wrong.
        raise KeyError(f'{pref_name!r} is not a key of pref_labels')

    first_row = pref_labels[pref_name]
    # Positional slice: rows_data consecutive rows starting at the block's first row.
    block = df_dropped.iloc[first_row:first_row + rows_data]

    csv_path = os.path.join(out_dir, 'urban_rural_area_' + csv_pref_name + '.csv')
    block.to_csv(csv_path, index=False)

In [24]:
# Export the block stored under the key '全国' (the nationwide rows) as ..._total.csv.
save_csv('全国', 'total')

In [25]:
# Export the blocks of selected prefectures, one CSV each, in the listed order.
# NOTE(review): 愛知県 is written under the suffix 'nagoya' -- confirm that naming is intended.
prefecture_exports = [
    ('東京都', 'tokyo'),
    ('神奈川県', 'kanagawa'),
    ('埼玉県', 'saitama'),
    ('千葉県', 'chiba'),
    ('愛知県', 'nagoya'),
    ('京都府', 'kyoto'),
    ('大阪府', 'osaka'),
    ('兵庫県', 'hyogo'),
]
for jpn_name, file_suffix in prefecture_exports:
    save_csv(jpn_name, file_suffix)

In [24]:
# Write the complete cleaned table to disk, omitting the row index.
df_dropped.to_csv(
    '../../Data/Downloaded/urban_rural_area.csv',
    index=False,
)