In [1]:
import pandas as pd

In [2]:
def create_cleaned_df(xl, sheet_name):
    """Parse one sheet of the workbook and return only its data rows.

    Parameters
    ----------
    xl : object exposing ``parse(sheet_name=..., header=..., index_col=...)``
        The notebook passes a ``pd.ExcelFile``.
    sheet_name : str
        Name of the sheet to read.

    Returns
    -------
    DataFrame
        The sheet without sparse rows/columns, with columns named after the
        statistical IDs (first column ``'year_wst'``), restricted to the rows
        whose first cell consists of digits only.
    """
    # Read everything as plain cells: no header row, no index column.
    raw = xl.parse(sheet_name=sheet_name, header=None, index_col=None)
    trimmed = drop_empty_columns_and_rows(raw)
    # Renames the columns of `trimmed` in place.
    rename_column_with_stat_id(trimmed)
    return get_rows_containing_data(trimmed)


def drop_empty_columns_and_rows(df):
    """Keep only the rows and columns holding more than two non-null cells.

    Both counts are taken on the frame as passed in, i.e. the column counts
    are not recomputed after sparse rows have been removed.
    """
    populated = df.notnull()
    keep_rows = populated.sum(axis=1) > 2
    keep_columns = populated.sum(axis=0) > 2
    return df.loc[keep_rows, keep_columns]


def rename_column_with_stat_id(df):
    """Rename the columns of ``df`` in place using its statistical-ID row.

    The row located by ``get_index_of_row_containing_stat_id`` supplies the
    new labels; repeated labels are made unique by
    ``rename_duplicates_in_list`` and the first column is always called
    ``'year_wst'``. Nothing is returned.
    """
    header_label = get_index_of_row_containing_stat_id(df)
    new_names = rename_duplicates_in_list(df.loc[header_label].tolist())
    new_names[0] = 'year_wst'
    df.columns = new_names


def get_index_of_row_containing_stat_id(df):
    """Return the index label of the first row holding a statistical ID.

    All statistical IDs start with ``J``, so the second column is scanned for
    the first cell whose text begins with that letter; missing cells never
    match. Raises ``IndexError`` when no cell matches.
    """
    second_column = df.iloc[:, 1]
    starts_with_j = second_column.str.contains(r'^J', na=False)
    matching_labels = df.index[starts_with_j].tolist()
    return matching_labels[0]


def get_rows_with_digits(df):
    """Return a list of booleans flagging the rows whose first cell is digits.

    The first column is converted to text and each value is tested with
    ``str.isdigit``, so values such as ``'18.5'`` or ``'nan'`` are flagged
    ``False``.
    """
    first_column_text = df.iloc[:, 0].astype(str)
    digit_flags = first_column_text.str.isdigit()
    return digit_flags.tolist()


def get_rows_containing_data(df):
    """Return the rows of ``df`` flagged by ``get_rows_with_digits``."""
    is_data_row = get_rows_with_digits(df)
    return df[is_data_row]


def rename_duplicates_in_list(mylist):
    """Return a copy of ``mylist`` in which repeated items are made unique.

    Every item occurring at least twice is replaced by the string
    ``'<item>_<k>'``, where ``k`` is its 1-based occurrence number counted
    from the start of the list. Items occurring once are returned unchanged
    (they are not converted to strings).
    """
    renamed_list = []
    for position, item in enumerate(mylist):
        if mylist.count(item) < 2:
            renamed_list.append(item)
            continue
        occurrence = mylist[:position].count(item) + 1
        renamed_list.append(str(item) + '_' + str(occurrence))
    return renamed_list



In [3]:
# Location of the LTES_08 workbook that every later cell reads from.
url = 'http://d-infra.ier.hit-u.ac.jp/Japanese/ltes/LTES_08.xlsx'
# Open the workbook once; individual sheets are parsed from `xl` later on.
xl = pd.ExcelFile(url)

In [10]:
    # df1: CPI (Table 2).
    #   In the note in Table 1, the authors of the statistics state that
    #   the prewar CPI does not include the primary industry.
    #   J0801__002 in Table 1 is the same as J0802__001 in Table 2,
    #   but the latter has one more digit.
    # df2: price index for agricultural products (Table 10).
    # df3: price index for industrial products (Table 15).
    # df4: Table 1, used later for the investment goods price index.
sheet_name1, sheet_name2, sheet_name3, sheet_name4 = (
    '第2表', '第10表', '第15表', '第1表')

    # Clean the four sheets in order: Table 2, Table 10, Table 15, Table 1.
df1, df2, df3, df4 = (
    create_cleaned_df(xl, name)
    for name in (sheet_name1, sheet_name2, sheet_name3, sheet_name4))

    # Keep the raw, uncleaned Table 1 as well to get the linked index from it.
df4_temp = xl.parse(sheet_name4, header=None, index_col=None)

In [11]:
    # Select columns for df1
    # The seventh column from the end has the year of postwar data.
    # An Index is meant to be immutable, so the labels are rebuilt and assigned
    # as a whole instead of writing into `df1.columns.values` in place (which
    # can leave column lookups out of sync with the displayed labels).
df1_columns = df1.columns.tolist()
df1_columns[-7] = 'year_wst_post'
df1.columns = df1_columns

    # CPI, 1879-1938, 1934-36 = 100, General, 'J0802__001'
    # CPI, 1879-1938, 1934-36 = 100, Urban Families: Food, 'J0802__005'
    # CPI, 1879-1938, 1934-36 = 100, Urban Families: Clothing, 'J0802__006'
    # CPI, 1946-1965, 1934-36 = 1, General, 'J0802__101'
    # CPI, 1951-1965, 1934-36 = 1, Food, 'J0802__102'
    # CPI, 1951-1965, 1934-36 = 1, Clothing, 'J0802__103'
    # NOTE(review): 'J0802__003' is selected here but is not described above
    # and is not used by the later cells visible in this notebook - confirm
    # whether it is still needed.
df1_pre = df1[['year_wst', 'J0802__001', 'J0802__003',
               'J0802__005', 'J0802__006']].copy()
df1_post = df1[['year_wst_post', 'J0802__101',
                'J0802__102', 'J0802__103']].copy()

    # Delete rows without post-war data.
    # NOTE(review): dropna() removes a row as soon as ANY of the four columns
    # is missing, not only rows that are entirely empty - confirm that losing
    # years in which only some of the series exist is intended.
df1_post = df1_post.dropna()

    # Multiply the columns of df1_post by 100
    # so that 1934-1936 = 100 in both data frames.
columns_post = ['J0802__101', 'J0802__102', 'J0802__103']
adjustment_df1 = 100
for col in columns_post:
    df1_post[col] *= adjustment_df1

In [12]:
    # Change the data type of the year columns to integer.
    # The columns are replaced by direct assignment: `df.loc[:, col] = values`
    # writes into the existing column and can keep its previous dtype, so the
    # conversion would silently have no effect.
df1_pre['year_wst'] = df1_pre['year_wst'].astype(int)
df1_post['year_wst_post'] = df1_post['year_wst_post'].astype(int)

    # Outer-merge the two data frames on their year columns, so that years
    # present in only one of them are kept (with NaN in the other's columns).
df1 = df1_pre.merge(df1_post, how='outer',
                    left_on='year_wst', right_on='year_wst_post')

In [13]:
    # Create a single column for the year: keep the pre-war year where it
    # exists and fall back to the post-war year otherwise.
    # Coalescing (instead of adding two zero-filled columns) cannot double a
    # year that is present on both sides of the merge.
df1['year_wst'] = df1['year_wst'].fillna(df1['year_wst_post'])

In [14]:
df1  # Display the merged pre-war / post-war CPI frame for a visual check

Unnamed: 0,year_wst,J0802__001,J0802__005,J0802__006,year_wst_post,J0802__101,J0802__102,J0802__103
0,1879,33.13,32.12,48.04,,,,
1,1880,37.95,38.44,56.37,,,,
2,1881,41.81,40.63,77.97,,,,
3,1882,38.91,36.21,66.56,,,,
4,1883,33.43,29.34,49.96,,,,
...,...,...,...,...,...,...,...,...
70,1961,,,,1961.0,36003.0,40631.0,38030.0
71,1962,,,,1962.0,38368.0,43485.0,40266.0
72,1963,,,,1963.0,41173.0,47531.0,42756.0
73,1964,,,,1964.0,42893.0,49438.0,43969.0


In [8]:




    # Create a column dictionary: pre-war CPI column -> post-war counterpart
columns_pair = {
    'J0802__001': 'J0802__101',
    'J0802__005': 'J0802__102',
    'J0802__006': 'J0802__103'}
columns_pre = ['J0802__001', 'J0802__005', 'J0802__006']

    # Splice each post-war series into its pre-war column.
    # Missing pre-war values are filled from the post-war column; years that
    # are missing in both stay NaN instead of becoming an index value of 0.
for col in columns_pre:
    df1[col] = df1[col].fillna(df1[columns_pair[col]])

df1 = df1[['year_wst', 'J0802__001', 'J0802__005', 'J0802__006']].copy()

    # Select columns for df2
    # Agricultural product price index, 1934-36 = 100, 'J0810__004'
    # J0810__004 has two columns.
    # One is for 1874-1940 data, with 1934-36 = 100 and suffixed with '_1'.
    # The other is for 1950-1963 data, with 1934-36 = 1 and suffixed with '_2'.
    # .copy() makes the selection an independent frame, so adding a column
    # below does not write into a view of the original df2.
df2 = df2[['year_wst', 'J0810__004_1', 'J0810__004_2']].copy()

    # Combine the two columns, adjusting the second to 1934-1936 = 100
adjustment_df2 = 100
df2['J0810__004'] = df2['J0810__004_1'].fillna(
    df2['J0810__004_2'] * adjustment_df2)
df2 = df2[['year_wst', 'J0810__004']].copy()

    # Select columns for df3
    # Price Indexes of Manufacturing and Mining Products, 'J0815__001'
    # J0815__001 has two columns.
    # One is for 1873-1945 data, with 1934-36 = 100 and suffixed with '_1'.
    # The other is for 1951-1962 data, with 1960 = 100 and suffixed with '_2'.
    # Pre-war data is not linked to post-war data, so only '_1' is kept.
df3 = df3[['year_wst', 'J0815__001_1']].copy()

    # Select columns for df4
    # Indexes of Investment Goods Prices
    # J0801__003 has two columns.
    # One is for 1874-1940 data, with 1934-36 = 100 and suffixed with '_1'.
    # The other is for 1950-1963 data, with 1960 = 1 and suffixed with '_2'.
df4 = df4[['year_wst', 'J0801__003_1', 'J0801__003_2']].copy()

    # Get the price index of 1955.
    # NOTE(review): the row positions 97 and 109 are hard-coded against the
    # raw Table 1 layout - verify them if the workbook changes.
value_1955 = df4_temp.iloc[97, -1]
    # When 1934-36 = 1, the 1955 link index is 359.7.
linked_index_1955 = df4_temp.iloc[109, -1]
    # Multiply by 100 so that 1934-36 = 100.
adjustment_df4 = linked_index_1955 / value_1955 * 100
df4['J0801__003'] = df4['J0801__003_1'].fillna(
    df4['J0801__003_2'] * adjustment_df4)
df4 = df4[['year_wst', 'J0801__003']].copy()

    # Set the column names
df1.columns = ['year_wst', 'cpi_goods', 'cpi_agr', 'cpi_ind']
df2.columns = ['year_wst', 'ppi_agr']
df3.columns = ['year_wst', 'ppi_ind']
df4.columns = ['year_wst', 'epi_inv']

    # Merge the four data frames on the year, keeping every year that occurs
    # in any of them, and sort the result chronologically.
how = 'outer'
on = 'year_wst'
df_merge1 = pd.merge(df1, df2, how=how, on=on)
df_merge2 = pd.merge(df3, df4, how=how, on=on)
df = pd.merge(df_merge1, df_merge2, how=how, on=on)
df = df.sort_values(by=['year_wst'])
