In [None]:
# Import library
import pandas as pd 
import os 
import pandas.tseries.offsets as offsets 

In [None]:
# Collect the names of the export files found in ./data.
# Hidden entries (e.g. macOS '.DS_Store') and Excel lock files ('~$...') are
# filtered out instead of calling file_list.remove('.DS_Store'), which raised
# ValueError whenever that file did not exist (i.e. on any non-Mac machine).
# Sorting makes the read order, and therefore the row order of df, reproducible.
file_list = sorted(
    name for name in os.listdir(path='./data')
    if not name.startswith(('.', '~$'))
)
print(file_list)

In [None]:
# Create the empty master dataframe (df).
# Column names are kept in the language of the source exports (Japanese);
# the English glosses below are approximate.
column_names = [
    'ID',
    '活動量日時',    # activity date/time
    '日乳量',        # daily milk yield
    '反芻注意',      # rumination alert
    '合計反芻時間',  # total rumination time
    '活動量',        # activity
    '発情の可能性',  # possibility of estrus
    '活動量注意',    # activity alert
]
df = pd.DataFrame(index=[], columns=column_names)

In [None]:
# Display df to check its column layout.
df

In [None]:
# Read the data columns of every file in file_list and stack them into df.
# Caution: all files are held in memory at once, so reading too many files in
# one go can exhaust memory.
column_names = ['ID', '活動量日時', '日乳量', '反芻注意', '合計反芻時間', '活動量', '発情の可能性',  '活動量注意']
frames = []
for i in file_list:
    path = './data/{0}'.format(i)
    # The ID is taken from the raw sheet (3rd row, 4th column); only the first
    # four characters of its string form are kept.
    df1 = pd.read_excel(path, header=None)
    ID = str(df1.iat[2, 3])[:4]
    # The column header is on sheet row index 8; rows 9-11 are skipped (the
    # original note says they contain HTML metadata).
    df2 = pd.read_excel(path, header=8, usecols=[0,1,2,3,4,5,6,7], skiprows=[9,10,11])
    df2.columns = column_names
    df2['ID'] = ID
    frames.append(df2)

# Concatenate once instead of calling DataFrame.append per file: append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
# Rebuilding df from the collected frames also makes this cell safe to re-run
# (the append version added the same rows again on each run).
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(index=[], columns=column_names)

In [None]:
# Display df after the files have been read.
df

In [None]:
# Cast the timestamp column to str so that the date fields can be re-ordered
# as text in a later step.
# (Original note: as of 2020, T4C could not write out Japanese-style dates
# correctly.)
df = df.astype({'活動量日時': str})

In [None]:
# Confirm the timestamp column after the conversion to str.
df['活動量日時']

In [None]:
# Check the number of rows in df.
len(df)

In [None]:
# Fix the order of the date fields in the timestamp strings.
# The corrected value is built as "20" + chars [8:10] (used as the two-digit
# year) + "-" + chars [5:7] (month) + "-" + chars [2:4] (used as the day)
# + " " + chars [11:] (time of day).
# Assumes the column already holds strings (it is cast to str in an earlier
# cell); .str slicing performs the same character arithmetic as a per-row
# loop, but in one vectorized pass that does not need df to have a 0..n-1 index.
# NOTE: not idempotent - running this cell a second time swaps the year and
# day fields back.
date_text = df['活動量日時']
df['活動量日時'] = (
    "20" + date_text.str[8:10]
    + "-" + date_text.str[5:7]
    + "-" + date_text.str[2:4]
    + " " + date_text.str[11:]
)

In [None]:
# Confirm the timestamp column after the date fields were re-ordered.
df['活動量日時']

In [None]:
# Save the combined df to ./data.xlsx.
df.to_excel('./data.xlsx')

In [None]:
# Extract only the rows that carry a milk yield (> 0) into df3; per the
# original note these are the 0:00 records.
# .copy() detaches df3 from df, so later modifications of df3 neither touch df
# nor raise SettingWithCopyWarning for working on a filtered slice.
df3 = df[df['日乳量'] > 0].copy()

In [None]:
# Renumber the rows of df3 from 0 (the old index is discarded), then show it.
df3 = df3.reset_index(drop=True)
df3

In [None]:
# Parse the timestamp strings as datetimes (values that do not match the
# format become NaT because of errors='coerce').
# NOTE: the parsed Series is only displayed - it is not assigned back, so
# df3['活動量日時'] still holds strings after this cell. The later cells match
# dates against formatted strings and rely on that.
pd.to_datetime(df3['活動量日時'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [None]:
# Display df3.
df3

In [None]:
# Select the date range to read (both end points are included).
start_date = pd.to_datetime('2018-10-01 00:00:00')  # Start
end_date = pd.to_datetime('2020-02-13 00:00:00')  # End
# Add one day so that the end date itself is counted. A plain Timedelta keeps
# this pure duration arithmetic instead of mixing a calendar DateOffset into
# a Timedelta expression.
daterange = end_date - start_date + pd.Timedelta(days=1)
daterange_int = daterange.days  # Number of days to read

In [None]:
# Display the number of days in the selected range.
daterange_int

In [None]:
# Create df4, the empty frame that will receive 日乳量 (milk yield, kg/day).
# NOTE: set_index() returns a new frame that is only displayed here; df4
# itself keeps 'ID' as an ordinary column, which a later cell drops.
df4 = pd.DataFrame(index=[], columns=['ID'])
df4.set_index('ID')

In [None]:
# Remove rows of df3 that are exact duplicates of an earlier row.
df3 = df3.drop_duplicates()

In [None]:
# Insert the milk yield of each cow into df4: one row per ID, one column per
# day in the selected range.
# For every day, the df3 rows whose timestamp string equals that day's
# formatted timestamp are selected, indexed by ID, and their 日乳量 column is
# relabelled with the timestamp.
# The daily columns are collected first and joined with a single concat:
# concatenating inside the loop re-copied the growing table for every day, and
# starting from only df4's 'ID' column stops a re-run of this cell from
# stacking a second set of date columns.
milk_by_day = []
for i in range(daterange_int):
    date = start_date + offsets.Day(i)
    date_str = date.strftime("%Y-%m-%d %H:%M:%S")
    on_date = df3['活動量日時'] == date_str
    milk_by_day.append(
        df3.loc[on_date, ['ID', '日乳量']]
        .set_index('ID')
        .rename(columns={'日乳量': date_str})
    )
# sort=True orders the combined ID index.
df4 = pd.concat([df4[['ID']], *milk_by_day], axis=1, sort=True)

In [None]:
# Display df4 after the daily milk-yield columns were added.
df4

In [None]:
# Drop the 'ID' column (a column, not a row) from df4 and keep the result as df9.
df9 = df4.drop(columns='ID')

In [None]:
# Display df9.
df9

In [None]:
# Save df9 to ./nyuryo.xlsx.
df9.to_excel('./nyuryo.xlsx')

In [None]:
# Create df10, the empty frame that will receive 反芻時間 (rumination time,
# min/day).
# NOTE: set_index() returns a new frame that is only displayed here; df10
# itself keeps 'ID' as an ordinary column.
df10 = pd.DataFrame(index=[], columns=['ID'])
df10.set_index('ID')

In [None]:
# Insert the rumination time of each cow into df10: one row per ID, one column
# per day in the selected range.
# For every day, the df3 rows whose timestamp string equals that day's
# formatted timestamp are selected, indexed by ID, and their 合計反芻時間
# column is relabelled with the timestamp.
# The daily columns are collected first and joined with a single concat:
# concatenating inside the loop re-copied the growing table for every day, and
# starting from only df10's 'ID' column stops a re-run of this cell from
# stacking a second set of date columns.
rumination_by_day = []
for i in range(daterange_int):
    date2 = start_date + offsets.Day(i)  # the day being extracted
    date2_str = date2.strftime("%Y-%m-%d %H:%M:%S")  # that day as a string
    on_date = df3['活動量日時'] == date2_str  # rows recorded at that timestamp
    rumination_by_day.append(
        df3.loc[on_date, ['ID', '合計反芻時間']]
        .set_index('ID')
        .rename(columns={'合計反芻時間': date2_str})
    )
# sort=True orders the combined ID index.
df10 = pd.concat([df10[['ID']], *rumination_by_day], axis=1, sort=True)

In [None]:
# Display df10 after the daily rumination-time columns were added.
df10

In [None]:
# Remove the 'ID' column (a column, not a row) from df10 and keep the result
# as df15.
# The source frame must be df10: df15 does not exist before this line, so
# reading from df15 raised NameError on a fresh kernel.
df15 = df10.drop('ID', axis=1)

In [None]:
# Display df15.
df15

In [None]:
# Save df15 to ./hansu.xlsx.
df15.to_excel('./hansu.xlsx')