In [None]:
from datetime import datetime
import pandas as pd
from pathlib import Path

# ---------------------------------------------------------------------------
# Configuration: which IRIS booking export to process
# ---------------------------------------------------------------------------
datadir = Path("")

DATA_DIR = './2020'

# Known exports:
#   bookings20200515092345.csv -> IRIS date range 2019.01.01 - 2019.12.31, all providers
#   bookings20200515134423.csv -> IRIS date range 2018.12.01 - 2019.12.31, only LMU
#export = "bookings20200515092345.csv"
DATA_NAME = 'bookings20200515134423.csv'

# Resolve the paths and stop early when either of them is wrong.
DATA_DIR = Path(DATA_DIR)
if not DATA_DIR.is_dir():
    raise ValueError('Please check DATA_DIR.')

DATA_FILE = DATA_DIR.joinpath(DATA_NAME)
if not DATA_FILE.exists():
    raise ValueError('Please check DATA_NAME.')

# Load the export: the first line of the file is skipped and empty fields
# are read as NaN.
df = pd.read_csv(DATA_FILE, skiprows=1, na_values='')
print(df.shape)

# keep only LMU data
is_lmu = df['Provider'] == 'Light Microscopy Unit'
df = df[is_lmu]

# Date range to select (ISO dates); the range filter further down keeps
# bookings that start on/after START and end before END.
START = '2019-01-01'
END = '2020-01-01'


# https://stackoverflow.com/questions/35321812/move-column-in-pandas-dataframe/35322540
from pandas import DataFrame

def move_columns(df: DataFrame, cols_to_move: list, new_index: int) -> DataFrame:
    """
    Return a view of ``df`` with the given columns placed at ``new_index``.

    The remaining columns keep their relative order; the moved columns are
    inserted, in the order given, so that the first of them ends up at
    position ``new_index`` among the columns.

    ex Usage: df = move_columns(df, ['Rev'], 2)

    :param df: The dataframe whose columns are re-arranged (not modified).
    :param cols_to_move: The names of the columns to move. Usually a list;
        a single column name given as a string, or any other iterable of
        names (e.g. a tuple), is accepted as well.
    :param new_index: The 0-based location to place the columns. It is used
        as a slice position into the list of remaining columns, so values
        beyond the end simply append the moved columns.
    :return: A dataframe with the columns re-arranged.
    :raises KeyError: If a name in ``cols_to_move`` is not a column of ``df``.
    """
    # A bare string is ONE column name, not a sequence of characters;
    # without this guard the membership test below would do substring
    # matching and the list concatenation would fail.
    if isinstance(cols_to_move, str):
        moved = [cols_to_move]
    else:
        moved = list(cols_to_move)

    remaining = [col for col in df.columns if col not in moved]
    new_order = remaining[:new_index] + moved + remaining[new_index:]
    return df[new_order]


# Convert the 'Start' and 'End' columns to datetime format.
# assign() builds a new frame instead of writing into the filtered slice
# produced by the provider filter, which avoids SettingWithCopyWarning.
df = df.assign(
    Start=pd.to_datetime(df['Start']),
    End=pd.to_datetime(df['End']),
)

# apply date range: keep bookings that start on/after START (00:00) and
# end before END (00:00)
range_start = datetime.fromisoformat(START + 'T00:00:00')
range_end = datetime.fromisoformat(END + 'T00:00:00')
df = df[(df['Start'] >= range_start) & (df['End'] < range_end)].copy()

# Booking duration in hours, rounded to one decimal. Computed column-wise
# (no row-by-row apply), which is faster and also works on an empty frame.
df['Duration'] = ((df['End'] - df['Start']).dt.total_seconds() / 3600).round(decimals=1)

# place the 'Duration' column at position 4
df = move_columns(df, ['Duration'], 4)



# remove lines where resource is an add-on
addons = [
    # 3I
    #"3I Marianas base with lasers", "3I Marianas no lasers ",
    "3I Marianas",
    "3I 405",
    "3I 488",
    "3I 561",
    "3I 640",
    "3I marianas no laser add-on",
    "3I marianas no lasers",
    # Light sheet
    "Zeiss Z.1 LightSheet Lasers",
    "Light sheet 405",
    "Light sheet 445",
    "Light sheet 488",
    "Light sheet 514",
    "Light sheet 561",
    "Light sheet 640",
    "Light sheet no laser add on",
    "Zeiss Z.1 LightSheet Data management",
    # LSM700
    "LSM700 405",
    "LSM700 488",
    "LSM700 555",
    "LSM700 639",
    # SP5 HCS-A
    "SP5 HCS-A 405",
    "SP5 HCS-A Argon",
    "SP5 HCS-A 561",
    "SP5 HCS-A 633",
    # SP5 MP
    "SP5 MP 405",
    "SP5 MP Argon",
    "SP5 MP 561",
    "SP5 MP 594",
    "SP5 MP 633",
    "SP5 MP Laser MP",
    # SP8 STED (the double space in the first entry is intentional: it is
    # matched literally against the 'Resource' column)
    "SP8  STED 592 STED",
    "SP8 STED 405",
    "SP8 STED Argon",
    "SP8 STED 561",
    "SP8 STED 633",
    # SP8 upright (listed both with and without a trailing space)
    "SP8 upright 405",
    "SP8 upright 488 ",
    "SP8 upright 488",
    "SP8 upright 552",
    "SP8 upright 638",
    # remaining entries
    "No laser (admin only)",
    "Sheep (TESTING AND DEVELOPMENT PURPOSE",
    "Super testers practice instrument",
    "GE and DM5000 Room 2028,2",
    "Leica SP5II HCA and SP8 Upright, Room 2036b",
]

# booking statuses that should be ignored
ignore_statuses = ["Canceled", "Upcoming", "Undefined"]

# test groups
ignore_groups = ["Group Raimi research inc", "TEST Viktor"]

# Column -> values whose rows are thrown away. The filters are applied one
# after the other, in this order; the last entry removes IRIS admin bookings.
exclusions = {
    'Resource': addons,
    'Status': ignore_statuses,
    'Group': ignore_groups,
    'BookedBy': ['iris@science-it.ch'],
}
for column, unwanted in exclusions.items():
    df = df[~df[column].isin(unwanted)]



# Flag maintenance bookings once and reuse the mask for both outputs.
# na=False: a booking without a 'Type' value is treated as non-maintenance;
# without it the mask would contain NaN for such rows and could not be used
# for boolean indexing or negation.
is_maintenance = df['Type'].str.contains("Maintenance", na=False)

# save maintenance bookings
df2 = df[is_maintenance]
OUTPUT = DATA_DIR / (DATA_FILE.stem + "__" + START + "__" + END + "__maintenance" + DATA_FILE.suffix)
df2.to_csv(OUTPUT, index=False, na_rep='')

# remove maintenance bookings from original
df = df[~is_maintenance]

# save as CSV
OUTPUT = DATA_DIR / (DATA_FILE.stem + "__" + START + "__" + END + DATA_FILE.suffix)
df.to_csv(OUTPUT, index=False, na_rep='')
print(df.shape)


# save unique group / user pairs
df2 = (
    df[['Group', 'User']]
    .sort_values(['Group', 'User'], ascending=[True, True])
    .drop_duplicates()
)
# 'Booked group' is the last whitespace-separated word of the group name.
# The .str accessor works column-wise and yields NaN for a missing group
# instead of raising, and it also copes with an empty frame.
df2 = df2.assign(**{'Booked group': df2['Group'].str.split().str[-1]})
OUTPUT = DATA_DIR / (DATA_FILE.stem + "__" + START + "__" + END + "__unique_group_user" + DATA_FILE.suffix)
df2.to_csv(OUTPUT, index=False)


# calculate group totals
# numeric_only=True restricts the sum to numeric columns. Without it, newer
# pandas versions also try to sum the datetime and text columns of the
# bookings table, which fails for the 'Start'/'End' timestamps.
df2 = df.groupby(["Group"]).sum(numeric_only=True)
df2['Duration'] = df2['Duration'].round(decimals=0)
df2 = df2.rename(columns={"Duration": "IRIS total hours"})

# Drop summed columns that carry no meaningful total. errors='ignore'
# because which of these columns survive the numeric sum depends on how
# they were parsed in the export (a non-numeric one is already gone).
df2 = df2.drop(columns=['Request ID', 'Operator', 'Products', 'Project'], errors='ignore')

# turn the 'Group' index back into a regular column
df2 = df2.reset_index()
# 'Booked group' is the last whitespace-separated word of the group name
df2['Booked group'] = df2['Group'].str.split().str[-1]

OUTPUT = DATA_DIR / (DATA_FILE.stem + "__" + START + "__" + END + "__group_totals" + DATA_FILE.suffix)
# index=True is kept on purpose: it writes the row number as a leading
# unnamed column, so the file layout stays the same as before.
df2.to_csv(OUTPUT, index=True, na_rep='')


#df3 = pd.DataFrame()
#df3['IRIS_group'] = sorted(df.Group.unique())
#df3['IRIS_total_hours'] = df3.apply(lambda row: row.IRIS_group, axis=1)


# Various checks

In [None]:
# Check the differences between two dataframes (only works if neither dataframe contains duplicate rows itself).
#df3 = pd.concat([df,df2019]).drop_duplicates(keep=False)

# diff between dataframes (see cell above)
#df3[['Date of booking','Resource', 'Start','End','User']]


In [None]:
# check which 3I entries have price info
is_marianas = df["Resource"].str.startswith("3I Marianas")
has_charges = df["Charges"].notna()
tmp = df[is_marianas & has_charges]
print(tmp["Resource"].unique())

In [None]:
# find bookings that are longer than 1 day
# The 'Duration' column holds the booking length as rounded float hours
# (the timedelta column is dropped during preprocessing), so it cannot be
# compared with a Timedelta. Measure the length from the timestamps instead.
day = pd.Timedelta("1 day")
df[(df["End"] - df["Start"]) > day]

In [None]:
# check that add-ons are gone
remaining_resources = sorted(df["Resource"].unique())
print(remaining_resources)
# check that cancellations and upcoming bookings are gone
remaining_statuses = sorted(df["Status"].unique())
print(remaining_statuses)

In [None]:
# find bookings since last billing with no WBS
previous_billing_date = '2019-5-23'
missing_wbs = df["Cost center"].isna()
after_billing = df["Start"] > previous_billing_date
df[missing_wbs & after_billing]

In [None]:
# find bookings by LMU staff
# (matched literally, i.e. case-sensitively, against the 'User' column)
lmu_staff = [
    "Harri.Jaalinoja@helsinki.fi",
    "marko.crivaro@helsinki.fi",
    "kimmo.tanhuanpaa@helsinki.fi",
    "mika.molin@helsinki.fi",
    "viktor.raimi@helsinki.fi",
]
booked_by_staff = df['User'].isin(lmu_staff)
df.loc[booked_by_staff, ['Start', 'End', 'Resource', 'Group']]