In [235]:
# https://preppindata.blogspot.com/2021/02/2021-week-8-karaoke-data.html

import pandas as pd
import numpy as np

### Input the data

In [236]:
df_choice = pd.read_excel(r'data/PD 2021 Wk 8 Input.xlsx', sheet_name = 'Karaoke Choices')
df_customer = pd.read_excel(r'data/PD 2021 Wk 8 Input.xlsx', sheet_name = 'Customers')
df_choice

Unnamed: 0,Date,Artist,Song
0,2020-12-22 13:59:59.971,Wham!,Last Christmas
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way
...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary


### Calculate the time between songs

In [237]:
# convert into proper date format
df_choice['Date'] = pd.to_datetime(df_choice['Date'])

# get next row to current row, calc the diff and cast to minute
df_choice['Next'] = df_choice['Date'].shift(-1)
df_choice['timediff'] = (df_choice['Next'] - df_choice['Date'])/ pd.Timedelta(minutes=1)
df_choice

Unnamed: 0,Date,Artist,Song,Next,timediff
0,2020-12-22 13:59:59.971,Wham!,Last Christmas,2020-12-22 15:00:00.000,60.000483
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5,2020-12-22 15:02:00.010,2.000167
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana,2020-12-22 15:04:00.019,2.000150
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go,2020-12-22 18:00:00.000,175.999683
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way,2020-12-22 19:00:00.029,60.000483
...,...,...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire,2021-02-01 22:11:00.038,2.000150
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show,2021-02-01 22:14:00.010,2.999533
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road,2021-02-01 22:16:59.981,2.999517
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary,2021-02-01 22:18:59.990,2.000150


### If the time between songs is greater than (or equal to) 59 minutes, flag this as being a new session

In [238]:
df_choice['Session_Flag'] = np.where(df_choice['timediff']>=59,1,0)
df_choice

Unnamed: 0,Date,Artist,Song,Next,timediff,Session_Flag
0,2020-12-22 13:59:59.971,Wham!,Last Christmas,2020-12-22 15:00:00.000,60.000483,1
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5,2020-12-22 15:02:00.010,2.000167,0
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana,2020-12-22 15:04:00.019,2.000150,0
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go,2020-12-22 18:00:00.000,175.999683,1
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way,2020-12-22 19:00:00.029,60.000483,1
...,...,...,...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire,2021-02-01 22:11:00.038,2.000150,0
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show,2021-02-01 22:14:00.010,2.999533,0
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road,2021-02-01 22:16:59.981,2.999517,0
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary,2021-02-01 22:18:59.990,2.000150,0


### Create a session number field

In [239]:
df_choice["not equal?"] = df_choice["Session_Flag"].diff().ne(0)
df_choice["Session #"] = df_choice["Session_Flag"].diff().ne(0).cumsum()
df_choice


Unnamed: 0,Date,Artist,Song,Next,timediff,Session_Flag,not equal?,Session #
0,2020-12-22 13:59:59.971,Wham!,Last Christmas,2020-12-22 15:00:00.000,60.000483,1,True,1
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5,2020-12-22 15:02:00.010,2.000167,0,True,2
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana,2020-12-22 15:04:00.019,2.000150,0,False,2
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go,2020-12-22 18:00:00.000,175.999683,1,True,3
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way,2020-12-22 19:00:00.029,60.000483,1,False,3
...,...,...,...,...,...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire,2021-02-01 22:11:00.038,2.000150,0,False,404
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show,2021-02-01 22:14:00.010,2.999533,0,False,404
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road,2021-02-01 22:16:59.981,2.999517,0,False,404
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary,2021-02-01 22:18:59.990,2.000150,0,False,404


### Number the songs in order for each session

In [240]:
df_choice['song_order'] = df_choice.groupby('Session #')['Date'].rank().astype('int')
df_choice

Unnamed: 0,Date,Artist,Song,Next,timediff,Session_Flag,not equal?,Session #,song_order
0,2020-12-22 13:59:59.971,Wham!,Last Christmas,2020-12-22 15:00:00.000,60.000483,1,True,1,1
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5,2020-12-22 15:02:00.010,2.000167,0,True,2,1
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana,2020-12-22 15:04:00.019,2.000150,0,False,2,2
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go,2020-12-22 18:00:00.000,175.999683,1,True,3,1
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way,2020-12-22 19:00:00.029,60.000483,1,False,3,2
...,...,...,...,...,...,...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire,2021-02-01 22:11:00.038,2.000150,0,False,404,4
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show,2021-02-01 22:14:00.010,2.999533,0,False,404,5
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road,2021-02-01 22:16:59.981,2.999517,0,False,404,6
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary,2021-02-01 22:18:59.990,2.000150,0,False,404,7


### Match the customers to the correct session, based on their entry time (help)
- The Customer ID field should be null if there were no customers who arrived 10 minutes (or less) before the start of the session

In [241]:
df_customer

Unnamed: 0,Customer ID,Entry Time
0,3fdc46,2020-12-27 06:55:00
1,3fdc46,2020-12-31 03:55:00
2,3fdc46,2021-01-02 08:55:00
3,3fdc46,2021-01-09 05:55:00
4,3fdc46,2021-02-01 06:55:00
...,...,...
296,bdc39c,2021-01-28 10:53:00
297,8d850e,2020-12-28 02:51:00
298,8d850e,2021-01-12 22:51:00
299,8d850e,2021-01-26 11:51:00


In [242]:
#populate the start of the session for each session
df_choice['start'] = df_choice.groupby(['Session #'])['Date'].transform('first')
df_choice

Unnamed: 0,Date,Artist,Song,Next,timediff,Session_Flag,not equal?,Session #,song_order,start
0,2020-12-22 13:59:59.971,Wham!,Last Christmas,2020-12-22 15:00:00.000,60.000483,1,True,1,1,2020-12-22 13:59:59.971
1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5,2020-12-22 15:02:00.010,2.000167,0,True,2,1,2020-12-22 15:00:00.000
2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana,2020-12-22 15:04:00.019,2.000150,0,False,2,2,2020-12-22 15:00:00.000
3,2020-12-22 15:04:00.019,Moana,How Far I’ll Go,2020-12-22 18:00:00.000,175.999683,1,True,3,1,2020-12-22 15:04:00.019
4,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way,2020-12-22 19:00:00.029,60.000483,1,False,3,2,2020-12-22 15:04:00.019
...,...,...,...,...,...,...,...,...,...,...
983,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire,2021-02-01 22:11:00.038,2.000150,0,False,404,4,2021-02-01 22:00:00.029
984,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show,2021-02-01 22:14:00.010,2.999533,0,False,404,5,2021-02-01 22:00:00.029
985,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road,2021-02-01 22:16:59.981,2.999517,0,False,404,6,2021-02-01 22:00:00.029
986,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary,2021-02-01 22:18:59.990,2.000150,0,False,404,7,2021-02-01 22:00:00.029


In [243]:
df_output = pd.merge_asof(
    left = df_choice,
    right = df_customer.sort_values('Entry Time'),
    left_on='start',
    right_on='Entry Time',
    direction='backward',
    tolerance=pd.Timedelta('10 min')
)

#re-order/only select the targe column
df_output = df_output[['Session #','Customer ID','song_order','Date','Artist','Song']]
df_output

Unnamed: 0,Session #,Customer ID,song_order,Date,Artist,Song
0,1,cd2834,1,2020-12-22 13:59:59.971,Wham!,Last Christmas
1,2,2de3d7,1,2020-12-22 15:00:00.000,Dolly Parton,9 To 5
2,2,2de3d7,2,2020-12-22 15:02:00.010,Camilla Cabello Ft. Young Thug,Havana
3,3,2de3d7,1,2020-12-22 15:04:00.019,Moana,How Far I’ll Go
4,3,2de3d7,2,2020-12-22 18:00:00.000,Backstreet Boys,I Want It That Way
...,...,...,...,...,...,...
983,404,cdda70,4,2021-02-01 22:09:00.029,Kings Of Leon,Sex Is On Fire
984,404,cdda70,5,2021-02-01 22:11:00.038,Hugh Jackman & The Greatest Showman Cast,The Greatest Show
985,404,cdda70,6,2021-02-01 22:14:00.010,Lil Nas X,Old Town Road
986,404,cdda70,7,2021-02-01 22:16:59.981,Ike And Tina Turner,Proud Mary


### Output the data

In [244]:
df_output.to_csv(r'output/2021-week8-output.csv')