### Loading the csv files

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Monday's log: semicolon-separated, with the 'timestamp' column parsed
# as datetimes and used as the index.
mon = pd.read_csv(
    'monday.csv',
    sep=';',
    index_col='timestamp',
    parse_dates=True,
)
mon

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:03:00,1,dairy
2019-09-02 07:03:00,2,dairy
2019-09-02 07:04:00,3,dairy
2019-09-02 07:04:00,4,dairy
2019-09-02 07:04:00,5,spices
...,...,...
2019-09-02 21:49:00,1442,checkout
2019-09-02 21:49:00,1444,checkout
2019-09-02 21:49:00,1445,dairy
2019-09-02 21:50:00,1446,dairy


In [4]:
# Read the remaining weekdays with the same options used for Monday,
# then stack all five days into one frame (sort=True orders the columns).
tue, wed, thu, fri = (
    pd.read_csv(csv_path, sep=';', parse_dates=True, index_col='timestamp')
    for csv_path in ('tuesday.csv', 'wednesday.csv', 'thursday.csv', 'friday.csv')
)
df = pd.concat([mon, tue, wed, thu, fri], sort=True)
df

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:03:00,1,dairy
2019-09-02 07:03:00,2,dairy
2019-09-02 07:04:00,3,dairy
2019-09-02 07:04:00,4,dairy
2019-09-02 07:04:00,5,spices
...,...,...
2019-09-06 21:50:00,1500,dairy
2019-09-06 21:50:00,1507,checkout
2019-09-06 21:50:00,1508,checkout
2019-09-06 21:50:00,1509,drinks


### Setting a distinct ID for each customer and adding derived columns


In [5]:
# Put the rows in chronological order before deriving anything from the index.
df.sort_index(inplace=True)

# Helper columns taken from the timestamp index: clock time and weekday name.
df['time'] = df.index.time
df['day'] = df.index.day_name()

# customer_no values repeat across days (e.g. Monday-1 and Friday-1), so the
# weekday is prefixed to obtain an ID that is distinct over the whole week.
df['customer_id'] = df['day'] + '-' + df['customer_no'].map(str)
df

Unnamed: 0_level_0,customer_no,location,time,day,customer_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-02 07:03:00,1,dairy,07:03:00,Monday,Monday-1
2019-09-02 07:03:00,2,dairy,07:03:00,Monday,Monday-2
2019-09-02 07:04:00,3,dairy,07:04:00,Monday,Monday-3
2019-09-02 07:04:00,4,dairy,07:04:00,Monday,Monday-4
2019-09-02 07:04:00,5,spices,07:04:00,Monday,Monday-5
...,...,...,...,...,...
2019-09-06 21:50:00,1500,dairy,21:50:00,Friday,Friday-1500
2019-09-06 21:50:00,1507,checkout,21:50:00,Friday,Friday-1507
2019-09-06 21:50:00,1508,checkout,21:50:00,Friday,Friday-1508
2019-09-06 21:50:00,1509,drinks,21:50:00,Friday,Friday-1509


### Determining the order of states

In [6]:
# Earliest timestamp recorded for each customer_id; used as the moment the
# customer entered the supermarket.
entrance = (
    df.reset_index()
    .groupby('customer_id')['timestamp']
    .agg('min')
)
entrance

customer_id
Friday-1        2019-09-06 07:00:00
Friday-10       2019-09-06 07:06:00
Friday-100      2019-09-06 08:03:00
Friday-1000     2019-09-06 17:19:00
Friday-1001     2019-09-06 17:19:00
                        ...        
Wednesday-995   2019-09-04 16:52:00
Wednesday-996   2019-09-04 16:52:00
Wednesday-997   2019-09-04 16:52:00
Wednesday-998   2019-09-04 16:53:00
Wednesday-999   2019-09-04 16:53:00
Name: timestamp, Length: 7445, dtype: datetime64[ns]

In [7]:
# Tag every customer's entrance row(s) as "first".
# Instead of building one full-frame boolean mask per customer (quadratic in
# practice), look up each row's entrance time via its customer_id and compare
# it with the row's own timestamp in a single vectorised pass.
# The comparison is done on plain arrays so no index alignment is involved
# (the timestamp index contains duplicates).
df.loc[
    df.index.to_numpy() == df["customer_id"].map(entrance).to_numpy(),
    "section_order",
] = "first"

In [8]:
# Every row located at the checkout is labelled "checkout"; this overrides any
# label previously written to section_order for that row.
df.loc[df["location"].eq("checkout"), "section_order"] = "checkout"

In [9]:
# Inspect the frame: rows that are neither an entrance row nor a checkout row
# still have a missing section_order at this point.
df

Unnamed: 0_level_0,customer_no,location,time,day,customer_id,section_order
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-02 07:03:00,1,dairy,07:03:00,Monday,Monday-1,first
2019-09-02 07:03:00,2,dairy,07:03:00,Monday,Monday-2,first
2019-09-02 07:04:00,3,dairy,07:04:00,Monday,Monday-3,first
2019-09-02 07:04:00,4,dairy,07:04:00,Monday,Monday-4,first
2019-09-02 07:04:00,5,spices,07:04:00,Monday,Monday-5,first
...,...,...,...,...,...,...
2019-09-06 21:50:00,1500,dairy,21:50:00,Friday,Friday-1500,
2019-09-06 21:50:00,1507,checkout,21:50:00,Friday,Friday-1507,checkout
2019-09-06 21:50:00,1508,checkout,21:50:00,Friday,Friday-1508,checkout
2019-09-06 21:50:00,1509,drinks,21:50:00,Friday,Friday-1509,first


In [10]:
# Label all remaining (unlabelled) rows as "following".
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is chained assignment, which warns on recent pandas and
# leaves df unchanged when Copy-on-Write is enabled.
df['section_order'] = df['section_order'].fillna('following')

In [11]:
# Inspect the frame again: section_order is now one of
# "first", "following" or "checkout" for the rows shown.
df

Unnamed: 0_level_0,customer_no,location,time,day,customer_id,section_order
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-02 07:03:00,1,dairy,07:03:00,Monday,Monday-1,first
2019-09-02 07:03:00,2,dairy,07:03:00,Monday,Monday-2,first
2019-09-02 07:04:00,3,dairy,07:04:00,Monday,Monday-3,first
2019-09-02 07:04:00,4,dairy,07:04:00,Monday,Monday-4,first
2019-09-02 07:04:00,5,spices,07:04:00,Monday,Monday-5,first
...,...,...,...,...,...,...
2019-09-06 21:50:00,1500,dairy,21:50:00,Friday,Friday-1500,following
2019-09-06 21:50:00,1507,checkout,21:50:00,Friday,Friday-1507,checkout
2019-09-06 21:50:00,1508,checkout,21:50:00,Friday,Friday-1508,checkout
2019-09-06 21:50:00,1509,drinks,21:50:00,Friday,Friday-1509,first


In [21]:
# Number of entrance rows (section_order == 'first') per section.
customers_in_first_sections = (
    df.loc[df['section_order'].eq('first')]
    .groupby('location')
    .agg({'customer_id': 'count'})
)
customers_in_first_sections

Unnamed: 0_level_0,customer_id
location,Unnamed: 1_level_1
dairy,2141
drinks,1143
fruit,2810
spices,1351


In [22]:
# Normalise the per-section entrance counts so they sum to 1, giving the
# probability of each section being a customer's first location.
total_entries = int(customers_in_first_sections['customer_id'].sum())
first_probabilities = customers_in_first_sections.div(total_entries)
first_probabilities

Unnamed: 0_level_0,customer_id
location,Unnamed: 1_level_1
dairy,0.287576
drinks,0.153526
fruit,0.377435
spices,0.181464


### 1. The total number of customers in each section

In [23]:
# Rows recorded per section over the whole week, smallest first.
# NOTE: this counts rows, so a customer_id that appears several times in the
# same section is counted once per row.
section_sum = (
    df.groupby('location')
    .agg({'customer_id': 'count'})
    .sort_values(by='customer_id')
)
section_sum

Unnamed: 0_level_0,customer_id
location,Unnamed: 1_level_1
spices,3754
drinks,3905
dairy,4679
fruit,5122
checkout,7417


In [24]:
# Daily average of customers per section.
# The divisor is the number of distinct calendar days present in the timestamp
# index rather than a hard-coded 5, so the average stays correct if more or
# fewer daily files are loaded.
section_avg = (
    (section_sum / df.index.normalize().nunique())
    .round(0)
    .astype(int)
)
section_avg

Unnamed: 0_level_0,customer_id
location,Unnamed: 1_level_1
spices,751
drinks,781
dairy,936
fruit,1024
checkout,1483


### 2. Number of customers per section over time

In [26]:
# Rows per (time of day, section), summed over all loaded days.
df.groupby(['time', 'location']).agg({'customer_id': 'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_id
time,location,Unnamed: 2_level_1
07:00:00,dairy,3
07:00:00,drinks,3
07:00:00,fruit,4
07:01:00,checkout,2
07:01:00,dairy,4
...,...,...
21:50:00,checkout,2
21:50:00,dairy,3
21:50:00,drinks,1
21:50:00,fruit,2


In [27]:
# Average number of customers per section for each minute of the day.
# Counts are summed across days per (time, location) and divided by the number
# of distinct calendar days in the index instead of a hard-coded 5.
avg_customers_section = (
    df.groupby(['time', 'location'])[['customer_id']].count()
    / df.index.normalize().nunique()
)
avg_customers_section

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_id
time,location,Unnamed: 2_level_1
07:00:00,dairy,0.6
07:00:00,drinks,0.6
07:00:00,fruit,0.8
07:01:00,checkout,0.4
07:01:00,dairy,0.8
...,...,...
21:50:00,checkout,0.4
21:50:00,dairy,0.6
21:50:00,drinks,0.2
21:50:00,fruit,0.4


In [28]:
# Rows per exact timestamp and section; 'timestamp' refers to the index level,
# 'location' to the column.
customers_section = (
    df.groupby(['timestamp', 'location'])
    .agg({'customer_id': 'count'})
)
customers_section

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_id
timestamp,location,Unnamed: 2_level_1
2019-09-02 07:03:00,dairy,2
2019-09-02 07:04:00,dairy,2
2019-09-02 07:04:00,fruit,1
2019-09-02 07:04:00,spices,3
2019-09-02 07:05:00,checkout,2
...,...,...
2019-09-06 21:50:00,checkout,2
2019-09-06 21:50:00,dairy,1
2019-09-06 21:50:00,drinks,1
2019-09-06 21:50:00,fruit,1


### 3. Number of customers at checkout over time