In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
monday_data = pd.read_csv('./data/monday.csv', sep=';')
tuesday_data = pd.read_csv('./data/tuesday.csv', sep=';')
wednesday_data = pd.read_csv('./data/wednesday.csv', sep=';')
thursday_data = pd.read_csv('./data/thursday.csv', sep=';')
friday_data = pd.read_csv('./data/friday.csv', sep=';')

In [3]:
def create_missing_checkout(df):
    
    """
    For some of last customers are the checkouts missing. This function adds them.
    """
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    timestamp = df['timestamp'].iloc[-1] + timedelta(minutes=3) #selecting the last timestamp in the df for date and time and adds 3 minutes
    
    data_checkout = df.loc[df['location'] == 'checkout']
    customers_with_checkout = data_checkout['customer_no'].unique()
    customers_ids = df['customer_no'].unique()
    customers_without_checkout = np.setxor1d(customers_with_checkout, customers_ids) # function, which compares to arrays for no matching values
    
    for ids in customers_without_checkout:
        new_row = pd.DataFrame({'timestamp':timestamp, 'customer_no':ids, 'location':'checkout'}, index =[0])
        df = pd.concat([df, new_row]).reset_index(drop = True)
        
    return df

In [4]:
monday_data = create_missing_checkout(monday_data)
tuesday_data = create_missing_checkout(tuesday_data)
wednesday_data = create_missing_checkout(wednesday_data)
thursday_data = create_missing_checkout(thursday_data)
friday_data = create_missing_checkout(friday_data)

In [5]:
tuesday_ids = np.arange((monday_data['customer_no'].max())+1, (monday_data['customer_no'].max())+(tuesday_data['customer_no'].max())+1)
tuesday_ids

array([1448, 1449, 1450, ..., 2867, 2868, 2869])

In [6]:
wednesday_ids = np.arange(tuesday_ids[-1]+1, tuesday_ids[-1]+tuesday_data['customer_no'].max()+1)
wednesday_ids

array([2870, 2871, 2872, ..., 4289, 4290, 4291])

In [7]:
thursday_ids = np.arange(wednesday_ids[-1]+1, wednesday_ids[-1]+wednesday_data['customer_no'].max()+1)
thursday_ids

array([4292, 4293, 4294, ..., 5820, 5821, 5822])

In [8]:
friday_ids = np.arange(thursday_ids[-1]+1, thursday_ids[-1]+thursday_data['customer_no'].max()+1)
friday_ids

array([5823, 5824, 5825, ..., 7355, 7356, 7357])

In [9]:
def replace_customer_ids(df, day_ids):
    
    """
    The customer id 'customer_no' resets each day. This functions makes it unique for all days. The daily IDs are needed.
    """
    
    value = 1
    i = 0

    for customer_id in day_ids:
        df.loc[ df['customer_no'] == value, 'customer_no'] = day_ids[i]
        value = value + 1
        i = i + 1
    
    return df

In [10]:
tuesday_data = replace_customer_ids(tuesday_data, tuesday_ids)
wednesday_data = replace_customer_ids(wednesday_data, wednesday_ids)
thursday_data = replace_customer_ids(thursday_data, thursday_ids)
friday_data = replace_customer_ids(friday_data, friday_ids)

In [11]:
data = pd.concat([monday_data, tuesday_data, wednesday_data, thursday_data, friday_data]).reset_index(drop = True)
data

Unnamed: 0,timestamp,customer_no,location
0,2019-09-02 07:03:00,1,dairy
1,2019-09-02 07:03:00,2,dairy
2,2019-09-02 07:04:00,3,dairy
3,2019-09-02 07:04:00,4,dairy
4,2019-09-02 07:04:00,5,spices
...,...,...,...
24900,2019-09-06 21:53:00,7325,checkout
24901,2019-09-06 21:53:00,7327,checkout
24902,2019-09-06 21:53:00,7328,checkout
24903,2019-09-06 21:53:00,7331,checkout


In [13]:
data.to_csv("./data/supermarket_data.csv", index=False)