## Import packages and data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time

In [2]:
marker = time.time()
data = pd.read_csv('data/data.csv', sep=";")
print(f'Read dataset in {round(time.time()-marker, 0)} seconds.')

Read dataset in 49.0 seconds.


## Initial Data Exploration

In [None]:
data.shape

In [3]:
data.head()

Unnamed: 0,Company Code,Order Number,Employee,Product,Product Category,Client,Client City,Sale Date Time,Product Cost,Discount Amount,Amount,Total,Form of payment
0,39000,12,Stacy Day,Special Gasoline,Fuel,Customer not informed,No City,2017-03-31 04:10:00,3.05,0.0,5.642,20.02,Money
1,39000,21,Olive Stevenson,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:13:00,2.51,0.0,125.045,350.0,Debit Card
2,39000,38,Stacy Day,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:25:00,2.51,0.0,35.699,99.92,Money
3,39000,39,Olive Stevenson,Lubricant 1108,Lubricant,Customer not informed,No City,2017-03-31 04:26:00,7.409,0.0,1.0,13.0,Money
4,39000,39,Olive Stevenson,Diesel Auto Clean,Fuel,Customer not informed,No City,2017-03-31 04:26:00,2.56,0.0,42.162,120.96,Money


In [4]:
# Make column names more Pythonic (snake-case)
data.rename(columns = lambda x: x.lower().replace(' ','_'), inplace=True)

for col in data.columns:
    print(f'{col.ljust(17)}: {data[col].nunique():,} unique vales.')

company_code     : 50 unique vales.
order_number     : 2,200,978 unique vales.
employee         : 976 unique vales.
product          : 7,584 unique vales.
product_category : 39 unique vales.
client           : 10,817 unique vales.
client_city      : 552 unique vales.
sale_date_time   : 1,251,244 unique vales.
product_cost     : 23,252 unique vales.
discount_amount  : 2,400 unique vales.
amount           : 200,811 unique vales.
total            : 99,676 unique vales.
form_of_payment  : 7 unique vales.


In [5]:
data.rename(columns={'client_city': 'city'}, inplace=True)

In [6]:
data.head()

Unnamed: 0,company_code,order_number,employee,product,product_category,client,city,sale_date_time,product_cost,discount_amount,amount,total,form_of_payment
0,39000,12,Stacy Day,Special Gasoline,Fuel,Customer not informed,No City,2017-03-31 04:10:00,3.05,0.0,5.642,20.02,Money
1,39000,21,Olive Stevenson,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:13:00,2.51,0.0,125.045,350.0,Debit Card
2,39000,38,Stacy Day,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:25:00,2.51,0.0,35.699,99.92,Money
3,39000,39,Olive Stevenson,Lubricant 1108,Lubricant,Customer not informed,No City,2017-03-31 04:26:00,7.409,0.0,1.0,13.0,Money
4,39000,39,Olive Stevenson,Diesel Auto Clean,Fuel,Customer not informed,No City,2017-03-31 04:26:00,2.56,0.0,42.162,120.96,Money


#### Data Fix: Aggregate Product Names
If we look at the data, we can see that we do not, in fact, have 7,584 unique products being sold.

Some products are listed multiple times, such as:
- Lubricant 1495
- Lubricant 4105
- Lubricant 250

This lambda function removes any trailing spaces and numbers.

In [7]:
data['product'] = data['product'].apply(lambda x: re.sub(' [0-9]+', '', x))

We can see that after aggregating the product name, we now only have 45 unique products being sold.

In [8]:
print(f'product: {data["product"].nunique()} unique values.')

product: 45 unique values.


##### Add a date column for easier reference

In [9]:
# To convert string datetime to integer value of YYYYMMDD
def convert_date(d):
    x = d.split(' ')[0].split('-')
    return int(f'{x[0]}{x[1]}{x[2]}')

data['date'] = data.sale_date_time.apply(convert_date)

In [15]:
# first arg kept empty because it's 'x'
def basket_id(x):
    order, city, date = x['order_number'], x['city'], x['date']
    return f'{order}{city.replace(" ","").lower()}{date}'

In [16]:
s = time.time()
data['basket_id'] = data.apply(lambda x: basket_id(x), axis=1)
print(f'Finished in {round(time.time()-s, 0)} seconds.')

Finished in (1066, 0) seconds.


In [19]:
data.head()

Unnamed: 0,company_code,order_number,employee,product,product_category,client,city,sale_date_time,product_cost,discount_amount,amount,total,form_of_payment,date,basket_id
0,39000,12,Stacy Day,Special Gasoline,Fuel,Customer not informed,No City,2017-03-31 04:10:00,3.05,0.0,5.642,20.02,Money,20170331,12no city20170331
1,39000,21,Olive Stevenson,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:13:00,2.51,0.0,125.045,350.0,Debit Card,20170331,21no city20170331
2,39000,38,Stacy Day,Special Diesel,Fuel,Customer not informed,No City,2017-03-31 04:25:00,2.51,0.0,35.699,99.92,Money,20170331,38no city20170331
3,39000,39,Olive Stevenson,Lubricant,Lubricant,Customer not informed,No City,2017-03-31 04:26:00,7.409,0.0,1.0,13.0,Money,20170331,39no city20170331
4,39000,39,Olive Stevenson,Diesel Auto Clean,Fuel,Customer not informed,No City,2017-03-31 04:26:00,2.56,0.0,42.162,120.96,Money,20170331,39no city20170331


In [20]:
data.to_csv("fixed_data.csv", index=False)

## Creating Binary Purchase/Product Vectors
Each vector signifies a 1 or 0 if the product exists in that basket

In [None]:
distance_function = lambda x: sqrt(2*(1-x))