In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Dataset Imports
# Every Olist CSV is read into its own dataframe. The file paths are listed
# in the same order as the names they are unpacked into.
(customers, orders, payments, reviews,
 items, products, translations) = map(pd.read_csv, (
    'Olist/olist_customers_dataset.csv',
    'Olist/olist_orders_dataset.csv',
    'Olist/olist_order_payments_dataset.csv',
    'Olist/olist_order_reviews_dataset.csv',
    'Olist/olist_order_items_dataset.csv',
    'Olist/olist_products_dataset.csv',
    'Olist/product_category_name_translation.csv',
))

In [3]:
# Order Setup

# The customers table is keyed by customer_id, which (per the original
# author's notes) identifies an order rather than a person; the per-person
# key is customer_unique_id. To aggregate orders per person we therefore:
#   * attach customer_unique_id to every order via its customer_id, and
#   * reduce the customers table to one row per customer_unique_id.
# A placeholder first_order column is also created on orders (all False);
# it is filled in properly by the next cell.
# ------------------------------------------------------------------------
customer_key_map = customers.set_index('customer_id')[['customer_unique_id']]

orders['first_order'] = False
# After this join, orders is indexed by customer_id.
orders = orders.set_index('customer_id').join(customer_key_map)

# Keep the last row seen for each unique customer.
customers = customers.drop_duplicates(subset=['customer_unique_id'], keep='last')

In [4]:
# Getting first_order & new_customer

# We want to determine whether a customer is a new customer, which may
# help explain different purchase behavior.
# ------------------------------------------------------------------------

# 1) Sort orders by customer_unique_id and then by order_purchase_timestamp,
#    so the first row of each customer is their earliest order. The order ids
#    of those earliest orders are saved into first_orders_id.
orders = orders.sort_values(by=['customer_unique_id','order_purchase_timestamp'])
first_orders_id = orders.drop_duplicates(subset=['customer_unique_id'], keep='first')['order_id']

# 2) Flag every order whose id is one of those earliest-order ids.
orders['first_order'] = orders['order_id'].isin(first_orders_id)

# 3) Keep only the records in which first_order is True.
first_orders_ds = orders.loc[orders['first_order']]

# 4) Attach each customer's first purchase timestamp to the customers
#    dataset, matching on customer_unique_id.
first_purchase = first_orders_ds[['customer_unique_id','order_purchase_timestamp']].set_index('customer_unique_id')
customers = customers.set_index('customer_unique_id').join(first_purchase)

# 5) latest_date is the reference "end of data" date chosen by the original
#    author. new_customer is 1 when the first purchase happened FEWER than
#    90 days (about 3 months) before latest_date, and 0 otherwise. Customers
#    without a first purchase timestamp compare as False and get 0.
latest_date = dt.datetime.strptime('2018-09-09 23:59:59', '%Y-%m-%d %H:%M:%S')
customers['order_purchase_timestamp'] = pd.to_datetime(customers['order_purchase_timestamp'])
days_since_first_order = (latest_date - customers['order_purchase_timestamp']).dt.days
customers['new_customer'] = (days_since_first_order < 90).astype(int)

# The timestamp column is exposed as first_order on the customers dataset,
# and customer_unique_id goes back to being a regular column.
customers = customers.rename(columns={'order_purchase_timestamp':'first_order'}).reset_index()

In [5]:
# Getting Total Orders

# The number of orders placed by each customer measures that customer's
# order contribution.
# ------------------------------------------------------------------------

# 1) Count order ids per customer in the orders dataset.
total_orders = (
    orders.groupby('customer_unique_id')['order_id']
    .count()
    .rename('total_orders')
    .reset_index()
)
# 2) Bring the counts into the customers dataset, matching on
#    customer_unique_id, and return to a plain positional index.
customers = (
    customers.set_index('customer_unique_id')
    .join(total_orders.set_index('customer_unique_id'))
    .reset_index()
)

In [6]:
# Getting Average Score

# The average score across all of a customer's orders lets us measure their
# overall sentiment.
# ------------------------------------------------------------------------

# 1) Move review_score into the orders dataframe. An order can match more
#    than one review row, so only the last joined row per order is kept.
orders = (
    orders.set_index('order_id')
    .join(reviews[['order_id','review_score']].set_index('order_id'))
    .reset_index()
)
orders = orders.drop_duplicates(subset=['order_id'], keep='last')

# 2) Average the review score per customer. The column intentionally keeps
#    the name 'review_score': the previous rename here discarded its result,
#    and later cells read customers.review_score.
average_review_score = (
    orders.groupby('customer_unique_id')['review_score']
    .mean()
    .reset_index()
)

# 3) Join into customers. The leading reset_index() leaves the old positional
#    index behind as an 'index' column; a later cell pops that column, so it
#    is kept on purpose.
customers = (
    customers.reset_index()
    .set_index('customer_unique_id')
    .join(average_review_score.set_index('customer_unique_id'))
    .reset_index()
)

In [7]:
# Getting Total Price

# Total price lets us look at the total revenue a customer has contributed.
# ------------------------------------------------------------------------

# 1) Move payment_value from payments into orders. An order can match several
#    payment rows, so this join can repeat an order once per payment.
orders = (
    orders.set_index('order_id')
    .join(payments[['order_id','payment_value']].set_index('order_id'))
    .reset_index()
)

# 2) Sum payment_value per customer BEFORE collapsing orders back to one row
#    per order, so that every payment row is counted. total_price is a
#    one-column dataframe indexed by customer_unique_id.
total_price = (
    orders.groupby('customer_unique_id')['payment_value']
    .sum()
    .rename('total_price')
    .to_frame()
)

# 3) Restore one row per order (the last joined payment row is the one kept).
orders = orders.drop_duplicates(subset=['order_id'], keep='last')

# 4) Join the totals into the customers dataframe.
customers = customers.set_index('customer_unique_id').join(total_price).reset_index()

In [8]:
# Translations of Products

# Attach the English category name to every product so later analysis does
# not have to work with the Portuguese names.
# ------------------------------------------------------------------------
category_lookup = translations.set_index('product_category_name')
products = (
    products.set_index('product_category_name')
    .join(category_lookup)
    .reset_index()
)
# Rows with a missing value in ANY column are removed, not only rows lacking
# a translation.
products = products.dropna()
# Remove the Portuguese column; the popped column is what the cell displays.
products.pop('product_category_name')

0        agro_industria_e_comercio
1        agro_industria_e_comercio
2        agro_industria_e_comercio
3        agro_industria_e_comercio
4        agro_industria_e_comercio
                   ...            
32336        utilidades_domesticas
32337        utilidades_domesticas
32338        utilidades_domesticas
32339        utilidades_domesticas
32340        utilidades_domesticas
Name: product_category_name, Length: 32327, dtype: object

In [9]:
# Getting Favorite Product: <favorite_product> NOT USED

# A customer's favorite product may allow for some interesting results
# when analyzing the aggregated clusters. It is the product category that
# the customer has purchased most often. When several categories tie for
# the top count, the one that sorts first is chosen, because Series.mode()
# returns its values sorted and the first is taken (it is NOT the earliest
# purchase).
# ------------------------------------------------------------------------

# 1) Move the English category name from products into the items dataframe.
category_by_product = products[['product_id','product_category_name_english']].set_index('product_id')
items = items.set_index('product_id').join(category_by_product).reset_index()

# 2) Move customer_unique_id from orders into the items dataframe.
customer_by_order = orders[['order_id','customer_unique_id']].set_index('order_id')
items = items.set_index('order_id').join(customer_by_order).reset_index()

# 3) Impute missing categories with 'unknown'. Only the category column is
#    filled, so missing values elsewhere (notably the customer_unique_id join
#    key) are not turned into the literal string 'unknown'.
items['product_category_name_english'] = items['product_category_name_english'].fillna('unknown')

# 4) Get the most common product category per customer.
favorite_products = (
    items.groupby('customer_unique_id')['product_category_name_english']
    .agg(lambda categories: categories.mode().iloc[0])
    .rename('favorite_product')
    .to_frame()
)

# 5) Move <favorite_product> into the customers dataframe.
customers = customers.set_index('customer_unique_id').join(favorite_products).reset_index()

In [10]:
# Getting Frequency Variables: <average_frequency> <came_back> NOT USED

# average_frequency is the mean number of days between a customer's
# consecutive orders. came_back is a boolean-like flag telling whether the
# customer has placed more than one order.
# ------------------------------------------------------------------------
# Value imputed for customers with a single order (no gap can be computed).
# The original author chose 735 as the number of days the dataset spans.
DATASET_SPAN_DAYS = 735

# 1) Convert order dates into pandas datetime objects.
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])

# 2) Keep a copy of orders sorted by customer and then purchase time.
orders2 = orders.sort_values(by=['customer_unique_id','order_purchase_timestamp'])

# 3) came_back: True when the customer has more than one order. It is a
#    one-column dataframe indexed by customer_unique_id.
came_back = (
    (orders2.groupby('customer_unique_id')['order_purchase_timestamp'].count() > 1)
    .rename('came_back')
    .to_frame()
)

# 4) Attach came_back to every order, then compute the gap between each
#    order and the same customer's previous order. diff() depends on row
#    order, so the rows are sorted chronologically per customer right here
#    instead of trusting that earlier joins preserved the ordering. With
#    sorted rows every gap is non-negative, so no abs() is needed.
orders = orders.set_index('customer_unique_id').join(came_back).reset_index()
repeat_orders = (
    orders.loc[orders['came_back'] == True]
    .sort_values(by=['customer_unique_id','order_purchase_timestamp'])
)

# 5) Reduce each gap to whole days and store it on orders. The assignment
#    aligns on the row index, so orders of one-time customers stay null.
time_dis = repeat_orders.groupby('customer_unique_id')['order_purchase_timestamp'].diff().dt.days
orders['time_since_last'] = time_dis

# 6) Average the gaps per customer and put came_back next to it. Both pieces
#    are already one row per customer, so a direct index join is enough.
average_frequency = (
    orders.groupby('customer_unique_id')['time_since_last']
    .mean()
    .rename('average_frequency')
    .to_frame()
    .join(came_back)
    .reset_index()
)

# 7) Join into customers. Customers with came_back False have a null
#    average_frequency (a single order has no gap); those are filled with
#    DATASET_SPAN_DAYS. The fill is assigned back to the column rather than
#    done in place on a column selection, which pandas does not guarantee to
#    write through to the parent dataframe.
customers = (
    customers.set_index('customer_unique_id')
    .join(average_frequency.set_index('customer_unique_id')[['average_frequency','came_back']])
    .reset_index()
)
# Drop the leftover positional 'index' column created by an earlier cell.
customers = customers.drop(columns=['index'], errors='ignore')
customers = customers.drop_duplicates(subset=['customer_unique_id'])
customers['average_frequency'] = customers['average_frequency'].fillna(DATASET_SPAN_DAYS)
customers['came_back'] = customers['came_back'].astype(int)
# Displayed only; the result is not assigned back to customers.
customers.reset_index()

Unnamed: 0,index,customer_unique_id,customer_id,customer_zip_code_prefix,customer_city,customer_state,first_order,new_customer,total_orders,review_score,total_price,favorite_product,average_frequency,came_back
0,0,861eff4711a542e4b93843c6dd7febb0,06b8999e2fba1a1fbc88172c00ba8bc7,14409,franca,SP,2017-05-16 15:05:35,0,1,4.0,146.87,office_furniture,735.0,0
1,1,290c77bc529b7ac935b93aa66c333dc3,18955e83d337fd6b2def6b18a428ac77,9790,sao bernardo do campo,SP,2018-01-12 20:48:24,0,1,5.0,335.48,housewares,735.0,0
2,2,060e732b5b29e8181a18229c7b0b2b5e,4e7b3e00288586ebd08712fdd0374a03,1151,sao paulo,SP,2018-05-19 16:07:45,0,1,5.0,157.73,office_furniture,735.0,0
3,3,259dac757896d24d7702b9acbbff3f3c,b2b6027bc5c5109e529d4dc6358b12c3,8775,mogi das cruzes,SP,2018-03-13 16:06:38,0,1,5.0,173.30,office_furniture,735.0,0
4,4,345ecd01c38d18a9036ed96c73b8d066,4f2d8ab171c80ec8364f7c12e35b23ad,13056,campinas,SP,2018-07-29 09:51:30,1,1,5.0,252.25,home_confort,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96091,96091,1a29b476fee25c95fbafc67c5ac95cf8,17ddf5dd5d51696bb3d7c6291687be6f,3937,sao paulo,SP,2018-04-07 15:48:17,0,1,4.0,88.78,books_general_interest,735.0,0
96092,96092,d52a67c98be1cf6a5c84435bd38d095d,e7b71a9017aa05c9a7fd292d714858e8,6764,taboao da serra,SP,2018-04-04 08:20:22,0,1,5.0,129.06,sports_leisure,735.0,0
96093,96093,e9f50caf99f032f0bf3c55141f019d99,5e28dfe12db7fb50a4b2f691faecea5e,60115,fortaleza,CE,2018-04-08 20:11:50,0,1,1.0,56.04,health_beauty,735.0,0
96094,96094,73c2643a0a458b49f58cea58833b192e,56b18e2166679b8a959d72dd06da27f9,92120,canoas,RS,2017-11-03 21:08:33,0,1,5.0,711.07,watches_gifts,735.0,0


In [11]:
# Persist the engineered customers dataset as a CSV file.
cleaned_customers_path = 'CleanedDatasets/customers_c.csv'
customers.to_csv(cleaned_customers_path)


In [12]:
# General customer statistics below:
# ------------------------------------------------------------------------

In [13]:
# Share of all customers whose came_back flag is 1
returning_mask = customers['came_back'] == 1
customers.loc[returning_mask, 'customer_unique_id'].count() / len(customers)

0.031187562437562436

In [14]:
# Fraction of summed total_price that comes from customers with came_back == 0
one_time_revenue = customers.loc[customers['came_back'] == 0, 'total_price'].sum()
one_time_revenue / customers['total_price'].sum()

0.9410312792229364

In [15]:
# Largest number of orders placed by any single customer
customers['total_orders'].max()

17

In [16]:
# Mean of the per-customer average review scores
customers['review_score'].mean()

4.069527095997687