In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load every raw Olist CSV used by this notebook, one DataFrame per file.
# The order of the names on the left matches the order of the paths below.
orders, payments, reviews, items, sellers, products, translations = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Olist/olist_orders_dataset.csv',
        'Olist/olist_order_payments_dataset.csv',
        'Olist/olist_order_reviews_dataset.csv',
        'Olist/olist_order_items_dataset.csv',
        'Olist/olist_sellers_dataset.csv',
        'Olist/olist_products_dataset.csv',
        'Olist/product_category_name_translation.csv',
    )
)

In [3]:
# Variables to Add:
#     Product Total Sales
#     Product Total Revenue
#     Product Average Price Per Order
#     Product Average Frequency
#     Product Average Review Score

In [4]:
# Translate product categories
#
# Attach the English category name to every product so the rest of the
# notebook can work with English labels instead of the Portuguese ones.
# ------------------------------------------------------------------------
translation_lookup = translations.set_index('product_category_name')
products = (
    products
    .set_index('product_category_name')
    .join(translation_lookup)
    .reset_index()
)
# Remove every row that has a missing value in ANY column; this includes
# products whose category has no English translation.
products.dropna(inplace=True)
# Discard the Portuguese column. pop() returns the removed column, so it is
# also what this cell displays.
products.pop('product_category_name')

0        agro_industria_e_comercio
1        agro_industria_e_comercio
2        agro_industria_e_comercio
3        agro_industria_e_comercio
4        agro_industria_e_comercio
                   ...            
32336        utilidades_domesticas
32337        utilidades_domesticas
32338        utilidades_domesticas
32339        utilidades_domesticas
32340        utilidades_domesticas
Name: product_category_name, Length: 32327, dtype: object

In [5]:
# Seed the per-category summary table: one row per distinct English category
# name. The single column is still unnamed here (pandas labels it 0).
english_categories = products['product_category_name_english'].unique()
Products = pd.DataFrame(english_categories)

In [6]:
# Total Products Sold
#
# Count how many order items were sold in each English product category and
# store the result in Products as `total_orders`.
# NOTE: `total_orders` counts order *items* (rows of `items`), not distinct
# orders; the column name is kept so later cells and the exported CSV are
# unaffected.
# ------------------------------------------------------------------------

# 1) Attach the English category to every order item. A left merge keeps all
#    items; those whose product has no (translated) category get NaN.
category_by_product = products[['product_id', 'product_category_name_english']]
items = items.merge(category_by_product, on='product_id', how='left')

#    Fix: the original `items.dropna()` threw its result away, so the rows
#    without a category were never removed. Drop them explicitly. The
#    per-category aggregates are unchanged because groupby already ignores
#    NaN keys, but `items` now really contains only categorised items.
items = items.dropna(subset=['product_category_name_english'])

#    Number of items sold per category, as a two-column frame
#    (`product`, `total_orders`).
product_counts = (
    items.groupby('product_category_name_english')['product_id']
    .count()
    .rename('total_orders')
    .rename_axis('product')
    .reset_index()
)

# 2) Name the category column of Products and bring the counts in.
Products = Products.rename(columns={0: 'product_name'})
Products = (
    Products
    .merge(product_counts, how='left', left_on='product_name', right_on='product')
    .drop(columns='product')  # duplicate of product_name after the merge
)

In [7]:
# Total Product Revenues
#
# Sum the item prices per English product category and store the result in
# Products as `total_revenue`.
# ------------------------------------------------------------------------

# 1) Revenue per category: sum of `price` over all items in that category.
#    (The original also renamed a `product_id` column here, but this frame
#    only has `price`, so that rename did nothing and has been removed.)
product_revenue = (
    items.groupby('product_category_name_english')['price']
    .sum()
    .reset_index()
)

# 2) Bring the revenue into Products under its final column name.
Products = (
    Products
    .merge(
        product_revenue,
        how='left',
        left_on='product_name',
        right_on='product_category_name_english',
    )
    .drop(columns='product_category_name_english')  # duplicate of product_name
    .rename(columns={'price': 'total_revenue'})
)

In [8]:
# Average Price per Order ratio
#
# For every category, divide its total revenue by its `total_orders` count
# and keep the ratio as a new column.
# ------------------------------------------------------------------------
price_per_order = Products['total_revenue'].div(Products['total_orders'])
Products['price_per_order'] = price_per_order

In [9]:
# Product Frequency
#
# Average time, in days, between consecutive purchases of items in the same
# product category. Smaller values mean the category sells more often.
# ------------------------------------------------------------------------

# 1) Bring each order's purchase time onto its items.
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
order_date = orders[['order_id', 'order_purchase_timestamp']]
items = (
    items
    .merge(order_date, on='order_id', how='left')
    .drop_duplicates()
    .rename(columns={
        'order_purchase_timestamp': 'product_purchase_timestamp',
        # later cells group on `product_name`
        'product_category_name_english': 'product_name',
    })
)

# 2) Gap between each purchase and the previous purchase in the same category.
#    Rows are sorted by time within a category, so every gap is non-negative
#    and the first purchase of each category has no gap (NaN).
items2 = items.sort_values(by=['product_name', 'product_purchase_timestamp'])
purchase_gaps = items2.groupby('product_name')['product_purchase_timestamp'].diff()

#    Fix: the original used `.dt.days`, which truncates every gap to whole
#    days *before* averaging. Any gap shorter than 24 hours became 0, so the
#    mean was biased towards zero for frequently bought categories. Dividing
#    by a one-day Timedelta keeps the fractional part.
items2['time_since_last'] = purchase_gaps / pd.Timedelta(days=1)

#    Mean gap per category, indexed by product_name.
average_frequency = (
    items2.groupby('product_name')['time_since_last']
    .mean()
    .rename('average_frequency')
    .to_frame()
)

# 3) Move average frequency into Products.
Products = Products.merge(
    average_frequency, how='left', left_on='product_name', right_index=True
)

In [10]:
# Average Product Score
#
# Mean review score of the items sold in each product category, as a rough
# measure of customer sentiment towards that category.
# ------------------------------------------------------------------------

# 1) Move review scores onto orders, then from orders onto items.
#    An order with several reviews produces several rows; rows that are
#    identical in every column are collapsed by drop_duplicates().
orders = (
    orders
    .merge(reviews[['order_id', 'review_score']], on='order_id', how='left')
    .drop_duplicates()
)
items = (
    items
    .merge(orders[['order_id', 'review_score']], on='order_id', how='left')
    .drop_duplicates()
)

# 2) Mean review score per category, joined into Products.
#    NOTE: the original called `.rename(columns={'review_score':
#    'average_review_score'})` without keeping the result, so the column was
#    never renamed and has always been exported as `review_score`. The dead
#    call is removed and the existing column name is kept, so consumers of
#    the exported CSV see the same schema.
average_review_score = (
    items.groupby('product_name')['review_score']
    .mean()
    .reset_index()
)
Products = Products.merge(average_review_score, how='left', on='product_name')

In [11]:
# Save the per-category Products summary table to a clean csv file.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first (unnamed) column of the file.
Products.to_csv('CleanedDatasets/products_c.csv')