In [6]:
# Import libraries
import numpy as np 
import pandas as pd 
import seaborn as sns; sns.set(rc={'figure.figsize':(16,9)})
import matplotlib.pyplot as plt
from scipy import stats 

In [7]:
# Load every raw CSV, in the order the target names are listed below.
# The two sequences are paired by position: keep them aligned when editing.
(
    olist_customers,
    olist_geolocation,
    olist_orders,
    olist_items,
    olist_order_payments,
    olist_reviews,
    olist_products,
    olist_sellers,
    olist_category,
) = [
    pd.read_csv(csv_name)
    for csv_name in (
        'olist_customers_dataset.csv',
        'olist_geolocation_dataset.csv',
        'olist_orders_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'olist_products_dataset.csv',
        'olist_sellers_dataset.csv',
        'product_category_name_translation.csv',
    )
]

In [9]:
# Build one wide frame by joining every table onto the orders table.
# NOTE(review): validate='m:m' is accepted by pandas but performs no check,
# and the final inner join discards rows whose product_category_name has no
# entry in the translation table.
olist_new = (
    olist_orders
    .merge(olist_items, on='order_id', how='left')
    .merge(olist_order_payments, on='order_id', how='outer', validate='m:m')
    .merge(olist_reviews, on='order_id', how='outer')
    .merge(olist_products, on='product_id', how='outer')
    .merge(olist_customers, on='customer_id', how='outer')
    .merge(olist_sellers, on='seller_id', how='outer')
    .merge(olist_category, on='product_category_name', how='inner')
)

olist_new.shape


(116576, 40)

In [None]:
# Explore data
def resumetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Its index may hold any labels; sample rows are
        taken by position, not by label.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``Name``, ``dtypes``,
        ``Missing`` (null count), ``Uniques`` (distinct non-null values),
        ``First Value`` / ``Second Value`` / ``Third Value`` (values of the
        first three rows, NaN when ``df`` has fewer rows) and ``Entropy``
        (base-2 entropy of the column's value frequencies, rounded to two
        decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Use positional access: label-based df.loc[0] raises KeyError as soon as
    # the index is not 0, 1, 2 (for example after rows have been dropped).
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = float('nan')

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

def cross_heatmap(df, cols, normalize=False, values=None, aggfunc=None):
    """Cross-tabulate two columns of ``df`` and shade the table in green.

    ``cols[0]`` supplies the rows and ``cols[1]`` the columns of the
    crosstab; ``normalize``, ``values`` and ``aggfunc`` are forwarded to
    ``pd.crosstab`` unchanged. Returns the pandas Styler produced by
    ``background_gradient``.
    """
    green_cmap = sns.light_palette("green", as_cmap=True)
    row_key, col_key = cols[0], cols[1]
    table = pd.crosstab(
        df[row_key],
        df[col_key],
        normalize=normalize,
        values=values,
        aggfunc=aggfunc,
    )
    return table.style.background_gradient(cmap=green_cmap)

# Display the per-column summary (dtype, missing count, unique count,
# first three rows, entropy) for the merged frame.
resumetable(olist_new)

In [None]:
# Customer Analysis
# What regions have high demand?
# In what months?

In [None]:
# Order lifecycle columns that are read from CSV as text
timestamp_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# Parse each of them into datetime values
for col in timestamp_cols:
    olist_new[col] = pd.to_datetime(olist_new[col])

# Break the purchase timestamp into calendar components for the demand analysis
purchase_ts = olist_new['order_purchase_timestamp']
purchase_parts = {
    'purchase_year': 'year',
    'purchase_month': 'month',
    'purchase_day': 'day',
    'purchase_day_of_week': 'dayofweek',
    'purchase_hour': 'hour',
}
for feature_name, dt_attr in purchase_parts.items():
    olist_new[feature_name] = getattr(purchase_ts.dt, dt_attr)


In [1]:
# Make sure every date column used by the delivery metrics is datetime typed.
# shipping_limit_date is the only one not parsed by the timestamp cell above;
# re-parsing the others is harmless.
delivery_date_cols = (
    'order_purchase_timestamp',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'shipping_limit_date',
    'order_delivered_carrier_date',
)
for date_col in delivery_date_cols:
    olist_new[date_col] = pd.to_datetime(olist_new[date_col])

NameError: name 'pd' is not defined

In [None]:
# Whole calendar days between the purchase and the estimated delivery date.
# normalize() truncates to midnight but keeps the datetime dtype, so the
# subtraction stays vectorised and yields a timedelta Series. Subtracting
# .dt.date columns instead does Python-level arithmetic on an object array
# and depends on pandas inferring a timedelta dtype before .dt.days.
olist_new['estimated_days'] = (
    olist_new['order_estimated_delivery_date'].dt.normalize()
    - olist_new['order_purchase_timestamp'].dt.normalize()
).dt.days

In [None]:
# Whole calendar days between the purchase and the delivery to the customer
# (NaN where the delivery date is missing). normalize() keeps the datetime
# dtype, which avoids the slow object-dtype arithmetic of .dt.date columns.
olist_new['arrival_days'] = (
    olist_new['order_delivered_customer_date'].dt.normalize()
    - olist_new['order_purchase_timestamp'].dt.normalize()
).dt.days

In [2]:
# Whole calendar days between hand-over to the carrier and delivery to the
# customer (NaN where either date is missing). normalize() keeps the datetime
# dtype, which avoids the slow object-dtype arithmetic of .dt.date columns.
olist_new['shipping_days'] = (
    olist_new['order_delivered_customer_date'].dt.normalize()
    - olist_new['order_delivered_carrier_date'].dt.normalize()
).dt.days

NameError: name 'olist_new' is not defined

In [None]:
# A negative shipping_days means the customer delivery date is earlier than
# the carrier hand-over date; remove those rows from the frame in place.
negative_shipping_mask = olist_new['shipping_days'] < 0
olist_new.drop(index=olist_new.loc[negative_shipping_mask].index, inplace=True)

In [None]:
# Days of slack between the shipping limit date and the actual hand-over to
# the carrier (>= 0 means the seller met the limit).
seller_to_carrier_days = (
    olist_new['shipping_limit_date'].dt.normalize()
    - olist_new['order_delivered_carrier_date'].dt.normalize()
).dt.days

# Classify the slack into 'OnTime/Early' and 'Late'. Rows where either date is
# missing stay NaN: a missing difference fails `>= 0`, so a plain if/else
# would report orders with no carrier date as 'Late'.
olist_new['seller_to_carrier_status'] = (
    seller_to_carrier_days.ge(0)
    .map({True: 'OnTime/Early', False: 'Late'})
    .where(seller_to_carrier_days.notna())
)

In [None]:
# Days between the estimated delivery date and the actual delivery date
# (>= 0 means the order arrived on or before the estimate).
arrival_slack_days = (
    olist_new['order_estimated_delivery_date'].dt.normalize()
    - olist_new['order_delivered_customer_date'].dt.normalize()
).dt.days

# Classify the slack into 'OnTime/Early' and 'Late'. Rows without a delivery
# date stay NaN: a missing difference fails `>= 0`, so a plain if/else would
# count undelivered orders as 'Late'.
olist_new['arrival_status'] = (
    arrival_slack_days.ge(0)
    .map({True: 'OnTime/Early', False: 'Late'})
    .where(arrival_slack_days.notna())
)

In [None]:
# Summary statistics for the three day-count columns computed above
olist_new[['estimated_days', 'arrival_days', 'shipping_days']].describe()

In [None]:
# olist_new holds one row per order item / payment / review combination, so
# counting its rows would count the same order several times. Keep a single
# row per order_id before counting "orders".
unique_orders = olist_new.drop_duplicates(subset='order_id')

# Number of orders per year
orders_per_year = unique_orders['purchase_year'].value_counts().sort_index()

# Number of orders per month
orders_per_month = unique_orders.groupby('purchase_month').size()

# Number of orders per day of the week
orders_per_day_of_week = unique_orders['purchase_day_of_week'].value_counts().sort_index()

# Number of orders per hour
orders_per_hour = unique_orders['purchase_hour'].value_counts().sort_index()

In [None]:
# 2x2 grid: one panel per time granularity
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_year, ax_month), (ax_weekday, ax_hour) = axes

# Orders per year
ax_year.bar(orders_per_year.index, orders_per_year.values, color='skyblue')
ax_year.set_title('Number of Orders per Year')
ax_year.set_xlabel('Year')
ax_year.set_ylabel('Number of Orders')

# Orders per month
sns.barplot(x=orders_per_month.index, y=orders_per_month.values, ax=ax_month)
ax_month.set_title('Number of Orders per Month')
ax_month.set_xlabel('Month')
ax_month.set_ylabel('Number of Orders')

# Orders per day of the week, with one tick for each of the seven days
ax_weekday.bar(orders_per_day_of_week.index, orders_per_day_of_week.values, color='lightgreen')
ax_weekday.set_title('Number of Orders per Day of the Week')
ax_weekday.set_xlabel('Day of the Week')
ax_weekday.set_ylabel('Number of Orders')
ax_weekday.set_xticks(range(7))

# Orders per hour of the day
sns.lineplot(x=orders_per_hour.index, y=orders_per_hour.values, ax=ax_hour, marker='o', color='orange')
ax_hour.set_title('Number of Orders per Hour')
ax_hour.set_xlabel('Hour')
ax_hour.set_ylabel('Number of Orders')

# Keep titles and labels of neighbouring panels from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:
# Bucket the high order_item_id values so the count plot stays readable.
# The copy is cast to object first: writing string labels into a numeric
# column is an incompatible-dtype assignment that pandas deprecates.
item_ids = olist_new['order_item_id']
olist_new['ord_new'] = item_ids.astype(object)

olist_new.loc[item_ids.isin([7, 8, 9, 10]), 'ord_new'] = '7 to 10'
# NOTE(review): this bucket holds every id greater than 10, so the label
# '10 to 20' overlaps the previous bucket and has no upper bound. The label
# is left unchanged in case later cells match on it.
olist_new.loc[item_ids > 10, 'ord_new'] = '10 to 20'

In [None]:
# Single-panel figure; the earlier subplot(211) left the lower half empty.
fig, ax = plt.subplots(figsize=(14, 5))

g = sns.countplot(x='ord_new', data=olist_new, ax=ax)
g.set_title("Order Item Id Distribution", fontsize=20)
# The x axis shows the bucketed order item ids, not state names.
g.set_xlabel("Order Item Id", fontsize=17)
g.set_ylabel("Count", fontsize=17)

# Annotate each bar with its share of all rows in olist_new
total_rows = len(olist_new)
sizes = []
for p in g.patches:
    height = p.get_height()
    sizes.append(height)
    g.text(p.get_x() + p.get_width() / 2.,
           height + 3,
           '{:1.2f}%'.format(height / total_rows * 100),
           ha="center", fontsize=12)
# Leave 10% headroom above the tallest bar for the annotations
g.set_ylim(0, max(sizes) * 1.1)

plt.show()

In [None]:

fig, ax = plt.subplots(figsize=(12, 6))

# Vertical count plot of rows per customer state. orient='h' was dropped
# because it contradicts passing the category on x, and the annotations below
# read bar heights.
g = sns.countplot(x='customer_state', data=olist_new, ax=ax)
g.set_title('Customer State Distribution', fontsize=12)
g.set_xlabel("State Name", fontsize=17)
g.set_ylabel("Count", fontsize=17)
# Rotate the existing tick labels rather than re-setting them by hand
g.tick_params(axis='x', labelrotation=45)

# Annotate each bar with its share of all rows in olist_new
total_rows = len(olist_new)
sizes = []
for p in g.patches:
    height = p.get_height()
    sizes.append(height)
    g.text(p.get_x() + p.get_width() / 2., height + 3,
           '{:1.2f}%'.format(height / total_rows * 100),
           ha="center", fontsize=12)
# Leave 10% headroom above the tallest bar for the annotations
g.set_ylim(0, max(sizes) * 1.1)

plt.show()


In [None]:
# High-level group -> raw category names that belong to it.
# NOTE(review): spellings such as 'fashio_female_clothing', 'home_confort' and
# 'costruction_tools_*' are kept verbatim; presumably they mirror the values
# in product_category_name_english. Verify against the data before "fixing".
_CATEGORY_GROUPS = {
    'Furniture': (
        'office_furniture', 'furniture_decor', 'furniture_living_room',
        'kitchen_dining_laundry_garden_furniture', 'bed_bath_table',
        'home_comfort', 'home_comfort_2', 'home_construction', 'garden_tools',
        'furniture_bedroom', 'furniture_mattress_and_upholstery',
    ),
    'Electronics': (
        'auto', 'computers_accessories', 'musical_instruments',
        'consoles_games', 'watches_gifts', 'air_conditioning', 'telephony',
        'electronics', 'fixed_telephony', 'tablets_printing_image',
        'computers', 'small_appliances_home_oven_and_coffee',
        'small_appliances', 'audio', 'signaling_and_security',
        'security_and_services',
    ),
    'Fashion': (
        'fashio_female_clothing', 'fashion_male_clothing',
        'fashion_bags_accessories', 'fashion_shoes', 'fashion_sport',
        'fashion_underwear_beach', 'fashion_childrens_clothes', 'baby',
        'cool_stuff',
    ),
    # 'garden_tools' used to be listed here as well, but it could never match
    # because the Furniture group is checked first; the dead entry is removed.
    'Home & Garden': (
        'housewares', 'home_confort', 'home_appliances', 'home_appliances_2',
        'flowers', 'costruction_tools_garden', 'construction_tools_lights',
        'costruction_tools_tools', 'luggage_accessories', 'la_cuisine',
        'pet_shop', 'market_place',
    ),
    'Entertainment': (
        'sports_leisure', 'toys', 'cds_dvds_musicals', 'music', 'dvds_blu_ray',
        'cine_photo', 'party_supplies', 'christmas_supplies',
        'arts_and_craftmanship', 'art',
    ),
    'Beauty & Health': ('health_beauty', 'perfumery', 'diapers_and_hygiene'),
    'Food & Drinks': ('food_drink', 'drinks', 'food'),
    'Books & Stationery': (
        'books_general_interest', 'books_technical', 'books_imported',
        'stationery',
    ),
    'Industry & Construction': (
        'construction_tools_construction', 'construction_tools_safety',
        'industry_commerce_and_business', 'agro_industry_and_commerce',
    ),
}

# Flattened raw name -> group table, so each call is a single dict lookup
# instead of a scan through up to nine lists.
_CATEGORY_LOOKUP = {
    raw_name: group
    for group, raw_names in _CATEGORY_GROUPS.items()
    for raw_name in raw_names
}


def classify_cat(x):
    """Map a raw product category name to its high-level group.

    Parameters
    ----------
    x : object
        Raw category name (normally a string; NaN or any other value is
        accepted).

    Returns
    -------
    str or None
        The group name, or ``None`` when ``x`` is not listed in any group.
        Unmapped values therefore show up as missing in the derived column.
    """
    try:
        return _CATEGORY_LOOKUP.get(x)
    except TypeError:
        # Unhashable input cannot be a dict key and is not a known category.
        return None

# Derive the high-level group for every row from its English category name
english_names = olist_new['product_category_name_english']
olist_new['product_category'] = english_names.apply(classify_cat)

In [None]:
# Row count per high-level product group (value_counts skips missing values
# by default, so categories that classify_cat did not map are not shown)
olist_new.product_category.value_counts()

In [None]:
# Count rows per product group once and reuse the result for both axes
category_counts = olist_new.product_category.value_counts()

plt.figure(figsize=[10, 6])
sns.barplot(
    x=category_counts.values,
    y=category_counts.index,
    palette='crest_r',
)
plt.title('Number of orders per each Category')
plt.xticks(rotation=45)
sns.despine()