In [None]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the retail CSV from its public URL. Later cells read the columns
# order_date, item_price, quantity, customer_id, brand and product_id from it.
RETAIL_DATA_URL = 'https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/retail_raw_reduced.csv'
dataset = pd.read_csv(RETAIL_DATA_URL)

In [None]:
# Derive a 'YYYY-MM' month key from the 'YYYY-MM-DD' order_date strings.
# Parsing the whole column at once with an explicit format replaces the
# per-row datetime.strptime call while still rejecting malformed dates.
dataset['order_month'] = (
    pd.to_datetime(dataset['order_date'], format='%Y-%m-%d')
    .dt.strftime('%Y-%m')
)

In [None]:
# GMV per row is the unit price multiplied by the quantity on that row.
dataset['gmv'] = dataset['item_price'].mul(dataset['quantity'])

In [None]:
# Number of distinct customers per order date, restricted to December 2019.
december_orders = dataset[dataset['order_month'] == '2019-12']
daily_customers = december_orders.groupby(['order_date'])['customer_id'].nunique()

# Line chart of the daily counts, with the y-axis anchored at zero.
plt.figure(figsize=(14, 5))
daily_customers.plot(color='#d24dff', marker='.', linewidth=1)
plt.title(
    'Daily Number of Customers - December 2019',
    loc='center',
    pad=20,
    fontsize=20,
    color='#600080',
)
plt.xlabel('Order Date', fontsize=15, color='#600080')
plt.ylabel('Number of Customers', fontsize=15, color='#600080')
plt.grid(color='grey', linestyle=':', linewidth=0.5)
plt.ylim(bottom=0)

**Case 1 "Cara Mendapatkan Brand Top 5"**

In [None]:
# Row mask for December 2019, computed once and reused for both steps below.
is_dec_2019 = dataset['order_month'] == '2019-12'

# Total quantity sold per brand in December 2019; keep the five largest.
brand_quantity_dec = dataset[is_dec_2019].groupby('brand')['quantity'].sum().reset_index()
top_brands = brand_quantity_dec.sort_values(by='quantity', ascending=False).head(5)

# All December 2019 rows whose brand is one of those five.
in_top_brands = dataset['brand'].isin(top_brands['brand'].to_list())
dataset_top5brand_dec = dataset[is_dec_2019 & in_top_brands]

print(top_brands)

**Case 2 "Multi-Line Chart Daily Quantity Untuk Brand Top 5"**

In [None]:
# Daily quantity sold, pivoted so each top-5 brand becomes its own column
# (one line per brand in the chart).
daily_quantity_by_brand = (
    dataset_top5brand_dec
    .groupby(['order_date', 'brand'])['quantity']
    .sum()
    .unstack()
)
daily_quantity_by_brand.plot(marker='.', cmap='plasma')

plt.title(
    'Daily Sold Quantity Dec 2019-Breakdown by Brands',
    loc='center',
    pad=30,
    fontsize=20,
    color='black',
)
plt.xlabel('Order Date', fontsize=12)
plt.ylabel('Quantity', fontsize=12)
plt.grid(color='gray', linestyle=':', linewidth=0.5)
plt.ylim(bottom=0)
# Legend is anchored just outside the right edge of the axes.
plt.legend(loc='upper center', bbox_to_anchor=(1.1, 1), shadow=True, ncol=1)
# Arrow marking a spike ("Terjadi lonjakan"); the coordinates are hard-coded.
# NOTE(review): presumably tuned to this dataset by eye - confirm if the data changes.
plt.annotate(
    'Terjadi lonjakan',
    xy=(7, 310),
    xytext=(8, 300),
    weight='bold',
    color='black',
    arrowprops={'arrowstyle': '->', 'connectionstyle': "arc3", 'color': 'black'},
)
# The figure is created by DataFrame.plot above, so it is resized afterwards.
plt.gcf().set_size_inches(10, 5)
plt.tight_layout()

**Case 3 Menentukan Jumlah Produk Untuk Masing-Masing Brand Yang Laku Selama Bulan Desember 2019**

In [None]:
# Distinct products sold per top-5 brand in December 2019, largest first.
sold_products_by_brand = dataset_top5brand_dec.groupby('brand')['product_id'].nunique()
sold_products_by_brand = sold_products_by_brand.sort_values(ascending=False)

plt.figure(figsize=(14, 5))
sold_products_by_brand.plot(kind='bar', color='#ffa64d')
plt.title(
    'Number of Sold Products per Brand, December 2019',
    loc='center',
    pad=20,
    fontsize=15,
    color='#e67300',
)
plt.xlabel('Brand', fontsize=15, color="#e67300")
plt.ylabel('Number of Products', fontsize=15, color="#e67300")
plt.ylim(bottom=0)
# Keep the brand names horizontal.
plt.xticks(rotation=0)

**Case 4 Penjualan Produk di Atas 100 dan di Bawah 100 Selama Desember 2019, Gunakan Stacked Chart**

Membuat Data Frame Baru Untuk Agregat Jumlah Quantity Terjual Setiap Produk

In [None]:
# Total quantity sold for each (brand, product_id) pair, flattened back into
# a regular DataFrame with columns brand, product_id and quantity.
quantity_per_brand_product = dataset_top5brand_dec.groupby(['brand', 'product_id'])['quantity'].sum()
dataset_top5brand_dec_per_product = quantity_per_brand_product.reset_index()

Memberi Kolom Baru Untuk Produk yang terjual >= 100 dan <100

In [None]:
# Label each product by whether its total sold quantity reached 100.
# A vectorized comparison replaces the per-row lambda; the labels are unchanged.
sold_at_least_100 = dataset_top5brand_dec_per_product['quantity'].ge(100)
dataset_top5brand_dec_per_product['quantity_group'] = sold_at_least_100.map(
    {True: '>= 100', False: '< 100'}
)

# Order products from best- to worst-selling. Rebinding the name (instead of
# inplace=True) keeps the cell chainable and avoids in-place mutation.
dataset_top5brand_dec_per_product = dataset_top5brand_dec_per_product.sort_values(
    'quantity', ascending=False
)

Membuat Referensi Pengurutan Brand Berdasarkan Banyaknya Semua Produk

In [None]:
# Brands ordered by how many distinct products they sold (descending).
# The resulting index is used as the brand ordering for the stacked bar chart.
product_count_by_brand = dataset_top5brand_dec_per_product.groupby('brand')['product_id'].nunique()
s_sort = product_count_by_brand.sort_values(ascending=False)

Plot Stacked Barchart

In [None]:
# Distinct products per (brand, quantity_group), with brands re-ordered to
# follow s_sort and the quantity groups pivoted into columns for stacking.
products_by_brand_and_group = (
    dataset_top5brand_dec_per_product
    .groupby(['brand', 'quantity_group'])['product_id']
    .nunique()
    .reindex(index=s_sort.index, level='brand')
    .unstack()
)
products_by_brand_and_group.plot(kind='bar', stacked=True)

plt.title(
    'Number of Sold Products per Brand, December 2019',
    loc='center',
    pad=30,
    fontsize=15,
    color='#002266',
)
plt.xlabel('Brand', fontsize=12, color="#002266")
plt.ylabel('Number of Products', fontsize=12, color="#002266")
plt.ylim(bottom=0)
# Keep the brand names horizontal.
plt.xticks(rotation=0)

**Case 5 Gunakan Histogram Untuk Melihat Distribusi Harga Produk-Produk Yang Ada di Top 5 Brand**

In [None]:
# Median item price of every product sold by the top-5 brands in December 2019.
median_price_per_product = dataset_top5brand_dec.groupby('product_id')['item_price'].median()

# Histogram with 20 bins spanning 1 to 2,000,000; the x-axis is shown from 0.
plt.figure(figsize=(14, 5))
plt.hist(
    median_price_per_product,
    bins=20,
    stacked=True,
    range=(1, 2000000),
    color='#1affd1',
)
plt.title(
    'Distribution of Price Median per Product\nTop 5 Brands in Dec 2019',
    fontsize=15,
    color='#008066',
)
plt.xlabel('Price Median', fontsize=12, color="#008066")
plt.ylabel('Number of Products', fontsize=12, color="#008066")
plt.xlim(left=0, right=2000000)

**Case 6 Korelasi Quantity VS GMV**

In [None]:
# Per-product totals for the top-5 brands in December 2019:
# summed quantity, summed GMV and median item price.
data_per_product_top5brand_dec = (
    dataset_top5brand_dec
    .groupby('product_id')
    .agg({'quantity': 'sum', 'gmv': 'sum', 'item_price': 'median'})
    .reset_index()
)

# Scatter of total quantity (x) against total GMV (y), one marker per product.
plt.figure(figsize=(14,5))
plt.scatter(data_per_product_top5brand_dec['quantity'],data_per_product_top5brand_dec['gmv'], marker='+', color='#00e600')
plt.title('Correlation of Quantity and GMV per Product\nTop 5 Brands in December 2019',fontsize=15, color='#008000')
plt.xlabel('Quantity', fontsize = 12, color="#008000")
plt.ylabel('GMV (in Millions)',fontsize = 12, color="#008000")
plt.xlim(xmin=0,xmax=300)
plt.ylim(ymin=0,ymax=200000000)

# plt.yticks() returns (locations, labels) in that order. The original code
# unpacked them with the names swapped; only the locations are needed here.
# Relabel each tick in millions to match the y-axis label.
tick_locations, _tick_labels = plt.yticks()
plt.yticks(tick_locations, (tick_locations/1000000).astype(int))

**Case 6B Korelasi Median Harga VS Quantity**

In [None]:
# matplotlib.pyplot is already imported as plt in the notebook's import cell,
# so the duplicate import was removed. The stray plt.clf() was removed too:
# called before plt.figure(), it creates and clears an extra empty figure.

# Per-product totals for the top-5 brands in December 2019:
# summed quantity, summed GMV and median item price.
# Recomputed here so this cell does not depend on the previous chart's cell.
data_per_product_top5brand_dec = (
    dataset_top5brand_dec
    .groupby('product_id')
    .agg({'quantity': 'sum', 'gmv': 'sum', 'item_price': 'median'})
    .reset_index()
)

# Scatter of median price (x) against total quantity (y), one marker per product.
plt.figure(figsize=(14,5))
plt.scatter(data_per_product_top5brand_dec['item_price'],data_per_product_top5brand_dec['quantity'], marker='o', color='#ff99e6')
# Title corrected: this chart shows Price Median vs Quantity, not Quantity vs GMV.
plt.title('Correlation of Price Median and Quantity per Product\n Top 5 Brands in December 2019',fontsize=15, color='#b30086')
plt.xlabel('Price Median', fontsize = 12, color="#b30086")
plt.ylabel('Quantity',fontsize = 12, color="#b30086")
plt.xlim(xmin=0,xmax=2000000)
plt.ylim(ymin=0,ymax=250)