# Clustering

In [1]:
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import cluster
import collections

sns.set(rc={'figure.figsize':(16,9)})
%matplotlib inline

## Local

In [2]:
path = 'dataset/*.csv'

## colab

In [3]:
from google.colab import drive

# Mount the folder "drive" on google drive to Colab Notebook
drive.mount('/content/drive')
path = '/content/drive/My Drive/wids-taipei/brazilian-ecommerce/'

# Change current working directory
os.chdir(path)

ModuleNotFoundError: No module named 'google.colab'

## Exploring the dataset


### About the dataset

This dataset has information about the customer and its location. Use it to identify unique customers in the orders dataset and to find the orders delivery location.

At our system each order is assigned to a unique `customer_id`. This means that the same customer will get different ids for different orders. The purpose of having a `customer_unique_id` on the dataset is to allow you to identify customers that made repurchases at the store. Otherwise you would find that each order had a different customer associated with.

- customer_id: key to the orders dataset. Each order has a unique customer_id.
- customer_unique_id: unique identifier of a customer.
- customer_zip_code_prefix: first five digits of customer zip code
- customer_city: customer city name
- customer_state: customer state

### Data Schema

The data is divided in multiple datasets for better understanding and organization. Please refer to the following data schema when working with it: 

![Data Schema](https://i.imgur.com/HRhd2Y0.png)

### Read data from CSV file with pandas

In [3]:
filenames = glob.glob(path)
pd_list = {}

for filename in filenames:
    name = filename.split("/")[-1].split(".")[0]
    pd_list[name] = pd.read_csv(os.path.join(filename))
    

for key,value in pd_list.items():
    print(key)


olist_sellers_dataset
product_category_name_translation
olist_orders_dataset
olist_order_items_dataset
olist_customers_dataset
olist_geolocation_dataset
olist_order_payments_dataset
olist_order_reviews_dataset
olist_products_dataset


In [4]:
cates = pd_list['product_category_name_translation'].set_index('product_category_name')
product_name = pd_list['olist_products_dataset'].set_index('product_id')
c_to_o = pd_list['olist_orders_dataset'].set_index('customer_id')["order_id"]
def categorized(series):
    return [product_name.loc[i]["product_category_name"] for i in series]
def concat(series):
#     for i in series:
#         for j in p.loc[c_to_o.loc[i]]["cate"]:
#             print(j)
    try:
        return [ j  for i in series for j in p.loc[c_to_o.loc[i]]["cate"]]
    except:
        return[]


In [5]:
order = pd.DataFrame(pd_list['olist_order_items_dataset'][["order_id","product_id"]])
p = pd.DataFrame(pd_list['olist_order_items_dataset'].groupby('order_id')["product_id"].apply(list))
p["cate"] = pd.DataFrame(p["product_id"].apply(categorized))


In [6]:
customer = pd.DataFrame(pd_list['olist_customers_dataset'][["customer_unique_id","customer_id"]])
b = pd.DataFrame(pd_list['olist_customers_dataset'].groupby('customer_unique_id')["customer_id"].apply(list))
b.head()

Unnamed: 0_level_0,customer_id
customer_unique_id,Unnamed: 1_level_1
0000366f3b9a7992bf8c76cfdf3221e2,[fadbb3709178fc513abc1b2670aa1ad2]
0000b849f77a49e4a4ce2b2a4ca5be3f,[4cb282e167ae9234755102258dd52ee8]
0000f46a3911fa3c0805444483337064,[9b3932a6253894a02c1df9d19004239f]
0000f6ccb0745a6a4b88665a16c9f078,[914991f0c02ef0843c0e7010c819d642]
0004aac84e0df4da2b147fca70cf8255,[47227568b10f5f58a524a75507e6992c]


In [8]:
b["cates"] = b["customer_id"].apply(concat)
#b["cates"].head()

### Clustering:
**1.one-hot-encode**

In [None]:
def one_hots(series, name):
    return 1 else 0 if name in series 

In [1]:
cates_array = cates.index.to_list()
for cate in cates_array:
    # print(cate)
    b[cate] = b['cates'].apply(one_hots, name = cate)



NameError: name 'cates' is not defined

**2. clustering (by KMEANS)**

In [None]:
def clustering(item_arrays, number_of_clusters) :
    kmeans_fit = cluster.KMeans(n_clusters = number_of_clusters ).fit(item_arrays)
    k = kmeans_fit.labels_
    print(collections.Counter(k))
    return kmeans_fit

result = clustering(b[cates_array], 10)