### 1) Imports

In [58]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [59]:
# Imports scr info
customers_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_customers_dataset.csv')).rename(columns={'customer_zip_code_prefix': 'zip_code_prefix'}).drop(['customer_unique_id', 'customer_city'], axis=1)
geolocation_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_geolocation_dataset.csv')).rename(columns={'geolocation_zip_code_prefix': 'zip_code_prefix'}).drop(['geolocation_city', 
                                                                                                                                            'geolocation_state'], axis=1)
order_items_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_order_items_dataset.csv')).drop(['order_item_id', 'seller_id','shipping_limit_date'], axis=1)
order_payments_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_order_payments_dataset.csv')).drop(['payment_sequential', 'payment_installments'], axis=1)
order_reviews_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_order_reviews_dataset.csv')).groupby(by=['order_id'], as_index=False)['review_score'].mean()
orders_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_orders_dataset.csv')).drop(['order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'], axis=1)
products_dataset = pd.read_csv(os.path.join('..', 'src', 'olist_products_dataset.csv')).drop(['product_name_lenght','product_description_lenght', 
        'product_photos_qty', 'product_weight_g','product_length_cm', 'product_height_cm', 'product_width_cm'], axis=1)
product_category_name_translation = pd.read_csv(os.path.join('..', 'src', 'product_category_name_translation.csv'))

# Merge datasets

## Customers_Info
customers_info = pd.merge(customers_dataset, geolocation_dataset, on='zip_code_prefix', how='inner').drop('zip_code_prefix', axis=1)
customers_info = customers_info.iloc[customers_info.groupby('customer_id')['geolocation_lat'].agg(pd.Series.idxmax)]
customers_info = customers_info.query('geolocation_lat <= 5.27438888' and 'geolocation_lng >= -73.98283055' and 'geolocation_lat >= -33.75116944' and 'geolocation_lng <=  -34.79314722')

## Purchases_Info

purchases_info = pd.merge(order_items_dataset, products_dataset, on='product_id', how='inner')
purchases_info = pd.merge(purchases_info, order_payments_dataset, on='order_id', how='inner')
purchases_info = pd.merge(purchases_info, orders_dataset, on='order_id', how='inner').query("product_category_name in ('cama_mesa_banho', 'beleza_saude', 'esporte_lazer')")
purchases_info = pd.merge(purchases_info, product_category_name_translation, on='product_category_name', how='inner').drop(['product_category_name'], axis=1)[['customer_id', 'order_id', 
                                                                                                                                                                        'product_id', 'product_category_name_english', 'price', 'freight_value', 'payment_type', 'payment_value']]
purchases_info = pd.merge(purchases_info, order_reviews_dataset, on='order_id', how='inner').drop(['order_id'], axis=1)

In [60]:
# Aggregate datasets and rename columns
purchases_df = purchases_info.pivot_table(index='customer_id',columns='product_category_name_english',values=['payment_value', 'review_score'], fill_value=0)
purchases_df.columns = [f'{j}_{i}' for i,j in purchases_df.columns]
purchases_df.reset_index(inplace=True)

products = purchases_info.pivot_table(index='customer_id',columns='product_category_name_english',values='product_id', aggfunc= 'count', fill_value=0)
products.columns = [f'qty_of_products_{i}' for i in products.columns]
products.reset_index(inplace=True)

# Merge final data
from functools import reduce

dfs = [customers_info, products, purchases_df]

final_df = reduce(lambda  left,right: pd.merge(left,right,on=['customer_id'], how='inner'), dfs).query('qty_of_products_bed_bath_table > 2 or qty_of_products_health_beauty > 2 or qty_of_products_sports_leisure > 2')
final_df = final_df[['customer_id', 'customer_state', 'geolocation_lat', 'geolocation_lng',
                    'qty_of_products_bed_bath_table','bed_bath_table_payment_value','bed_bath_table_review_score',
                    'qty_of_products_health_beauty', 'health_beauty_payment_value', 'health_beauty_review_score',
                    'qty_of_products_sports_leisure', 'sports_leisure_payment_value','sports_leisure_review_score']]

# 500 customers sample                       
final_df = final_df.sample(500)
final_df


Unnamed: 0,customer_id,customer_state,geolocation_lat,geolocation_lng,qty_of_products_bed_bath_table,bed_bath_table_payment_value,bed_bath_table_review_score,qty_of_products_health_beauty,health_beauty_payment_value,health_beauty_review_score,qty_of_products_sports_leisure,sports_leisure_payment_value,sports_leisure_review_score
3138,1ee3f2411ad5097d580f5d8bbf3cd63f,SP,-23.95,-46.29,4,23.48,5.00,0,0.00,0.00,0,0.00,0.00
22282,dd747ee25e233b5cd6285d65972b7b7b,SP,-23.48,-46.58,3,92.74,1.00,0,0.00,0.00,0,0.00,0.00
19726,c455144b0c6e78f2574b7a92f22342cf,SP,-23.47,-46.25,5,1349.05,4.00,0,0.00,0.00,0,0.00,0.00
13521,86e84cf56a414caf580bc5b1a97bdc17,MG,-19.87,-44.02,3,205.68,1.00,0,0.00,0.00,0,0.00,0.00
14911,95512e0ed7a522e627582ef07f02a3b7,RJ,-22.87,-42.05,3,164.08,5.00,0,0.00,0.00,0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3230,1fc148759f50ec10a03421aaf5efd89b,SP,-22.84,-47.05,6,326.79,5.00,0,0.00,0.00,0,0.00,0.00
11263,707818185591ca532dfd8fff383f0103,BA,-12.69,-38.33,0,0.00,0.00,0,0.00,0.00,3,108.84,3.00
16244,a20f368d4aa1abfd95ddf1619a7ad1e4,SP,-23.55,-46.88,3,119.27,1.00,0,0.00,0.00,0,0.00,0.00
8598,5575494798cac6f28d4da41aa44b44c8,MG,-20.70,-42.87,0,0.00,0.00,0,0.00,0.00,3,168.03,5.00


In [61]:
final_df.to_csv('data_treated/customers_purchases_reviews_olist.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data_treated/customers_purchases_reviews_olist.csv'