# Информация о продавцах (sellers_olist_public_dataset_.csv)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import sys, os
import pathlib
from pathlib import Path
from olist_churn_prediction.paths import SRC_DIR, PROCESSED_DIR, INTERIM_NOTEBOOK_DIR, RAW_DIR
from olist_churn_prediction import feature_processing

In [3]:
# Load the raw sellers table. low_memory=False makes pandas parse the file in
# one pass rather than in chunks, so column dtypes are inferred consistently.
sellers = pd.read_csv(
    RAW_DIR / "sellers_olist_public_dataset_.csv",
    low_memory=False,
)

In [4]:
# Quick look at the raw table: the first rows (rich display), then the
# (rows, columns) shape, and finally the per-column dtypes as the cell output.
display(sellers.head())
print(sellers.shape)
sellers.dtypes

Unnamed: 0,order_id,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,0faabac07131235fc5d9d711471cb4db,40b09f33e646d488df2ca6fec4082d50,3d871de0142ce09b7081e2b9d1733cb1,132,campo limpo paulista,SP
1,970f41d57d6e21afa7b8c701b09acb95,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,371,ilicinea,MG
2,b674ed44cc3f6a869249421debebe104,bd7cd34fc6d02e730221b11edc354aae,46dc3b2cc0980fb8ec44634e21d2718e,222,rio de janeiro,RJ
3,cbe9eae36605cf2bd005c6bc1ae5f864,0e95d6eef2bedaf4ecf3c33f78199059,dc4a0fc896dc34b0d5bfec8438291c80,149,ibitinga,SP
4,f562f8a4adf5a459176f7170d0da220d,9a3a44b7cc16f2592b2110e4205edf97,5a8e7d5003a1f221f9e1d6e411de7c23,130,campinas,SP


(98981, 6)


order_id                  object
product_id                object
seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object

#### Пояснения к признакам:

`order_id` - id заказа;

`product_id` - id товара;

`seller_id` - id продавца;

`seller_zip_code_prefix` - префикс (первые цифры) почтового индекса продавца;

`seller_city` - город продавца;

`seller_state` - штат продавца.

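Приведем значения категориальных признаков `seller_city` и `seller_state` к нижнему регистру (судя по выводу ниже, пробелы в названиях при этом заменяются на `_`):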

In [5]:
# Work on a copy: the following steps are called with inplace=True on `df`,
# so the originally loaded `sellers` frame is left as it was read from disk.
df = sellers.copy()

In [6]:
# Lower-case the seller location columns of the working copy via the project
# helper. The call is the cell's last expression, so its return value is what
# gets rendered below the cell.
feature_processing.lowercase_categoricals(
    df,
    cat_cols=["seller_city", "seller_state"],
    inplace=True,
)

Unnamed: 0,order_id,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,0faabac07131235fc5d9d711471cb4db,40b09f33e646d488df2ca6fec4082d50,3d871de0142ce09b7081e2b9d1733cb1,132,campo_limpo_paulista,sp
1,970f41d57d6e21afa7b8c701b09acb95,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,371,ilicinea,mg
2,b674ed44cc3f6a869249421debebe104,bd7cd34fc6d02e730221b11edc354aae,46dc3b2cc0980fb8ec44634e21d2718e,222,rio_de_janeiro,rj
3,cbe9eae36605cf2bd005c6bc1ae5f864,0e95d6eef2bedaf4ecf3c33f78199059,dc4a0fc896dc34b0d5bfec8438291c80,149,ibitinga,sp
4,f562f8a4adf5a459176f7170d0da220d,9a3a44b7cc16f2592b2110e4205edf97,5a8e7d5003a1f221f9e1d6e411de7c23,130,campinas,sp
...,...,...,...,...,...,...
98976,bc0e143be142c22547b69235dd63a5ad,f0978bc4bcd57a1f2079015178b73cad,76d5af76d0271110f9af36c92573f765,31,sao_paulo,sp
98977,394d17c2b71a726e205caaeee3d2aa3d,dc52f0f5d3ec37a93eaf956cde4e5d2c,6560211a19b47992c3666cc44a7e94c0,58,sao_paulo,sp
98978,38a7488411749141f8bbb2f2045ec365,51ff0a0f61611dfa94e43c834c949dd3,33a17d60c64393351ebf1ef860f4e0f2,915,porto_alegre,rs
98979,be4547eda8ce76c95ea88bad8637c539,acdd9eaa79720ca045ce3768250b8e47,a416b6a846a11724393025641d4edd5e,37,sao_paulo,sp


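Обработка проблемных городов (судя по выводу ниже, к названию такого города добавляется код штата, например `rio_de_janeiro` → `rio_de_janeiro_rj`):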

In [7]:
# Run the project's city/state disambiguation helper on the working copy.
# NOTE(review): the helper's rules are not visible in this notebook; whether
# re-running this cell is idempotent should be confirmed against its source.
feature_processing.disambiguate_city_state(
    df,
    city_col="seller_city",
    state_col="seller_state",
    inplace=True,
)

Unnamed: 0,order_id,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,0faabac07131235fc5d9d711471cb4db,40b09f33e646d488df2ca6fec4082d50,3d871de0142ce09b7081e2b9d1733cb1,132,campo_limpo_paulista,sp
1,970f41d57d6e21afa7b8c701b09acb95,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,371,ilicinea,mg
2,b674ed44cc3f6a869249421debebe104,bd7cd34fc6d02e730221b11edc354aae,46dc3b2cc0980fb8ec44634e21d2718e,222,rio_de_janeiro_rj,rj
3,cbe9eae36605cf2bd005c6bc1ae5f864,0e95d6eef2bedaf4ecf3c33f78199059,dc4a0fc896dc34b0d5bfec8438291c80,149,ibitinga,sp
4,f562f8a4adf5a459176f7170d0da220d,9a3a44b7cc16f2592b2110e4205edf97,5a8e7d5003a1f221f9e1d6e411de7c23,130,campinas,sp
...,...,...,...,...,...,...
98976,bc0e143be142c22547b69235dd63a5ad,f0978bc4bcd57a1f2079015178b73cad,76d5af76d0271110f9af36c92573f765,31,sao_paulo,sp
98977,394d17c2b71a726e205caaeee3d2aa3d,dc52f0f5d3ec37a93eaf956cde4e5d2c,6560211a19b47992c3666cc44a7e94c0,58,sao_paulo,sp
98978,38a7488411749141f8bbb2f2045ec365,51ff0a0f61611dfa94e43c834c949dd3,33a17d60c64393351ebf1ef860f4e0f2,915,porto_alegre,rs
98979,be4547eda8ce76c95ea88bad8637c539,acdd9eaa79720ca045ce3768250b8e47,a416b6a846a11724393025641d4edd5e,37,sao_paulo,sp


In [8]:
# Print a frequency table for every column: a "--- name ---" header, the
# value counts, and a blank separator line (same text as three separate prints).
for col in df.columns:
    print(f"--- {col} ---", df[col].value_counts(), "", sep="\n")

--- order_id ---
order_id
f04396b769e9563e83456389c7d8f043    8
d281ffe6e49a2a896db2eb9febfe07f8    6
5a3b1c29a49756e75f1ef513383c0c12    6
3990f96693d321ac142fff312bf3706a    6
7d8f5bfd5aff648220374a2df62e84d5    6
                                   ..
965feb5d147d2e59d18eb3f9a1b4d5a2    1
cc7c3c9e0e52b2748d1843228f84c9ad    1
986dae5bc78169ebbf74f5d5f79924ff    1
e0d528f6066d931e08f9f48512f587d2    1
c9dec96fd0b47827e3520b47b61c79d3    1
Name: count, Length: 96247, dtype: int64

--- product_id ---
product_id
99a4788cb24856965c36a24e339b6058    648
aca2eb7d00ea1a7b8ebd4e68314663af    615
422879e10f46682990de24d770e7f83d    472
d1c427060a0f73f6b889a5c7c61f2ac4    449
389d119b48cf3043d311335e499d9c6b    433
                                   ... 
d3ed5f1ed25b2b1f8f152a68a88b2966      1
e3aae2cb84e49978a0749848343133e9      1
f099978e3b449a27ed812cf8c31fb9c9      1
8dfe3982285530dc4dc67986b0090219      1
8b2ae6dc36fac80caf19882760cfc9b2      1
Name: count, Length: 24440, dtype: int64

--

## Выводы из таблицы sellers:
1. Местоположение большинства продавцов - город sao_paulo штата SP.
2. Есть как продавцы с большим количеством выставленных товаров (более 1000 лотов), так и продавцы с единственным товаром.

In [28]:
# Persist the cleaned table for downstream notebooks.
# Create the target directory first so the cell also works on a fresh checkout
# where the interim folder does not exist yet (to_parquet does not create it).
INTERIM_NOTEBOOK_DIR.mkdir(parents=True, exist_ok=True)
# index=False: the default integer index is not written to the file.
df.to_parquet(INTERIM_NOTEBOOK_DIR / "sellers_olist_public_dataset_interim.parquet", index=False)