# Объединение таблиц

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import sys, os
import pathlib
from pathlib import Path
from pydantic import ValidationError
from olist_churn_prediction.paths import SRC_DIR, PROCESSED_DIR, INTERIM_DIR

In [2]:
# Make the project's source directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

In [2]:
from olist_churn_prediction import feature_processing, feature_engineering
from olist_churn_prediction.schemas_interim import GeolocationSchemaInterim, MainClassifiedSchemaInterim, PaymentsSchemaInterim, MeasuresSchemaInterim, CustomersSchemaInterim, SellersSchemaInterim
from olist_churn_prediction.paths import RAW_DIR

In [3]:
# Read the interim geolocation parquet file through the project loader,
# passing GeolocationSchemaInterim with validation switched on.
geolocation_data = feature_processing.load_data(
    INTERIM_DIR / "geolocation_olist_public_dataset_interim.parquet",
    schema=GeolocationSchemaInterim,
    validate=True,
)

In [4]:
# Preview the first five rows, print the (rows, columns) shape, and leave the
# per-column dtypes as the cell's displayed result.
display(geolocation_data.head(n=5))
print(geolocation_data.shape)
geolocation_data.dtypes

Unnamed: 0,zip_code_prefix,city,state,lat,lng
0,10,sao_paulo,sp,-23.547808,-46.636124
1,10,sao_paulo,sp,-23.546082,-46.644821
2,10,sao_paulo,sp,-23.540983,-46.642567
3,10,sao_paulo,sp,-23.546759,-46.645771
4,10,sao_paulo,sp,-23.546362,-46.643074


(323016, 5)


zip_code_prefix    string[python]
city               string[python]
state              string[python]
lat                       float32
lng                       float32
dtype: object

In [14]:
# Read the interim payments parquet file through the project loader,
# passing PaymentsSchemaInterim with validation switched on.
payments_data = feature_processing.load_data(
    INTERIM_DIR / "payments_olist_public_dataset_interim.parquet",
    schema=PaymentsSchemaInterim,
    validate=True,
)

In [15]:
# Preview the first five payment rows; the (rows, columns) shape is the
# cell's displayed result.
display(payments_data.head(n=5))
payments_data.shape

Unnamed: 0,order_id,installments,sequential,payment_type,value
0,25e8ea4e93396b6fa0d3dd708e76c1bd,1,1,credit_card,65.709999
1,ba78997921bbcdc1373bb41e913ab953,8,1,credit_card,107.779999
2,42fdf880ba16b47b59251dd489d4441a,2,1,credit_card,128.449997
3,771ee386b001f06208a7419e4fc1bbd7,1,1,credit_card,81.160004
4,62fe0bd75deb58982ebf8e53a128a301,1,1,credit_card,15.82


(97428, 5)

In [16]:
# Number of distinct payment types recorded for each order_id.
type_counts = payments_data.groupby("order_id")["payment_type"].nunique()

# Keep the order_ids that were paid with exactly one payment type.
# (Despite the variable name, the keys here are order_id values, not users.)
consistent_users = type_counts.loc[type_counts.eq(1)].index

# Restrict the payments table to the rows belonging to those orders.
filtered_df = payments_data.loc[payments_data["order_id"].isin(consistent_users)]

print(filtered_df.shape)

(92119, 5)


#### Число записей об оплате у заказов с единственным способом оплаты — 92119, исходное число записей — 97428. Решено убрать признак payment_type, чтобы сохранить все заказы.

In [17]:
# Remove the payment_type column from the payments table.
payments_data = payments_data.drop(columns=["payment_type"])

#### Сгруппируем по order_id:

In [27]:
# Collapse the payments table to one row per order_id:
#   value        - sum of the payment values recorded for the order
#   installments - largest installments figure among the order's records
#   sequential   - number of payment records for the order
# The aggregation reads payments_data (all orders) rather than filtered_df:
# filtered_df keeps only single-payment-type orders, which would discard the
# very orders that dropping payment_type was meant to preserve.
payments_data_grouped = (
    payments_data
    .groupby("order_id")
    .agg(
        value=("value", "sum"),
        installments=("installments", "max"),
        sequential=("sequential", "count"),
    )
    .reset_index()
)

In [29]:
# Preview the first five rows of the per-order payment aggregates.
display(payments_data_grouped.head(n=5))

Unnamed: 0,order_id,value,installments,sequential
0,0000f01da58c553799200eb71eb0563b,72.900002,1,1
1,00010242fe8c5a6d1ba2dd792cb16214,72.190002,2,1
2,00026c8a4fe2c14efa2afe807921981c,477.279999,1,1
3,000312d02b605a8a630c3a9882793ffb,673.200012,10,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,218.039993,3,1


In [30]:
# Optional persistence step, currently disabled: uncomment the line below to
# write the grouped payments table to the interim directory as parquet.
#payments_data_grouped.to_parquet(INTERIM_DIR / "payments_grouped_interim.parquet", index=False)

In [19]:
# Read the interim product-measures parquet file through the project loader,
# passing MeasuresSchemaInterim with validation switched on.
measures_data = feature_processing.load_data(
    INTERIM_DIR / "product_measures_olist_public_dataset_interim.parquet",
    schema=MeasuresSchemaInterim,
    validate=True,
)

In [20]:
# Preview the first five product-measure rows; the (rows, columns) shape is
# the cell's displayed result.
display(measures_data.head(n=5))
measures_data.shape

Unnamed: 0,product_id,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,00faa46f36261af8bbf3a4d37fa4841b,100.0,21.0,9.0,14.0
1,b497d0032ea073dd58bbd2d7809d25b3,750.0,23.0,5.0,15.0
2,96bd76ec8810374ed1b65e291975717f,200.0,16.0,6.0,19.5
3,506110a1603e772a8ff52ea1f0b0431d,200.0,16.0,13.0,13.0
4,feb593f17c2f51af2932dbba3e99f1cb,50.0,35.0,4.0,26.0


(24439, 5)

In [31]:
# Read the interim customers parquet file through the project loader,
# passing CustomersSchemaInterim with validation switched on.
customers_data = feature_processing.load_data(
    INTERIM_DIR / "public_customers_data_interim.parquet",
    schema=CustomersSchemaInterim,
    validate=True,
)

In [32]:
# Preview the first five customer rows; the (rows, columns) shape is the
# cell's displayed result.
display(customers_data.head(n=5))
customers_data.shape

Unnamed: 0,customer_id,customer_unique_id
0,0000ca4ff2795842fd6b122c6d974468,ac0e5d7f07043dd32831bdad3b99ad26
1,00010f206878ba01f199aec4237de72e,4ed24aaed4079fe0661c0e4b3b420dff
2,00012a2ce6f8dcda20d059ce98491703,7e0e291c0f4fc2d69208954d26ed0586
3,000161a058600d5901f007fab4c27140,ff1dcb27ea444eddb94ea5fea77b875c
4,00017f1d6cce0d56046219a2cfabcbbb,237130c4b6717031ed62d9d0b16fd94f


(96264, 2)

In [37]:
# Print how often each customer_unique_id occurs; value_counts orders the
# result from the most to the least frequent value by default.
print(customers_data["customer_unique_id"].value_counts())

customer_unique_id
ff4ea78481e00334563aea6d1681f496    10
b6169cb608e0a9d1b0ed53b93fe21713    10
915f2fe0968adc3e5a76d70ca9326bf5     9
779ab1cce17159b31d6a0a4694644455     8
4cc4fa5148dca9dd9570607562cffc0e     8
                                    ..
4efe154c832a40a1fd13279c887c7a3d     1
d04dbc7e00a8192de408474473c00c4c     1
e81c0d5a636449241742d8f188272d26     1
fe196d5896a4446e7a19b1febb3cf17b     1
959d47fde7bf9b43e7dc2f5d7373a43a     1
Name: count, Length: 92516, dtype: Int64


In [34]:
# Read the interim sellers parquet file through the project loader,
# passing SellersSchemaInterim with validation switched on.
sellers_data = feature_processing.load_data(
    INTERIM_DIR / "sellers_olist_public_dataset_interim.parquet",
    schema=SellersSchemaInterim,
    validate=True,
)

In [35]:
# Preview the first five seller rows; the (rows, columns) shape is the
# cell's displayed result.
display(sellers_data.head(n=5))
sellers_data.shape

Unnamed: 0,order_id,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,0faabac07131235fc5d9d711471cb4db,40b09f33e646d488df2ca6fec4082d50,3d871de0142ce09b7081e2b9d1733cb1,132,campo_limpo_paulista,sp
1,970f41d57d6e21afa7b8c701b09acb95,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,371,ilicinea,mg
2,b674ed44cc3f6a869249421debebe104,bd7cd34fc6d02e730221b11edc354aae,46dc3b2cc0980fb8ec44634e21d2718e,222,rio_de_janeiro,rj
3,cbe9eae36605cf2bd005c6bc1ae5f864,0e95d6eef2bedaf4ecf3c33f78199059,dc4a0fc896dc34b0d5bfec8438291c80,149,ibitinga,sp
4,f562f8a4adf5a459176f7170d0da220d,9a3a44b7cc16f2592b2110e4205edf97,5a8e7d5003a1f221f9e1d6e411de7c23,130,campinas,sp


(98981, 6)

In [41]:
# Relabel the customer_id column as order_id so the table can be joined on
# "order_id".
# NOTE(review): this only renames the column; it does not change the values.
# The merge preview later in this notebook shows nothing but missing payment
# values for the first rows, so confirm that customer_id values really are
# order identifiers before relying on this key.
customers_data = customers_data.rename({"customer_id": "order_id"}, axis="columns")

In [42]:
# Show the first five rows to check the renamed key column.
customers_data.head(n=5)

Unnamed: 0,order_id,customer_unique_id
0,0000ca4ff2795842fd6b122c6d974468,ac0e5d7f07043dd32831bdad3b99ad26
1,00010f206878ba01f199aec4237de72e,4ed24aaed4079fe0661c0e4b3b420dff
2,00012a2ce6f8dcda20d059ce98491703,7e0e291c0f4fc2d69208954d26ed0586
3,000161a058600d5901f007fab4c27140,ff1dcb27ea444eddb94ea5fea77b875c
4,00017f1d6cce0d56046219a2cfabcbbb,237130c4b6717031ed62d9d0b16fd94f


In [43]:
# Left-join the per-order payment aggregates onto the customer rows by
# order_id; customer rows without a matching key keep missing values in the
# payment columns.
# NOTE(review): the recorded preview of this merge shows only missing payment
# values for the first rows - confirm that the order_id keys of the two
# tables actually refer to the same identifiers.
merged_table = pd.merge(
    customers_data,
    payments_data_grouped,
    how="left",
    on="order_id",
)

In [46]:
# Preview the merged table with display(), as the other preview cells do, so
# the frame is rendered as one rich table instead of column-wrapped plain text.
display(merged_table.head())

                           order_id                customer_unique_id  value  \
0  0000ca4ff2795842fd6b122c6d974468  ac0e5d7f07043dd32831bdad3b99ad26    NaN   
1  00010f206878ba01f199aec4237de72e  4ed24aaed4079fe0661c0e4b3b420dff    NaN   
2  00012a2ce6f8dcda20d059ce98491703  7e0e291c0f4fc2d69208954d26ed0586    NaN   
3  000161a058600d5901f007fab4c27140  ff1dcb27ea444eddb94ea5fea77b875c    NaN   
4  00017f1d6cce0d56046219a2cfabcbbb  237130c4b6717031ed62d9d0b16fd94f    NaN   

   installments  sequential  
0          <NA>        <NA>  
1          <NA>        <NA>  
2          <NA>        <NA>  
3          <NA>        <NA>  
4          <NA>        <NA>  
