In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings
import datetime

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from collections import namedtuple
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.preprocessing import MinMaxScaler

# Add the parent directory to the import path so the local `scripts` package resolves.
sys.path.append("../")

from scripts.distances import haversine_km  # linters may flag this import as unresolved; it resolves at runtime because "../" was appended to sys.path above

data_path = "../data"

# Default plotly renderer for the figures shown in this notebook.
pio.renderers.default = "notebook_connected"

# Must run before the DPI lookup below, which reads the DPI variable from the environment.
load_dotenv()
# NOTE(review): color_palette() returns a palette and the return value is discarded here,
# so this line presumably does not change the active palette — confirm whether
# sns.set_palette('colorblind') was intended.
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Figure DPI: read it from the DPI environment variable (possibly populated by
# load_dotenv) and fall back to 100 when it is unset or not a valid integer.

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError: DPI is unset, os.getenv returned None.
    # ValueError: DPI is set but is not an integer literal (e.g. "" or "high").
    pc_dpi = 100


In [None]:
# Input locations, all relative to the notebook's working directory.
_optimized_dir = "../data/optimized"
_geoloc_dir = f"{_optimized_dir}/geoloc_applied"

# Order-level tables.
order_data = f"{_optimized_dir}/olist_orders.csv"
order_items_data = f"{_optimized_dir}/olist_order_items.csv"
reviews_data = f"{_optimized_dir}/olist_reviews.csv"

# Customer and seller tables from the geoloc_applied folder.
customer_dataset = f"{_geoloc_dir}/olist_customers.csv"
sellers_dataset = f"{_geoloc_dir}/olist_sellers.csv"

# Customer-level RFM frame, stored as a pickle (".plk" is the actual on-disk extension).
rfm_dataset = "../pickles/geolocd/cx_rfm.plk"


In [None]:
# Load the five CSV inputs in one pass; the unpacking order matches the path tuple.
df_order, df_cx, df_sellers, df_items, df_reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        order_data,
        customer_dataset,
        sellers_dataset,
        order_items_data,
        reviews_data,
    )
)

# The RFM frame is stored as a pickle.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
df_rfm = pd.read_pickle(rfm_dataset)


# 0 : Getting the variables we need from other datasets
# 1 : Calculate Time deltas for orders
# 2 : Calculate Dist delta of seller - buyers
# 3 : Transitioning from the orders dataset to the cx_rfm dataset.

<br>
<hr>


# <u> 0 : Getting the variables we need from other datasets :</u>

- ## Goals
    - Joining the datasets of order and order_items on order_id to know the distance between the buyer and seller
    - Joining the datasets of order and reviews to know the reviews for reviewed orders.

<br>

- ## Whys :
    - Export for further feature engineering
    - Make the datasets less codependents, centralize infos in as few files as possible
    - We will work with only two datasets (orders and cx_rfm)

- ## Hows : 
    - >we'll see

<hr>


## <u> 0.1 : Products and Items for each orders :</u>

### How :

We will use order_id to find items and item sellers in olist_order_items dataset as DataFrame

In [None]:
df_order.head()


In [None]:
df_items.head()


In [None]:
# Checking if df_items contains duplicated order_id, if not, using order id as index
df_items["order_id"].duplicated().sum()


>That was a long shot anyway, but we cannot use order_id as index for df_items

In [None]:
# Keep only the identifiers that link an order to its products and sellers:
# price and freight are already in RFM, and the shipping limit date is not relevant.
unused_item_columns = ["shipping_limit_date", "price", "freight_value", "order_item_id"]

df_items = df_items.drop(columns=unused_item_columns).sort_values(by="order_id")


### Getting the product_list and seller_list for each order

In [None]:
# Collect, for every order, the product ids and seller ids of its items.
# A single groupby pass replaces the former per-order boolean scan of both
# frames, which was quadratic in the number of orders.
order_ids = df_order["order_id"].values

items_grouped = df_items.groupby("order_id", sort=False)
products_by_order = items_grouped["product_id"].agg(list).to_dict()
sellers_by_order = items_grouped["seller_id"].agg(list).to_dict()

# Orders without any item line get an empty list. list(...) makes a fresh copy
# per row so no two rows ever share the same list object.
df_order["product_list"] = df_order["order_id"].map(
    lambda order_id: list(products_by_order.get(order_id, []))
)
df_order["seller_list"] = df_order["order_id"].map(
    lambda order_id: list(sellers_by_order.get(order_id, []))
)


In [None]:
df_order.head()


> Looks good, now, we know that an order can contain multiple products, but can an order mobilize multiple sellers ? Let's check

In [None]:
# seller_list holds one entry per *item*, so an order with two items from the
# same seller has a list of length 2. Count distinct sellers instead, otherwise
# every multi-item order would be reported as a multi-seller order.
# (Vectorised on the column; this also avoids a loop variable named `tuple`,
# which would shadow the builtin for the rest of the notebook.)
multiple_sellers = int(
    df_order["seller_list"].map(lambda sellers: len(set(sellers)) > 1).sum()
)

print(f"there are {multiple_sellers} orders with multiple sellers")


> So yes, an order can totally mobilize multiple sellers, hence preventing us from changing the type of "seller_list" from list (or object in a dataframe) to a unique int. That also raises questions 

- Do we keep these multiple sellers and compute the average distance between the customer and these multiple sellers ?
- Do we take the avg. position of a unique "Seller_geoloc" using the mean position of all the multiple sellers ?
- Do we select a particular seller based on an agreed upon metric or characteristic ?

For now : we will take ∆avg for each order.
<br>
<i>In its own section</i>

## <u> 0.2 : Geolocs from sellers and buyers for each order :</u>

- We will use the impractical tuple (lat/lon) and unpack it when we will have to plot on a map.
- Buyer will have only one geoloc, seller_list will be a list of lat/lon

> First let's take a look

In [None]:
df_order.head()


In [None]:
df_cx.head()


In [None]:
df_sellers.head()


### How : 
Using the same method as in the order/items, we will find the lat/lon tuple for each seller and buyer using : customer_id and seller_id

In [None]:
# Lookup table customer_id -> (lat, lon), built once so that each order is
# resolved with a dict access instead of re-scanning df_order and df_cx.
# keep="first" mirrors taking the first matching customer row.
cx_first_rows = df_cx.drop_duplicates(subset="customer_id", keep="first")
cx_geoloc = {
    cx_id: (float(lat), float(lon))
    for cx_id, lat, lon in zip(
        cx_first_rows["customer_id"], cx_first_rows["lat"], cx_first_rows["lon"]
    )
}

cx_ids = df_order["customer_id"].values

geoloc_per_order = []
for current_order, cx_id in zip(df_order["order_id"].values, cx_ids):
    if cx_id not in cx_geoloc:
        # Fail loudly and name the offending order rather than storing a bogus position.
        raise KeyError(f"order {current_order}: customer {cx_id} is missing from df_cx")
    geoloc_per_order.append(cx_geoloc[cx_id])

# dtype=object keeps each (lat, lon) tuple as a single cell value.
df_order["geoloc_cx"] = pd.Series(geoloc_per_order, index=df_order.index, dtype=object)


In [None]:
# Lookup table seller_id -> (lat, lon), built once. The previous version
# re-scanned df_order and df_sellers for every order, and started with an
# unused statement that only worked because a loop variable from an earlier
# cell was still alive in the kernel.
seller_first_rows = df_sellers.drop_duplicates(subset="seller_id", keep="first")
seller_geoloc = {
    seller_id: (float(lat), float(lon))
    for seller_id, lat, lon in zip(
        seller_first_rows["seller_id"], seller_first_rows["lat"], seller_first_rows["lon"]
    )
}

# One (lat, lon) tuple per entry of seller_list, in the same order; an order
# without sellers gets an empty list. An unknown seller id raises KeyError.
df_order["geoloc_sellers_list"] = df_order["seller_list"].map(
    lambda seller_ids: [seller_geoloc[seller_id] for seller_id in seller_ids]
)


In [None]:
df_order.head()


> Thats nice we can work with that

## <u> 0.3 : Reviews for all orders :</u>

### How :
- Similar method to above, only with df_reviews
- Selecting what we want to know from review with df.head()
<br>

> <i>(note to self : work on a generic way of approaching the simple problem (not the list of lists) with concurrent.futures) </i>

In [None]:
# Lets select our variables first :

df_reviews.head()


In [None]:
df_reviews.info()


>We are mainly interested in the review score, but we can also create a boolean variable to check whether or not the review came with a comment. We can debate whether or not the length of the review is important. In the future we can also think about using NLP engines such as GPT-3 to detect the "tone" of the answer and the topics approached. This would be, however, a little heavy for the model and a lot of work to put into place

<br>

<b>For now</b>

- Create the boolean value for comment in reviews directly
- Create a new boolean assign boolean values : has_rating and has_comment
- <b>Maybe</b> (check with team) create a boolean value to check if, in the case of comment, an Olist support agent has replied. Could be relevant (we like to know that our comment was read or our complaint heard/taken care of). It isnt explicit on the dataset if this is a support agent who replied or the Cx who updated the review. Will check online

<i><b>!!!Note!!!</b> : after online (Kaggle, cf. sources) check : review_answer_timestamp : Shows satisfaction survey answer timestamp., no support agent involved as far as I can see. Last point is irrelevant but could be a nice addition for Olist's databases in the future.</i>


In [None]:
def has_comment(row) -> bool:
    """
    - Tells whether a review row carries a comment/message (returns boolean)
    - True when "review_comment_message" is present, False when it is missing
    - For dataset : reviews
    """

    comment_missing = pd.isna(row["review_comment_message"])
    return False if comment_missing else True


In [None]:
# Discard the review columns that the rest of the notebook does not use.
unused_review_columns = ["review_answer_dt", "review_creation_date", "review_comment_title"]
df_reviews = df_reviews.drop(columns=unused_review_columns)

df_reviews.head()


#### Plotting the ratings as barplot out of curiosity :
In general, only two kind of people rate/comment : the really happy ones and the really disappointed ones. If this is true here, we should see a disproportional amount of good (5) and bad (1, 2), a little bit of 4s but a very low amount of 3s. Let's see

In [None]:
# Forcing dtypes : 
df_reviews["review_score"] = df_reviews["review_score"].astype(int)


In [None]:

# One fixed colour per review score.
cmap = {1: "#810000", 2: "red", 3: "royalblue", 4: "navy", 5: "#003153"}
title = "Distribution of ratings in the reviews given by users"

fig = px.histogram(
    df_reviews,
    x="review_score",
    color="review_score",
    color_discrete_map=cmap,
    title=title,
)

# Single layout pass: margins, title position, text colour.
fig.update_layout(
    margin={"b": 25, "t": 25, "r": 25, "l": 25},
    title_y=0.95,
    title_x=0.1,
    font_color="black",
)

fig.show()


Let's place our boolean in review : 

In [None]:
df_reviews["has_comment"] = df_reviews.apply(has_comment, axis=1)


In [None]:
df_reviews.head()

In [None]:
# Attach review information to every order in a single pass.
# The previous version filtered df_order and df_reviews once per order
# (quadratic) and relied on an IndexError to detect orders without a review.
order_ids = df_order["order_id"].values

# One row per reviewed order:
# - review_score: score of the first review of the order ("first" takes the
#   first non-null value; scores are cast to int beforehand, so none is null)
# - has_comment: True if any review of the order has a comment
reviews_by_order = df_reviews.groupby("order_id", sort=False).agg(
    review_score=("review_score", "first"),
    has_comment=("has_comment", "any"),
)
reviewed_ids = reviews_by_order.index
commented_ids = reviewed_ids[reviews_by_order["has_comment"].to_numpy(dtype=bool)]

# Orders without a review get NaN / False / False.
df_order["review_score"] = df_order["order_id"].map(reviews_by_order["review_score"])
df_order["has_rating"] = df_order["order_id"].isin(reviewed_ids)
df_order["has_comment"] = df_order["order_id"].isin(commented_ids)


In [None]:
df_order.head()


<hr>

# <u>1 : Calculate Time deltas for orders</u>
## 1.1 : Calculate Time delta of delivery for delivered orders (status = delivered)
## 1.2 : Calculate Time delta of expected/delivered orders + booleans

<hr>

## <u>1.1 : Calculate Time delta of delivery for delivered orders (status = delivered) </u>

### 1 : How ?
We apply only that ∆ on delivered orders, if the order["order_delivered_customer_date"] is blank, function will skip it
### 2 : Applied to Cx :
We will calculate the average ∆T for each Cx using the list of orders in df_rfm : for Cx with exactly one order, this will be of no consequence as ∆T will be from only one order, but for Cxs w/ multiple orders, we will take the avg. ∆. For Cx w/ no ∆ present on orders (if their order status is not "delivered"), see point 3.
### 3 : Remaining values : 
No modification of missing values until the merger with the rfm dataset -- The merger will then use a mean imputer while taking into account the cluster of the customer

<hr>

In [None]:
df_order.head()


In [None]:
# Among delivered orders only, count the missing delivery and purchase timestamps.
delivered_mask = df_order["order_status"] == "delivered"

delivered_na_count = df_order.loc[delivered_mask, "order_delivered_customer_date"].isna().sum()
purchase_dt_na_count = df_order.loc[delivered_mask, "order_purchase_dt"].isna().sum()

print(f"There is {delivered_na_count} values missing in order_delivered_customer_date")
print(f"There is {purchase_dt_na_count} values missing in order_purchase_dt")

del delivered_mask  # Flush


There are 8 values missing in order_delivered_customer_date, likely the order was delivered too recently at sql dump time. Regardless, ignoring.

In [None]:
def calc_delta_t(row):
    """
    Time elapsed between the purchase and the delivery to the customer.

    Only orders whose status is "delivered" are considered.
    Returns delivery - purchase, or nan when the order is not delivered
    or when one of the two timestamps is missing.
    """
    if row["order_status"] != "delivered":
        return np.nan

    delivery_ts = pd.to_datetime(row["order_delivered_customer_date"])
    purchase_ts = pd.to_datetime(row["order_purchase_dt"])

    either_missing = pd.isna(delivery_ts) or pd.isna(purchase_ts)
    return np.nan if either_missing else delivery_ts - purchase_ts


In [None]:
df_order["delta_delivery"] = df_order.apply(calc_delta_t, axis=1)


In [None]:
# checking :

df_order["delta_delivery"].head()


In [None]:
# Sanity check: the number of missing deltas should be close to the number of
# orders that are not in the "delivered" status.

na_delta_del = df_order["delta_delivery"].isna().sum()
not_delivered_len = df_order["order_status"].ne("delivered").sum()

print(f"{na_delta_del} missing values in Delta Delivery")
print(f"{not_delivered_len} missing values in packages not delivered")

print(f"delta_del_na - not_delivered = {na_delta_del - not_delivered_len}")


We can see the same, expected number of delivered packages without a delta delivery. As predicted

<hr>

## <u> 1.2 : Calculate Time delta of expected/delivered orders + booleans </u>

- Goals : We want to establish, for packages delivered, the difference between the expected date of delivery and the actual delivery date.<br>
It might be a good indicator of satisfaction : we are happy if our package is early/on time, and not happy if we are delivered late.
- Creation of booleans : EarlyOrOnTime : True for early and on time , False for late

<hr>

In [None]:
def calc_expect_delta(row):
    """
    Delta between the actual delivery day and the expected delivery date.

    The delivery timestamp is truncated to its calendar day (year, month, day)
    before subtracting the expected date. For a delivered order with both dates:
    - value = 0 --> On time
    - value < 0 --> Early
    - value > 0 --> Late

    Returns : pd.Timedelta, or np.nan when the order is not delivered or a
    date is missing
    """
    if row["order_status"] != "delivered":
        return np.nan

    delivered_at_full = pd.to_datetime(row["order_delivered_customer_date"])
    expected_at = pd.to_datetime(row["order_estimated_delivery_date"])

    if pd.isna(delivered_at_full) or pd.isna(expected_at):
        return np.nan

    # .date() drops the time of day; converting back gives midnight of that day.
    delivery_day = pd.to_datetime(delivered_at_full.date())
    return delivery_day - expected_at


def on_time_late(row):
    """
    Returns True if the package arrived before or on the predicted day, and
    False if the package is late.

    Reads row["order_status"] and row["expectation_delta"] (a pd.Timedelta).
    Returns None when the order is not delivered or when the delta is missing.
    """

    if row["order_status"] != "delivered":
        return None

    expectation_delta = row["expectation_delta"]

    # Test for a missing value *before* reading .days: a float NaN has no
    # .days attribute and would raise AttributeError.
    if pd.isna(expectation_delta):
        return None

    return expectation_delta.days <= 0


In [None]:
df_order["expectation_delta"] = df_order.apply(calc_expect_delta, axis=1)


In [None]:
# Number of orders per distinct expectation delta, sorted by delta.
deltas = df_order["expectation_delta"].value_counts().to_dict()
deltas = dict(sorted(deltas.items()))

# Same counts keyed by whole days. Counts are accumulated so that two deltas
# falling on the same day (e.g. differing only by hours) add up instead of the
# later one overwriting the earlier one.
delta_days = {}

for delta, order_count in deltas.items():
    delta_days[delta.days] = delta_days.get(delta.days, 0) + order_count


It seems we have a lot of extremes. Plotting in bars to see clearly

In [None]:
fig, ax1 = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 3),
    dpi=pc_dpi,
)

# delta_days maps a delta (in days) to the number of orders with that delta.
# Passing the dict itself to boxplot makes matplotlib iterate over its keys
# only, i.e. over the distinct delta values with the counts ignored. Expand it
# to one value per order so the box, whiskers and mean reflect the orders.
delta_days_per_order = np.repeat(list(delta_days.keys()), list(delta_days.values()))

ax1.boxplot(x=delta_days_per_order, vert=False, showfliers=True, showmeans=True, widths=.5)


###
# Titles/Labels
ax1.set_yticklabels([])
ax1.set_xlabel("Delta between expected and actual delivery")
fig.suptitle("Repartition of deltas between expected and actual delivery")
#
###

fig.tight_layout()
plt.show()


> This distribution shows an unusual number of very large deltas (around -150 and +180 days). These are not normal and are probably edge cases that Olist would have to review case by case; knowing the context might explain these values. Nevertheless, these are the values the dataset gives us, so this is the data we will use. One theory is that the date format was inverted (either DDMMYYYY became MMDDYYYY or the other way around). We will discard these values depending on their quantile representation

In [None]:
# Boolean and we're done :

df_order["early_on_time"] = df_order.apply(on_time_late, axis=1)


In [None]:
df_order["expectation_delta"].describe(percentiles=([.001, .1, .25, .75, .9, .95, .998]))


After looking up the quantiles at 0.1% and 99.9%:
> We will discard these extreme cases (below the 0.1% quantile and above the 99.9% quantile).


In [None]:
# Cut-off values of the expectation delta at the 0.1% and 99.9% quantiles.
expectation = df_order["expectation_delta"]
p_0_1, p_99_9 = expectation.quantile(0.001), expectation.quantile(0.999)

# Index labels of the orders strictly outside the two cut-offs (missing deltas excluded).
is_known = expectation.notna()
remove_low = df_order.index[(expectation.lt(p_0_1) & is_known).to_numpy()].tolist()
remove_high = df_order.index[(expectation.gt(p_99_9) & is_known).to_numpy()].tolist()

remove_high_low = remove_high + remove_low

print(f"We will remove {len(remove_high_low)} orders from the dataframe.")


In [None]:
df_order = df_order.drop(index=remove_high_low)


In [None]:
# Redoing the index

df_order.reset_index(drop=True, inplace=True)


In [None]:
# Checking- check True :

df_order[df_order["early_on_time"] == True].head()


In [None]:
# Checking- check na :

df_order[df_order["early_on_time"].isna()].head()


In [None]:
# Checking- check False :

df_order[df_order["early_on_time"] == False].head()


<hr>

# <u>2 : Calculate Dist delta of seller - buyers</u>

> Using a function that calculates the distance between two points on Earth using the Haversine formula, we can :
    > - Get the ∆ distance between a seller and a buyer
    > - If there are more than one sellers, calculating the average distance
    > - If there are no sellers, returning nans

<hr>

In [None]:
def get_delta_dist(row):
    """
    Distance between the customer and the seller(s) of an order, computed
    with the haversine_km function:
    - no seller : returns np.nan
    - exactly one seller : the distance between that seller and the cx
    - several sellers : sum of the distances / number of sellers (avg. dist)
    """

    lat_cx, lon_cx = row["geoloc_cx"][0], row["geoloc_cx"][1]
    seller_positions = row["geoloc_sellers_list"]

    if len(seller_positions) == 0:
        return np.nan

    # One distance per entry of the seller list, in list order.
    distances = [
        haversine_km(lat_o=lat_cx, lon_o=lon_cx, lat_f=position[0], lon_f=position[1])
        for position in seller_positions
    ]

    if len(distances) == 1:
        return distances[0]

    return sum(distances) / len(distances)


In [None]:
df_order["seller_to_cx_dist"] = df_order.apply(get_delta_dist, axis=1)

df_order.head()


<hr>

# <u>3 : Transitioning from the orders dataset to the cx_rfm dataset.</u>

&emsp;This will enable us to add all the new variables to the first RFM classification. We will need to ensure that the merger is done correctly, and, since we deleted 191 orders from the dataset, we might need to filter some Cxs. For multiple orders, we will need to choose between keeping multiple orders or doing the average of the variable.


### <u>3.1: Reviews.</u>

### <u>3.1.1: Reviews, cases not NA.</u>

- Review Score : Average for each individual Cx if multiple orders
- Has Rated : Boolean, at least once
- Rating Ratio : Orders Rated / Total Orders made
- Commented : At least once
- Comment Ratio : Orders commented / Total Orders made

In [None]:
# Column names of the enriched orders frame, to decide what to keep for the merger.
order_columns = list(df_order.columns)

print(order_columns)


#### Keep/Discard : 
<hr>

##### Discard :
- `order_purchase_dt', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date'` are already partially in df_rfm, no use in duplication
- `geoloc_cx', 'geoloc_sellers_list'` : case Cx, already computed the most recent known GPS position, no use for sellers.

##### Method : 
We will use `order_id_list` and loop over the rfm dataset, taking the info we need. To reduce computation time, we first will remove columns we don't need.

##### Maybe :
The use of products can be important, we will append it as well.

In [None]:
# df_order_m ("m" for merger): orders frame without the raw date columns and
# the geoloc columns listed below.
columns_drop = [
    "order_purchase_dt",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "geoloc_cx",
    "geoloc_sellers_list",
]

df_order_m = df_order.drop(columns=columns_drop)

df_order_m.head()


In [None]:
df_rfm.head()


In [None]:
df_rfm.dtypes

# Type enforcement and export as pkl necessary


In [None]:
# Enforce compact dtypes on the RFM frame.
df_rfm["customer_uid"] = df_rfm["customer_uid"].astype(np.uint32)
df_rfm["num_orders"] = df_rfm["num_orders"].astype(np.uint8)
df_rfm["cluster_kmeans_4"] = df_rfm["cluster_kmeans_4"].astype(np.uint8)
# Signed type on purpose: DBSCAN labels noise points -1, and casting -1 to an
# unsigned 8-bit integer silently wraps it to 255.
# NOTE(review): confirm whether the stored labels still contain -1 or were
# already wrapped by an earlier export.
df_rfm["cluster_DBSCAN"] = df_rfm["cluster_DBSCAN"].astype(np.int16)
df_rfm["most_ancient_order_dt"] = pd.to_datetime(df_rfm["most_ancient_order_dt"])
df_rfm["most_recent_order_dt"] = pd.to_datetime(df_rfm["most_recent_order_dt"])


In [None]:
# # Somewhere along the way I broke the order lists so lets fix,
# # I dont know where, but this works as a fix
# # Lets comment that out once we are done, saves time 
# # (there are better ways to spend 2 minutes of computing power)
# # Uncomment if orderlists are broken again

# unique_ids = df_rfm["customer_uid"].unique()

# uid_aliases = dict.fromkeys(unique_ids)

# for uid in uid_aliases:
#     uid_aliases[uid] = df_cx[df_cx["customer_unique_id"] == uid]["customer_id"].values.tolist()

# # Drop
# try:
#     df_rfm.drop(columns=["order_id_list"], inplace=True)
# except KeyError:
#     #means we dropped it
#     pass


# rfm_id_drop = []  # As long as we are here we can pick up the 192 deleted cxs

# df_rfm["order_id_list"] = np.dtype("object")

# for key, alias_list in uid_aliases.items():
#     rfm_index = df_rfm[df_rfm["customer_uid"] == key].index.values[0]
#     order_list = []
#     for alias in alias_list:
#         try:
#             order_by_alias = df_order[df_order["customer_id"] == alias]["order_id"].values
#             order_list.append(int(order_by_alias))
#         except TypeError:
#             # Means we are on one of the 192 deleted orders, thats fine, skip
#             pass
#     if len(order_list) >= 1:
#         df_rfm.at[rfm_index, "order_id_list"] = np.array(order_list, dtype=object)
#     else:
#         rfm_id_drop.append(rfm_index)


In [None]:
# # Saving in the middle because of the lists, uncomment to redo
# df_rfm.to_pickle(path="../pickles/geolocd/cx_rfm.plk")
# df_rfm.to_csv(path_or_buf="../data/optimized/geoloc_applied/cx_rfm.csv", index=False)


In [None]:
df_rfm.dtypes


In [None]:
# Review-derived customer features; the same names are used for the dataframe
# columns and for the fields of the Reviews_info record.
reviews_fields = ["rating_avg", "rating_ratio", "comment_ratio", "has_rated", "has_commented"]
Reviews_info = namedtuple(typename="Reviews_info", field_names=reviews_fields)

# The three numeric features start as NaN, the two flags start as False.
for numeric_field in reviews_fields[:3]:
    df_rfm[numeric_field] = np.nan
for flag_field in reviews_fields[3:]:
    df_rfm[flag_field] = False


In [None]:
def get_review_info(id_list):
    """
    Aggregate the review variables of one customer over the given orders.

    Parameters
    ----------
    id_list : list-like
        Order ids of the customer, looked up in the module-level df_order_m.

    Returns
    -------
    Reviews_info
        - rating_avg : mean review score over the rated orders, NaN if none is rated
        - rating_ratio : rated orders / orders found (0 if no order is found)
        - comment_ratio : commented orders / orders found (0 if no order is found)
        - has_rated : True if at least one order is rated
        - has_commented : True if at least one order has a comment

    Raises
    ------
    TypeError
        Propagated from Series.isin when id_list is not list-like.
    """
    orders = df_order_m[df_order_m["order_id"].isin(id_list)]
    order_count = len(orders)

    rated = orders["review_score"].notna()
    # Explicit comparison: the column may be of object dtype.
    commented = orders["has_comment"] == True  # noqa: E712

    # Count the flagged orders. len() of a boolean mask would be the number of
    # orders, which made the rating ratio always equal to 1.
    rated_count = int(rated.sum())
    comment_count = int(commented.sum())

    if rated_count:
        rating = orders.loc[rated, "review_score"].astype(float).mean()
    else:
        rating = np.nan

    # Guard the divisions: no order found means ratios of 0, not 0 / 0.
    rating_ratio = rated_count / order_count if order_count else 0
    comment_ratio = comment_count / order_count if order_count else 0

    return Reviews_info(
        rating_avg=rating,
        rating_ratio=rating_ratio,
        comment_ratio=comment_ratio,
        has_rated=rated_count > 0,
        has_commented=comment_count > 0,
    )


In [None]:
orders_all = df_order["order_id"].unique()
rm_rfm_idx = []

# Takes a long while, finding a more efficient way would be an improvement

for index, rfm_order_ids in df_rfm["order_id_list"].items():
    # Only the lookup is guarded: a TypeError raised while writing the results
    # must not be mistaken for "this customer has no usable order list".
    try:
        reviews_info = get_review_info(rfm_order_ids)
    except TypeError:
        # Presumably raised when order_id_list is not list-like for this
        # customer; such customers are collected for removal.
        rm_rfm_idx.append(index)
        continue

    # Field names of the record are the names of the target columns.
    for field, value in reviews_info._asdict().items():
        df_rfm.at[index, field] = value


In [None]:
print(f"removing {len(rm_rfm_idx)} clients since we removed their orders duringthe percentile filtering")


In [None]:
# Let's give it a new name since its no longer RFM

df_export = df_rfm.drop(index=rm_rfm_idx)


In [None]:
df_export.head()


In [None]:
df_export.info()


## <u>3.2 Delta delivery & Delta Expected/Actual</u>

- ∆Delivery : Average if multiple orders, nan if no orders (?) or input cluster avg.
- ∆Expect/delivery : Average if multiple orders, nan if no orders (?) or input cluster avg.
- Early_on_time : True majority, including 50% , False if under, Nan if no info

In [None]:
df_export.reset_index(drop=True, inplace=True)

# Create the columns with the dtype they will actually hold. Starting from
# float NaN columns and then writing timedeltas / booleans cell by cell is an
# incompatible-dtype assignment (a warning in recent pandas, silent upcasting
# to object in older versions).
# - the two deltas are timedelta columns filled with NaT
# - early_on_time is an object column so it can hold True / False / missing
df_export["delta_delivery"] = pd.to_timedelta(np.full(len(df_export), np.nan))
df_export["expected_reality"] = pd.to_timedelta(np.full(len(df_export), np.nan))
df_export["early_on_time"] = pd.Series(np.nan, index=df_export.index, dtype=object)

distance_fields = ["delta_delivery", "expected_reality", "early_on_time"]

Distance_info = namedtuple("Distance_info", field_names=distance_fields)


In [None]:
def get_time_info(order_list):
    """
    Aggregate the delivery-time variables of one customer over the given orders.

    Parameters
    ----------
    order_list : list-like
        Order ids of the customer, looked up in the module-level df_order_m.

    Returns
    -------
    Distance_info
        - delta_delivery : mean of "delta_delivery" over the orders that have one
        - expected_reality : mean of "expectation_delta" over the orders that have one
        - early_on_time : True if the early/on-time orders are at least as
          numerous as the late ones, False otherwise
        Each field is np.nan when no order carries the information.
    """
    orders = df_order_m[df_order_m["order_id"].isin(order_list)]

    if len(orders) == 0:
        return Distance_info(
            delta_delivery=np.nan,
            expected_reality=np.nan,
            early_on_time=np.nan
            )

    # Series.mean skips missing deltas, so one undelivered order no longer
    # erases the information carried by the customer's other orders.
    delta_delivery = pd.to_timedelta(orders["delta_delivery"]).mean()
    # Averaged from expectation_delta (it used to be averaged from
    # delta_delivery, duplicating the field above).
    expected_reality = pd.to_timedelta(orders["expectation_delta"]).mean()

    # Report "no information" as np.nan rather than NaT.
    if pd.isna(delta_delivery):
        delta_delivery = np.nan
    if pd.isna(expected_reality):
        expected_reality = np.nan

    # Explicit comparisons: the column may hold True / False / None.
    trues_num = int((orders["early_on_time"] == True).sum())  # noqa: E712
    false_num = int((orders["early_on_time"] == False).sum())  # noqa: E712

    if trues_num == 0 and false_num == 0:
        # No delivered order: unknown, not "on time".
        early_on_time = np.nan
    else:
        # Majority vote, ties count as on time.
        early_on_time = trues_num >= false_num

    return Distance_info(
        delta_delivery=delta_delivery,
        expected_reality=expected_reality,
        early_on_time=early_on_time
        )
    

In [None]:
# Only the order list of each customer is needed, so iterate over that column
# instead of materialising every full row with iterrows.
for index, export_orders in df_export["order_id_list"].items():
    distance_info = get_time_info(export_orders)
    df_export.at[index, "delta_delivery"] = distance_info.delta_delivery
    df_export.at[index, "expected_reality"] = distance_info.expected_reality
    df_export.at[index, "early_on_time"] = distance_info.early_on_time


In [None]:
df_export.head()


## <u>3.3 : Deltas distances Cx/seller :</u>
- Average if multiple orders
- Checking for cases with no orders

In [None]:
df_export["distance_cx_seller"] = np.nan


In [None]:
def get_dist_info(order_list):
    """
    Average customer/seller distance of one customer over the given orders.

    Parameters
    ----------
    order_list : list-like
        Order ids of the customer, looked up in the module-level df_order_m.

    Returns
    -------
    float
        Mean of "seller_to_cx_dist" over the orders that have a distance;
        nan when no order is found or none has a distance.
    """
    distances = df_order_m.loc[
        df_order_m["order_id"].isin(order_list), "seller_to_cx_dist"
    ]
    # Series.mean covers the one-order and many-order cases alike, skips
    # missing distances and yields nan on an empty selection; it also avoids
    # calling float() on a Series, which pandas deprecates.
    return float(distances.mean())


In [None]:
# Average customer/seller distance of each customer, from its list of orders.
for index, order_ids in df_export["order_id_list"].items():
    delta_dist = get_dist_info(order_list=order_ids)
    df_export.at[index, "distance_cx_seller"] = delta_dist


In [None]:
df_export.dtypes


Before exporting, we want to ensure that the correct types and formats are enforced. Timedeltas have switched from days to nanoseconds, which is an unnecessary degree of precision that we corrected earlier.
Early/OnTime is boolean with some NaNs - the rest looks okay


In [None]:
# NOTE(review): unit="days" only applies to plain numbers; values that are
# already timedeltas are kept as they are — confirm what the columns hold here.
df_export["delta_delivery"] = pd.to_timedelta(df_export["delta_delivery"], unit="days")
df_export["expected_reality"] = pd.to_timedelta(df_export["expected_reality"], unit="days")
# Nullable boolean dtype: a plain astype(bool) turns every NaN into True, which
# would flag customers without any delivered order as "early or on time".
df_export["early_on_time"] = df_export["early_on_time"].astype("boolean")


In [None]:
df_export.info()


In [None]:
df_export.head()


In [None]:
# Final customer-level dataset: written as a pickle, plus a CSV copy as backup.
export_stem = "../final_datasets/olist_customers"

df_export.to_pickle(path=f"{export_stem}.pkl")
df_export.to_csv(path_or_buf=f"{export_stem}.csv")
