In [1]:
import math
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel, BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw CSV and drop every row that has a missing value in any column.
df = pd.read_csv("../../Data.csv").dropna()

In [3]:
def transform_df(df):
    """Label every purchase row with whether the same customer returned that item.

    Rows with a positive ``Quantity`` are treated as purchases and rows with a
    negative ``Quantity`` as returns; rows with a zero quantity are dropped.
    A purchase is flagged when at least one return row exists for the same
    (``Customer ID``, ``StockCode``) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Quantity``, ``Customer ID`` and ``StockCode``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The purchase rows (all original columns, fresh 0..n-1 index) plus an
        integer ``is_returned`` column holding 1 for returned pairs, else 0.
    """
    key_cols = ['Customer ID', 'StockCode']

    purchases = df[df['Quantity'] > 0]

    # Only the *existence* of a return for a key pair matters, so a de-duplicated
    # key table is enough; summing returned quantities was never used downstream.
    # Pairs with a missing key are excluded, and uniqueness of the pairs guarantees
    # the left merge below cannot multiply purchase rows.
    returned_pairs = (
        df.loc[df['Quantity'] < 0, key_cols]
        .dropna()
        .drop_duplicates()
        .assign(is_returned=1)
    )

    result = purchases.merge(returned_pairs, on=key_cols, how='left')

    # Purchases without a matching return come back as NaN: mark them 0 (no return).
    result['is_returned'] = result['is_returned'].fillna(0).astype(int)

    return result

# Build the purchase-level frame, with its binary `is_returned` label, from the
# NaN-free data loaded earlier.
transformed_df = transform_df(df)

In [4]:
from functools import partial

# Quartile cut points (25th, 50th and 75th percentiles) of Quantity and Price,
# as plain NumPy arrays; each feature is later snapped to one of these three values.
# NOTE(review): the quartiles are computed on the whole frame, before the
# train/test split performed in a later cell, so test rows influence the bin
# values — consider deriving them from the training split only.
q_quantiles = transformed_df['Quantity'].quantile([0.25, 0.5, 0.75]).values
p_quantiles = transformed_df['Price'].quantile([0.25, 0.5, 0.75]).values

def round_to_quantile(quantiles, x):
    """Snap ``x`` to the closest value in ``quantiles``.

    Parameters
    ----------
    quantiles : array-like
        One-dimensional collection of candidate values (NumPy array, list, ...).
    x : scalar or array-like
        Value(s) to discretise. Arrays are handled element-wise via broadcasting.

    Returns
    -------
    NumPy scalar or numpy.ndarray
        For scalar ``x`` the nearest candidate; for array ``x`` an array of the
        same shape holding the nearest candidate for each element. On ties the
        candidate that appears first in ``quantiles`` wins.

    Raises
    ------
    ValueError
        If ``quantiles`` is empty (there is no minimum to take).
    """
    candidates = np.asarray(quantiles)
    values = np.asarray(x)
    # A trailing axis on `values` lets every element be compared against all
    # candidates at once; for a scalar this reduces to the plain 1-D case.
    distances = np.abs(candidates - values[..., np.newaxis])
    return candidates[distances.argmin(axis=-1)]

# Discretise both features: every value is replaced by the nearest of its
# column's three quartile cut points.
transformed_df["Quantity_rounded"] = transformed_df["Quantity"].apply(
    lambda quantity: round_to_quantile(q_quantiles, quantity)
)
transformed_df["Price_rounded"] = transformed_df["Price"].apply(
    lambda price: round_to_quantile(p_quantiles, price)
)

In [5]:
# Network structure: the two discretised features are the parents of the
# return flag, with no edge between the features themselves.
model = BayesianNetwork(
    [
        ('Price_rounded', 'is_returned'),
        ('Quantity_rounded', 'is_returned'),
    ]
)

In [6]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and split them into train and test partitions; the fixed
# random_state makes the split reproducible. No test_size is passed, so the
# library default proportion applies.
# NOTE(review): the split is not stratified on `is_returned` — confirm that is intended.
df_train, df_test = train_test_split(transformed_df, random_state=45, shuffle=True)

In [7]:
# Learn the network's conditional probability tables from the training rows
# using the maximum-likelihood estimator.
model.fit(df_train, estimator=MaximumLikelihoodEstimator)

In [8]:
# Show each learned conditional probability table under a header naming its node.
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable), cpd, sep="\n")

CPD of Price_rounded:
+---------------------+----------+
| Price_rounded(1.25) | 0.378163 |
+---------------------+----------+
| Price_rounded(1.95) | 0.259491 |
+---------------------+----------+
| Price_rounded(3.75) | 0.362345 |
+---------------------+----------+
CPD of is_returned:
+------------------+-----+------------------------+
| Price_rounded    | ... | Price_rounded(3.75)    |
+------------------+-----+------------------------+
| Quantity_rounded | ... | Quantity_rounded(12.0) |
+------------------+-----+------------------------+
| is_returned(0)   | ... | 0.859185378076942      |
+------------------+-----+------------------------+
| is_returned(1)   | ... | 0.14081462192305802    |
+------------------+-----+------------------------+
CPD of Quantity_rounded:
+------------------------+----------+
| Quantity_rounded(2.0)  | 0.47149  |
+------------------------+----------+
| Quantity_rounded(6.0)  | 0.159741 |
+------------------------+----------+
| Quantity_rounded(12.0) | 0.3

In [13]:
# Predict for the held-out rows, passing only the two parent feature columns so
# the true `is_returned` label is withheld from the model.
y_pred = model.predict(df_test[["Price_rounded", "Quantity_rounded"]])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10739.33it/s]


In [15]:
from sklearn.metrics import accuracy_score
# Share of held-out rows whose prediction matches the true `is_returned` label.
# NOTE(review): accuracy alone can look high if `is_returned` is imbalanced —
# consider comparing against a majority-class baseline or reporting
# precision/recall as well.
accuracy_score(df_test["is_returned"], y_pred)

0.9431677730737611