In [None]:
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Resolve the dataset folder from the environment (.env is loaded first so a
# local DATASET_PATH entry is honoured).
load_dotenv()
dataset_path = os.getenv("DATASET_PATH")

if not dataset_path:
    raise ValueError("DATASET_PATH NOT FOUND")

# This script writes its result into the same folder under this name, so the
# file must never be selected as the *input* on a re-run.
generated_output_name = "uapriori_dataset.csv"

print("Files available in dataset:")
data_file = None
# os.listdir returns entries in arbitrary order; sorting makes the selected
# input file deterministic across runs and platforms.
for filename in sorted(os.listdir(dataset_path)):
    print(filename)
    if filename.endswith(".csv") and filename != generated_output_name:
        # As before, the last matching CSV wins (now in sorted order).
        data_file = os.path.join(dataset_path, filename)

if not data_file:
    raise ValueError("No CSV file found in dataset folder")

# Render floats without decimals for every subsequent pandas display
# (this is a global display option, not a change to the stored data).
pd.set_option("display.float_format", "{:.0f}".format)
df = pd.read_csv(data_file)

# --- Shape and a first look at the raw rows ---
print("Shape (rows, columns):", df.shape)
print("\nFirst 5 rows:")
first_rows = df.head()
print(first_rows)

# --- Column dtypes ---
print("\nData types:")
column_dtypes = df.dtypes
print(column_dtypes)

# --- Split columns into numeric vs. categorical-like by dtype ---
numeric_frame = df.select_dtypes(include="number")
categorical_frame = df.select_dtypes(include=["object", "category", "bool"])
numeric_columns = numeric_frame.columns.tolist()
categorical_columns = categorical_frame.columns.tolist()

print("\nNumerical columns:", numeric_columns)
print("Categorical columns:", categorical_columns)

# --- Missing values ---
print("\nMissing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# --- Summary statistics (describe() covers the numeric columns here) ---
print("\nBasic statistics for numerical columns:")
summary_stats = df.describe()
print(summary_stats)

# --- Distinct-value counts for the categorical columns ---
print("\nCardinality (number of unique values) for categorical columns:")
cardinality = df[categorical_columns].nunique()
print(cardinality)

# --- Most frequent values of each categorical column ---
print("\nExample value counts for each categorical column:")
for col in categorical_columns:
    most_frequent = df[col].value_counts().head()
    print(f"\nColumn: {col}")
    print(most_frequent)

# Rows lacking a transaction date or a product subclass cannot be placed in
# the basket matrix, so drop them up front.
df = df.dropna(subset=["TRANSACTION_DT", "PRODUCT_SUBCLASS"])

# Best-effort conversion of the date column. A failure is tolerated (the raw
# values are kept and still work as pivot keys) but is reported instead of
# being silently swallowed.
try:
    df["TRANSACTION_DT"] = pd.to_datetime(df["TRANSACTION_DT"])
except (ValueError, TypeError, OverflowError) as exc:
    print(
        "Warning: could not convert TRANSACTION_DT to datetime "
        f"({exc}); keeping the original values"
    )

# Marker column: every remaining row is an observed (transaction, product) pair.
df["present"] = 1

# One row per distinct TRANSACTION_DT value, one column per PRODUCT_SUBCLASS;
# a cell is 1 if that subclass occurs at least once for that key, else 0.
# NOTE(review): the pivot key is the date alone, so all purchases sharing a
# TRANSACTION_DT value collapse into a single basket row - confirm this is the
# intended notion of "transaction".
basket = df.pivot_table(
    index="TRANSACTION_DT",
    columns="PRODUCT_SUBCLASS",
    values="present",
    aggfunc="max",
    fill_value=0,
)

print("\nCrisp basket (0/1) head:")
print(basket.head())

product_columns = basket.columns.tolist()
n_transactions, n_products = basket.shape

# np.random.rand draws from [0, 1), so each existential probability
# 0.9 - 0.2 * u lies in (0.7, 0.9].
# NOTE(review): the generator is unseeded, so the saved dataset differs on
# every run - seed it if reproducibility is required.
uniform_random = np.random.rand(n_transactions, n_products)
existential_probabilities = 0.9 - 0.2 * uniform_random

# Keep a probability only where the crisp basket has a 1; absent items
# become exactly 0.
presence_mask = basket.to_numpy() == 1
probability_matrix = presence_mask * existential_probabilities

# Rebuild a flat frame: the transaction key as first column, then one
# probability column per product.
df_uapriori = pd.DataFrame(probability_matrix, columns=product_columns)
df_uapriori.insert(0, "TRANSACTION_DT", basket.index)

print("\nU-Apriori formatted data (head):")
# The notebook-wide "{:.0f}" float format would render every probability as
# 0 or 1; use a local 4-decimal format for this preview only.
with pd.option_context("display.float_format", "{:.4f}".format):
    print(df_uapriori.head())

# to_csv is unaffected by display options, so full precision is written.
output_file = os.path.join(dataset_path, "uapriori_dataset.csv")
df_uapriori.to_csv(output_file, index=False)
print("\nU-Apriori dataset saved to:", output_file)

# Sanity checks on the generated U-Apriori frame: structure, value range,
# agreement with the crisp basket, and a summary of the probabilities.
transaction_id_column = "TRANSACTION_DT"
item_columns_u = [c for c in df_uapriori.columns if c != transaction_id_column]

print("\nStructural check: head")
# A local 4-decimal format keeps the probabilities readable; the global
# "{:.0f}" display format would show them all as 0 or 1.
with pd.option_context("display.float_format", "{:.4f}".format):
    print(df_uapriori.head())

print("\nStructural check: info")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df_uapriori.info()

print("\nNumber of items:", len(item_columns_u))
print("Example item columns:", item_columns_u[:10])

# Range / NaN checks across all item columns.
has_nans = df_uapriori[item_columns_u].isna().any().any()
global_min = df_uapriori[item_columns_u].min().min()
global_max = df_uapriori[item_columns_u].max().max()

print("\nNaN check on item columns:", has_nans)
print("Global min over item columns:", float(global_min))
print("Global max over item columns:", float(global_max))

# Only strictly positive cells carry a probability; zeros mean "item absent".
values = df_uapriori[item_columns_u].to_numpy()
positive_values = values[values > 0]

if positive_values.size > 0:
    print("Min positive probability:", float(positive_values.min()))
    print("Max positive probability:", float(positive_values.max()))
else:
    print("No positive probabilities found")

# Thresholding the probabilities at > 0 should reproduce the crisp 0/1
# basket cell for cell.
crisp_from_probabilities = (df_uapriori[item_columns_u] > 0).astype(int)

if crisp_from_probabilities.shape == basket[item_columns_u].shape:
    mismatched_cells = (
        crisp_from_probabilities.to_numpy() != basket[item_columns_u].to_numpy()
    ).sum()
    print("\nMismatched cells between basket and df_uapriori:", int(mismatched_cells))
else:
    print("\nShape mismatch between basket and df_uapriori item matrices")
    print("basket shape:", basket[item_columns_u].shape)
    print("df_uapriori shape:", crisp_from_probabilities.shape)

# Mean / standard deviation of the non-zero probabilities.
if positive_values.size > 0:
    mean_probability = float(positive_values.mean())
    std_probability = float(positive_values.std())
    print("\nProbability distribution summary")
    print("Mean p:", mean_probability)
    print("Std p:", std_probability)
else:
    print("\nProbability distribution summary not computed (no positive values)")



Files available in dataset:
uapriori_dataset.csv
ta_feng_all_months_merged.csv
Shape (rows, columns): (817741, 9)

First 5 rows:
  TRANSACTION_DT  CUSTOMER_ID AGE_GROUP PIN_CODE  PRODUCT_SUBCLASS  \
0      11/1/2000      1104905     45-49      115            110411   
1      11/1/2000       418683     45-49      115            120107   
2      11/1/2000      1057331     35-39      115            100407   
3      11/1/2000      1849332     45-49   Others            120108   
4      11/1/2000      1981995     50-54      115            100205   

      PRODUCT_ID  AMOUNT  ASSET  SALES_PRICE  
0  4710199010372       2     24           30  
1  4710857472535       1     48           46  
2  4710043654103       2    142          166  
3  4710126092129       1     32           38  
4  4710176021445       1     14           18  

Data types:
TRANSACTION_DT      object
CUSTOMER_ID          int64
AGE_GROUP           object
PIN_CODE            object
PRODUCT_SUBCLASS     int64
PRODUCT_ID          