In [69]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_private_label_data, fetch_sustainability_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations
from modeling_2 import get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map
from modeling_3 import  replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, reorder_recommendations
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, remove_empty_related_items

In [21]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [22]:
#m2.replace_low_values()

In [23]:
import warnings

# Suppress all warnings
# NOTE(review): this ignores every warning category for the rest of the
# session, so any deprecation or chained-assignment warning emitted by later
# cells is hidden as well — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [24]:
# Load items
# Only the 'Item Number' column of the scope workbook is read; the path is
# relative to the notebook's working directory.
df = pd.read_excel('data/AB_Scope_sust.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4284 e-commerce items with their item_cde


In [25]:
# Rename 'Item Number' to 'item_cde', the column name used by the rest of
# this notebook. Re-binding `df` (instead of mutating in place) keeps the cell
# safe to re-run.
df = df.rename(columns={'Item Number': 'item_cde'})
print(f">> Renamed the columns to {list(df.columns)}")

>> Renamed the columns to ['item_cde']


In [26]:
# connecting to db
# NOTE(review): connect_db() is called again in several later cells and no
# cell in this notebook closes a handle — confirm whether one shared
# connection would be enough.
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [27]:
# getting transaction data from db
# Passes the open connection and the scoped item frame (column 'item_cde').
# Later cells read the 'segment' and 'item_cde' columns of the result.
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 1298320 rows


## If manufacturer filtering is needed (when scope is not given)

In [28]:
#trx_df_new = trx_df[trx_df['mfg_name'].str.contains('3M', na=False)]


In [29]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

# Filter segment


In [30]:
# Keep only the transactions whose 'segment' value contains 'Facility'.
# Rows with a missing segment are excluded (na=False).
# NOTE(review): the variable keeps its `_pkg` suffix although the filter
# selects 'Facility' — presumably carried over from a Packaging run; it is left
# unchanged because later cells reference this name.
trx_df_pkg = trx_df[trx_df['segment'].str.contains('Facility', na=False)]
print(f">> Size of data for Facility segment is Length: {len(trx_df_pkg)}")

>> Sixe of data for Packaging data is Length: 834912


In [31]:
# Distinct item codes present in the segment-filtered transactions.
items_pkg_sust=trx_df_pkg['item_cde'].unique()

In [32]:
# Normalise every item code to a plain Python int.
items_pkg_sust = list(map(int, items_pkg_sust))

In [33]:
# One-column frame of the integer item codes; used later as the item universe
# for the recommendation mapping and for the sanity check.
items_pkg_sust_df = pd.DataFrame({'item_cde': items_pkg_sust})

In [34]:
# aggregate data at bill-to
# Input is the segment-filtered transaction frame; the recorded run produced
# 8407 aggregated rows.
aggregated_df = multi_aggregate_data(trx_df_pkg)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 8407 rows


In [35]:
# custom column calculation for avg time interval
# Row count is unchanged by this step in the recorded run (8407 in, 8407 out).
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 8407 rows


In [36]:
# make baskets for all possible starting date
# The recorded run yields one basket row per aggregated row (8407).
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 8407 baskets


In [37]:
# flatten the baskets
# The flattened result is the basket list consumed by every co-occurrence step
# below (item, cat3 and cat1 level); 158682 entries in the recorded run.
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 158682


In [38]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

def create_cooccurrence_matrix_with_recommendations_2(baskets):
    """Build a quantity-weighted co-occurrence matrix and ranked recommendations.

    Every unordered pair of distinct items found together in a basket adds
    ``min(quantity_a, quantity_b)`` to that pair's co-occurrence count. Counts
    are stored in both directions, so the resulting matrix is symmetric.

    Parameters
    ----------
    baskets : iterable of dict
        Each basket maps an item identifier to its quantity. Identifiers must
        be mutually sortable.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df``: square matrix indexed and labelled by the sorted items, holding
        the co-occurrence counts (0 where two items never co-occur).
        ``rec_df``: one row per item with columns ``"Recommendation 1"`` ..
        ``"Recommendation N-1"`` (N = number of items) listing the co-occurring
        items by descending count; unused trailing slots are NaN.
    """
    from itertools import combinations

    co_occurrence = defaultdict(lambda: defaultdict(int))
    recommendations = defaultdict(dict)

    # Count each pair once per basket. Sorting fixes which member of the pair
    # is written first, which in turn fixes the dict insertion order below.
    for basket in tqdm(baskets):
        for item1, item2 in combinations(sorted(basket), 2):
            # Direct dict lookup instead of a linear list.index() scan per pair.
            weight = min(basket[item1], basket[item2])
            co_occurrence[item1][item2] += weight
            co_occurrence[item2][item1] += weight

    # Sorted labels keep the rows and columns of both frames aligned.
    items = sorted(co_occurrence.keys())

    # Mirror the counts into `recommendations`. The values equal those in
    # `co_occurrence`; the mirrored insertion order is kept on purpose because
    # it decides how equal counts are ordered by the stable sort further down.
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            recommendations[item1][item2] = count
            recommendations[item2][item1] = count

    # Build the matrix in one shot; the dtype is inferred from the counts
    # rather than relying on fillna() downcasting an all-NaN object frame.
    df = pd.DataFrame(
        [[co_occurrence[row].get(col, 0) for col in items] for row in items],
        index=items,
        columns=items,
    )

    # One recommendation row per item, padded with NaN to a common width.
    slot_count = max(len(items) - 1, 0)
    rec_columns = [f"Recommendation {i+1}" for i in range(slot_count)]
    missing = float("nan")
    rec_rows = []
    for item in items:
        recs = recommendations[item]
        # Remove the queried item from its own recommendations.
        recs.pop(item, None)
        # Highest count first; ties keep their insertion order (stable sort).
        ranked = sorted(recs.items(), key=lambda pair: pair[1], reverse=True)
        row = [rec_item for rec_item, _ in ranked]
        row.extend([missing] * (slot_count - len(row)))
        rec_rows.append(row)

    # dtype=object keeps the item identifiers as-is next to the NaN padding.
    rec_df = pd.DataFrame(rec_rows, index=items, columns=rec_columns, dtype=object)

    return df, rec_df


In [39]:
# make item level recommendations
# Uses the notebook-local `_2` variant defined above (not the imported
# create_cooccurrence_matrix_with_recommendations) on the flattened baskets.
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations_2(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 158682/158682 [00:22<00:00, 7088.24it/s] 
100%|██████████| 1468/1468 [00:10<00:00, 145.24it/s]


>> Made item level recommendations for: 1468 items


In [40]:
# Coerce every recommendation cell to a number; anything non-numeric becomes NaN.
recommendation_df = recommendation_df.apply(pd.to_numeric, errors='coerce')


def _row_to_int_list(row):
    """Return the non-missing values of one recommendation row as ints."""
    return [int(value) for value in row.dropna()]


# One row per item: its code plus the ordered list of recommended item codes.
reco_lists = recommendation_df.apply(_row_to_int_list, axis=1)
result_df = pd.DataFrame({
    'item_cde': recommendation_df.index,
    'reco': reco_lists,
})



# Filter for sustainable

In [41]:
# connecting to db
# Opens a fresh connection and rebinds `conn`; the previously bound handle is
# not closed anywhere in this notebook.
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [42]:
# Get sustainable label information
# Queried for the scoped item frame `df`; the result's 'item_cde' column is
# what the following cells turn into the sustainable-item set.
sustainable_df = fetch_sustainability_data(conn, df)
print(f">> Fetched the sustainable label data containing {len(sustainable_df)} rows")

>> Fetched the sustainable label data containing 1809 rows


In [43]:
# Item codes must be ints so they compare equal to the ints stored in 'reco'.
sustainable_df['item_cde'] = sustainable_df['item_cde'].astype(int)

# Set of sustainable item codes for O(1) membership tests.
sustainable_set = set(sustainable_df['item_cde'])


def _keep_sustainable(codes):
    """Return the codes that are in sustainable_set, preserving their order."""
    return [code for code in codes if code in sustainable_set]


# Drop every non-sustainable item from each recommendation list.
result_df['reco'] = result_df['reco'].apply(_keep_sustainable)


In [44]:
# Drop items whose recommendation list is empty after the sustainable filter.
# An explicit copy is taken because a later cell assigns to result_df['reco'];
# without it that assignment targets a slice of the unfiltered frame.
result_df = result_df[result_df['reco'].apply(lambda x: len(x) > 0)].copy()


In [45]:
# NOTE(review): this cell repeats the earlier sustainable-filter cell statement
# for statement. Re-running it is harmless (the cast and the filter are both
# idempotent), but it looks like a leftover and could be removed.

# Convert sustainable_df['item_cde'] to integers
sustainable_df['item_cde'] = sustainable_df['item_cde'].astype(int)

# Create a set of sustainable items
sustainable_set = set(sustainable_df['item_cde'])

# Filter reco list based on sustainable_df
result_df['reco'] = result_df['reco'].apply(lambda lst: [x for x in lst if x in sustainable_set])

In [46]:
# Filter trx_df_pkg and keep 'item_cde' as a string
# The comparison casts to int because sustainable_set holds ints; .copy()
# detaches the result so the column assignment below does not write to a slice.
trx_df_sustainable = trx_df_pkg[trx_df_pkg['item_cde'].astype(int).isin(sustainable_set)].copy()

# Ensure 'item_cde' remains as string
trx_df_sustainable['item_cde'] = trx_df_sustainable['item_cde'].astype(str)
#trx_df_sustainable

In [47]:
# Get the cat3 top 5 items
# Computed from the sustainable-only transactions, so the candidates come
# from sustainable items; 47 entries in the recorded run.
top_5_items_cat3=get_top_5_cat3_items(trx_df_sustainable)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 47


In [48]:
# Get the cat1 top 5 items
# Same sustainable-only input as the cat3 step; 9 entries in the recorded run.
top_5_items_cat1=get_top_5_cat1_items(trx_df_sustainable)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 9


In [49]:
# NOTE(review): this connection is never used — the next cell calls
# connect_db() again before `conn` is passed to any fetch function.
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [50]:
# getting cat3 data from db
# Reconnects first, then fetches the cat3 data for the scoped items.
# NOTE(review): the recorded output of this cell is a long run of "%s, "
# placeholders — presumably a debug print inside fetch_cat_data; verify and
# silence it if so.
conn = connect_db()

cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [51]:
# Fresh connection for the cat1 fetch in the next cell; rebinds `conn` without
# closing the previous handle.
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [52]:
# getting cat1 data from db
# Returns one row per scoped item in the recorded run (4284 rows, matching the
# number of items loaded from the scope file).
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

>> Fetched the cat1 data containing 4284 rows


In [53]:
# Map item to top item in cat3
# NOTE(review): item_to_cat3_top_map is not referenced by any later cell in
# this notebook — confirm whether it is still needed.
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 1065 rows


# Cat 3 operations

In [54]:
# Make cat3 level baskets for co-occurrence
# Same number of baskets as the item-level list in the recorded run (158682).
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 158682


In [55]:
# Create co-occurrence df for cat3 level
# Reuses the notebook-local `_2` function, this time on cat3-level baskets.
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations_2(cat3_basket_365)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

  1%|          | 1655/158682 [00:00<00:09, 16314.00it/s]

100%|██████████| 158682/158682 [00:07<00:00, 21341.22it/s]
100%|██████████| 117/117 [00:00<00:00, 450.32it/s]


>> Create co-occurance df for cat3 level df Length: 117


In [56]:
# Map cat3 and add cat3 level recos
# Inputs: the segment item universe, the cat3 lookup, the sustainable-filtered
# item-level recos, the cat3-level recos and the cat3 top-5 lists.
new_cat3_recommendation_df = map_and_add_recommendations(items_pkg_sust_df, cat3_df, result_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 1482 rows


In [57]:
def _as_item_code(value):
    """Cast a non-blank str/float/int cell to int; return anything else unchanged."""
    if isinstance(value, (str, float, int)) and str(value).strip() != '':
        return int(value)
    return value


# Every column except 'item_cde' holds recommendation cells.
cat3_reco_columns = new_cat3_recommendation_df.drop(columns=['item_cde'])

# Collapse each row's non-missing recommendation cells into a single list.
result_df_2 = pd.DataFrame({
    'item_cde': new_cat3_recommendation_df.item_cde,
    'reco': cat3_reco_columns.apply(
        lambda row: [_as_item_code(cell) for cell in row.dropna()],
        axis=1,
    ),
})

# Cat 1 Operations

In [58]:
# Make cat1 level baskets for co-occurrence
# The helper named `..._cat3_set` is reused here with the cat1 lookup frame.
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 158682


In [59]:
# Create co-occurrence df for cat1 level
# Unlike the item and cat3 steps, this uses the imported
# create_cooccurrence_matrix_with_recommendations with top_n=5.
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat1 level df Length: {len(cat1_recommendation_df)}")

  0%|          | 0/158682 [00:00<?, ?it/s]

100%|██████████| 158682/158682 [00:01<00:00, 83289.04it/s]
100%|██████████| 12/12 [00:00<00:00, 1461.13it/s]


>> Create co-occurance df for cat3 level df Length: 12


In [60]:
# Map cat1 and add cat1-level recos on top of the product+cat3 frame
# The cat3-enriched frame (new_cat3_recommendation_df) is passed in as the
# base, together with the cat1 lookup, cat1-level recos and cat1 top-5 lists.
new_cat1_recommendation_df = map_and_add_recommendations_cat1(items_pkg_sust_df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 1482 rows


In [61]:
# Collapse each row of the product+cat1+cat3 frame into one 'reco' list.
# Non-blank str/float/int cells are cast to int; any other cell type is kept
# as-is, and missing cells are dropped first.
# NOTE(review): this is the same expression as the result_df_2 cell with a
# different input frame — a shared helper function would remove the copy.
result_df_3 = pd.DataFrame({
    'item_cde': new_cat1_recommendation_df.item_cde,
    'reco': new_cat1_recommendation_df.drop(columns=['item_cde'])  # Apply to all other columns
        .apply(lambda row: [
            int(x) if isinstance(x, (str, float, int)) and str(x).strip() != '' else x 
            for x in row.dropna()
        ], axis=1)
})

In [62]:
import pandas as pd
import itertools


def _flatten_one_level(values):
    """Return `values` with every nested list spliced in place (one level deep).

    Scalars are kept as they are, so a row that mixes scalars and lists is
    handled too. Feeding such a row to itertools.chain.from_iterable would
    raise on a non-iterable scalar and would split a string into characters.
    """
    flat = []
    for value in values:
        if isinstance(value, list):
            flat.extend(value)
        else:
            flat.append(value)
    return flat


# Ensure reco column contains lists and handle empty lists
result_df_3['reco'] = result_df_3['reco'].apply(lambda x: x if isinstance(x, list) else [])

# Flatten list of lists into a single list
result_df_3['reco'] = result_df_3['reco'].apply(_flatten_one_level)

# Keep the first 5 entries of each list in their existing order (no sorting is
# applied); 5 matches the "Recommendation 1".."Recommendation 5" columns that
# later cells read.
result_df_3['reco'] = result_df_3['reco'].apply(lambda x: x[:5])

# Convert the lists into separate columns
df_result_expanded = result_df_3.reco.apply(lambda x: pd.Series(x)).rename(columns=lambda i: f'Recommendation {i+1}')

# Merge with original DataFrame
df_3 = pd.concat([result_df_3.drop(columns=['reco']), df_result_expanded], axis=1)



# Reorder private items

In [63]:
# Fresh connection for the private-label fetch in the next cell; rebinds `conn`
# without closing the previous handle.
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [64]:
# Get private label information
# The result carries a 'private_label_sw' column (values 'Y' / 'N' in the
# recorded run), inspected in the next cell.
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4284 rows


In [65]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    2568
Y    1715
Name: count, dtype: int64

In [66]:
# Re-order private items to top
# NOTE(review): reorder_private_df is not consumed by any later cell — the
# clean-up step below is fed df_3 instead. Confirm whether the private-label
# ordering is meant to reach the exported file.
reorder_private_df = reorder_recommendations(df_3, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 1482 rows


# Sanity check: all recommended items should be sustainable

In [67]:
# Convert 'item_cde' column in items_pkg_sust_df to a set for fast lookup
# NOTE(review): items_pkg_sust_df holds every item code found in the
# segment-filtered transactions, not only those in sustainable_set — so this
# check is presumably weaker than the section title suggests; verify.
sustainable_items_set = set(items_pkg_sust_df['item_cde'])

# Row-wise predicate: are all filled recommendation slots known items?
def all_recos_in_sust(row):
    """Return True when every non-missing 'Recommendation 1'..'Recommendation 5'
    value of `row` is a member of the module-level sustainable_items_set.

    A row whose five slots are all missing yields True.
    """
    filled = []
    for slot in range(1, 6):
        value = row[f'Recommendation {slot}']
        if pd.notna(value):
            filled.append(value)
    return not any(value not in sustainable_items_set for value in filled)

# Apply the check function to each row
# NOTE(review): the result is stored as a 'check' column on df_3 and is not
# inspected by any later cell. Because df_3 itself is passed to the clean-up
# step that follows, the extra column travels with it — confirm that is
# intended, or assert on the result here instead.
df_3['check'] = df_3.apply(all_recos_in_sust, axis=1)



In [70]:
# Remove duplicate recommendations and self-references
# NOTE(review): the input is df_3 (which now also carries the 'check' column),
# not the private-label re-ordered frame computed earlier — verify this is the
# intended input.
reorder_df_cleaned = remove_duplicate_and_self_references(df_3)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 1482 rows


In [71]:
# Shift non-empty recommendations to the left
# Row count is unchanged by this step in the recorded run (1482).
reorder_shift_left = shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 1482 rows


# Transform for enable

In [72]:
# Transform the recommendations row-wise
# The "5 times" wording in the message is hard-coded text; in the recorded run
# it holds (7410 = 5 x 1482).
transformed_df = transform_recommendations(reorder_shift_left)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 7410 rows, which is 5 times 1482 (Prev length)


In [73]:
# Remove empty items from related items column
# The recorded run drops 301 rows here (7410 -> 7109).
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 7109 rows


In [74]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10012415,10098323
1,10012415,10546160
2,10012415,10943252
3,10012415,10097190
4,10012415,10049812
...,...,...
7403,20051712,10568358
7404,20051712,10098323
7406,20036187,10169745
7407,20036187,10477987


In [78]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [75]:
# Write the final primary/related item pairs to the working directory, without
# the DataFrame index. An existing file of the same name is overwritten.
nonempty_transformed_df.to_csv('FS_SUSTAINABLE_RECOS.csv',index=False)