# Load

In [115]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_private_label_data, fetch_sustainability_data, fetch_segment_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import  create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_cooccurrence_matrix_with_recommendations_2
from modeling_2 import get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map
from modeling_3 import replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, reorder_private
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, remove_empty_related_items, drop_spaces

In [116]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [117]:
#m2.replace_low_values()

In [118]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [119]:
# Load items 
df = pd.read_excel('data/AB_Scope_sust.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4284 e-commerce items with their item_cde


In [120]:
#Rename the columns
df.rename(columns={'Item Number': 'item_cde'}, 
          inplace=True)
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


## Fetch segment data

In [121]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [122]:
# getting segment data from db
seg_df = fetch_segment_data(conn, df)
print(f">> Fetched the segment data containing {len(seg_df)} rows")

>> Fetched the segment data containing 4284 rows


In [123]:
#filter Packaging segment
seg_pkg_df=seg_df[seg_df['segment'].str.contains('Print', na=False)]

In [124]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [125]:
# getting transaction data from db
trx_seg_df = fetch_trx_data(conn, seg_pkg_df)
print(f">> Fetched the transaction data containing {len(trx_seg_df)} rows")

>> Fetched the transaction data containing 184077 rows


## If Manufacturer filteration is needed (when scope not given)

In [126]:
#trx_df_new = trx_seg_df[trx_seg_df['mfg_name'].str.contains('3M', na=False)]


In [127]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

In [128]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_seg_df)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 5174 rows


In [129]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 5174 rows


In [130]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 5174 baskets


In [131]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 82733


In [132]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations_2(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 82733/82733 [00:02<00:00, 39396.89it/s]
100%|██████████| 557/557 [00:01<00:00, 405.36it/s]


>> Made item level recommendations for: 557 items


In [133]:
# Convert all values to integers, ignoring NaN
recommendation_df = recommendation_df.apply(pd.to_numeric, errors='coerce')

# Create the desired DataFrame
result_df = pd.DataFrame({
    'item_cde': recommendation_df.index,
    'reco': recommendation_df.apply(lambda row: [int(x) for x in row.dropna()], axis=1)
})



# Filter for sustainable

In [134]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [135]:
# Get sustainable label information
sustainable_df = fetch_sustainability_data(conn, df)
print(f">> Fetched the sustainable label data containing {len(sustainable_df)} rows")

>> Fetched the sustainable label data containing 1842 rows


In [136]:
# Convert sustainable_df['item_cde'] to integers
sustainable_df['item_cde'] = sustainable_df['item_cde'].astype(int)

# Create a set of sustainable items
sustainable_set = set(sustainable_df['item_cde'])

# Filter reco list based on sustainable_df
result_df['reco'] = result_df['reco'].apply(lambda lst: [x for x in lst if x in sustainable_set])

In [137]:
# segment_sustainable items
df_pkg_sustainable = seg_pkg_df[seg_pkg_df['item_cde'].astype(int).isin(sustainable_set)].copy()


In [138]:
# Filter trx_df_pkg and keep 'item_cde' as a string
trx_df_sustainable = trx_seg_df[trx_seg_df['item_cde'].astype(int).isin(sustainable_set)].copy()

# Ensure 'item_cde' remains as string
trx_df_sustainable['item_cde'] = trx_df_sustainable['item_cde'].astype(str)
#trx_df_sustainable

In [139]:
result_df = result_df[result_df['reco'].apply(lambda x: len(x) > 0)]


In [140]:
# Get the cat3 top 5 items
top_5_items_cat3=get_top_5_cat3_items(trx_df_sustainable)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 14


In [141]:
# Get the cat1 top 5 items
top_5_items_cat1=get_top_5_cat1_items(trx_df_sustainable)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 5


In [142]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [143]:
# getting cat3 data from db
conn = connect_db()

cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [144]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [145]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [146]:
# Map item to top item in cat3
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 871 rows


# Cat 3 operations

In [147]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 82733


In [148]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations_2(cat3_basket_365)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 82733/82733 [00:00<00:00, 318708.58it/s]
100%|██████████| 17/17 [00:00<?, ?it/s]

>> Create co-occurance df for cat3 level df Length: 17





In [149]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(seg_pkg_df, cat3_df, result_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 897 rows


In [150]:
# Drop empty space on new_cat3_recommendation_df

result_df_2=drop_spaces(new_cat3_recommendation_df)

In [151]:
result_df_2 = pd.DataFrame({
    'item_cde': new_cat3_recommendation_df.item_cde,
    'reco': new_cat3_recommendation_df.drop(columns=['item_cde'])  # Apply to all other columns
        .apply(lambda row: [
            int(x) if isinstance(x, (str, float, int)) and str(x).strip() != '' else x 
            for x in row.dropna()
        ], axis=1)
})

# Cat 1 Operations

In [152]:
# Make cat1 level baskets for co-occurance
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 82733


In [153]:
# Create co-occurance df for cat1 level
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat1_recommendation_df)}")

100%|██████████| 82733/82733 [00:00<00:00, 543351.06it/s]
100%|██████████| 7/7 [00:00<?, ?it/s]

>> Create co-occurance df for cat3 level df Length: 7





In [154]:
# Map cat1 and add cat3 level recos
new_cat1_recommendation_df = map_and_add_recommendations_cat1(seg_pkg_df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 936 rows


In [155]:
result_df_3 = pd.DataFrame({
    'item_cde': new_cat1_recommendation_df.item_cde,
    'reco': new_cat1_recommendation_df.drop(columns=['item_cde'])  # Apply to all other columns
        .apply(lambda row: [
            int(x) if isinstance(x, (str, float, int)) and str(x).strip() != '' else x 
            for x in row.dropna()
        ], axis=1)
})

In [156]:
import pandas as pd
import itertools


# Ensure reco column contains lists and handle empty lists
result_df_3['reco'] = result_df_3['reco'].apply(lambda x: x if isinstance(x, list) else [])

# Flatten list of lists into a single list
result_df_3['reco'] = result_df_3['reco'].apply(lambda x: list(itertools.chain.from_iterable(x)) if any(isinstance(i, list) for i in x) else x)

# Sort each list and keep the first 5 min values
result_df_3['reco'] = result_df_3['reco'].apply(lambda x: x[:5])

# Convert the lists into separate columns
df_result_expanded = result_df_3.reco.apply(lambda x: pd.Series(x)).rename(columns=lambda i: f'Recommendation {i+1}')

# Merge with original DataFrame
df_3 = pd.concat([result_df_3.drop(columns=['reco']), df_result_expanded], axis=1)



# Reorder alliance, private items

In [157]:
def reorder_alliance_new(df, pvt_a_label_df):
    """
    Reorders the recommendations for each item_cde based on private_label_sw
    and adds columns indicating whether each recommendation is private branded.
    Parameters:
        df: A DataFrame containing 'item_cde' and 'Recommendation 1' to 'Recommendation 15'.
        private_label_df: A DataFrame containing 'item_cde' and 'private_label_sw'.
    Returns:
        reordered_df: A DataFrame with reordered recommendations and private branding info.
    """
    pvt_a_label_df['Item Number'] = pvt_a_label_df['Item Number'].astype(str)

    alliance_brand_dict = {item: 1 if item in pvt_a_label_df['Item Number'].values else 0 for item in df['item_cde']}
    
    print(alliance_brand_dict)
    
    reordered_recommendations = []

    for _, row in df.iterrows():
        item_cde = row['item_cde']
        recommendations = [(row[f'Recommendation {i}'][0] if isinstance(row[f'Recommendation {i}'], list) else row[f'Recommendation {i}'],
        alliance_brand_dict.get(row[f'Recommendation {i}'][0] if isinstance(row[f'Recommendation {i}'], list) else row[f'Recommendation {i}']))
        for i in range(1, 6)]

        
        # Sort recommendations based on private_label_sw ('Y' should come first)
        recommendations.sort(key=lambda x: x[1] != 1)
        
        reordered_row = {'item_cde': item_cde}
        for i, (rec, private_label) in enumerate(recommendations):
            reordered_row[f'Recommendation {i+1}'] = rec
            reordered_row[f'Recommendation {i+1}_alliance'] = private_label
        
        reordered_recommendations.append(reordered_row)
    
    reordered_df = pd.DataFrame(reordered_recommendations)
    return reordered_df

In [158]:
alliance_df=pd.read_excel("data/Alliance + PB Scope - AB.xlsx")

# Re-order private items to top
reorder_private_df1 = reorder_alliance_new(df_3, alliance_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df1)} rows")

{'10298253': 1, '10299390': 1, '10301632': 1, '10302459': 0, '10302519': 1, '10727262': 1, '10727284': 1, '10727287': 1, '10727303': 1, '10727310': 1, '10727314': 1, '10727321': 1, '10727322': 1, '10727324': 1, '10727328': 1, '10727671': 1, '10734887': 1, '10734889': 1, '10734890': 1, '10735671': 1, '10735672': 1, '10735673': 1, '10735674': 1, '10735675': 1, '10735676': 1, '10735677': 1, '10735678': 1, '10735679': 1, '10735680': 1, '10735681': 1, '10735682': 1, '10735683': 1, '10735684': 1, '10735685': 1, '10735686': 1, '10735687': 1, '10735688': 1, '10735689': 1, '10735690': 1, '10735691': 1, '10735692': 1, '10735693': 1, '10735694': 1, '10735695': 1, '10735696': 1, '10735697': 1, '10735698': 1, '10735699': 1, '10735700': 1, '10735701': 1, '10735702': 1, '10735703': 1, '10735704': 1, '10735705': 1, '10735706': 1, '10735707': 1, '10735708': 1, '10735709': 1, '10735710': 1, '10735711': 1, '10735712': 1, '10735713': 1, '10735714': 1, '10735715': 1, '10735716': 1, '10735717': 1, '10735718

In [159]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [160]:
# Get private label information
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4284 rows


In [161]:
def reorder_private_5(df, private_df):
    """
    Reorders the recommendations for each item_cde based on private_label_sw
    and adds columns indicating whether each recommendation is private branded.
    Parameters:
        df: A DataFrame containing 'item_cde' and 'Recommendation 1' to 'Recommendation 15'.
        private_label_df: A DataFrame containing 'item_cde' and 'private_label_sw'.
    Returns:
        reordered_df: A DataFrame with reordered recommendations and private branding info.
    """
    private_label_dict = dict(zip(private_df['item_cde'], private_df['private_label_sw']))
    
    reordered_recommendations = []

    for _, row in df.iterrows():
        item_cde = row['item_cde']
        recommendations = [(row[f'Recommendation {i}'], private_label_dict.get(row[f'Recommendation {i}'], 'N')) 
                           for i in range(1, 6)]
        
        # Sort recommendations based on private_label_sw ('Y' should come first)
        recommendations.sort(key=lambda x: x[1] != 'Y')
        
        reordered_row = {'item_cde': item_cde}
        for i, (rec, private_label) in enumerate(recommendations):
            reordered_row[f'Recommendation {i+1}'] = rec
            reordered_row[f'Recommendation {i+1}_private'] = private_label
        
        reordered_recommendations.append(reordered_row)
    
    reordered_df = pd.DataFrame(reordered_recommendations)
    return reordered_df

In [162]:
# Re-order private items to top
reorder_private_df2 = reorder_private_5(reorder_private_df1, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df2)} rows")

>> Re-ordered the private label data containing 936 rows


# Sanity Check for all recommended items in sustainable

In [163]:
# Convert 'item_cde' column in items_pkg_sust_df to a set for fast lookup
sustainable_items_pkg_set = set(df_pkg_sustainable['item_cde'])

# Define a function to check if all recommended items exist in sustainable_items_set
def all_recos_in_sust(row):
    recommendations = [row[f'Recommendation {i}'] for i in range(1, 6) if pd.notna(row[f'Recommendation {i}'])]
    return all(item in sustainable_items_pkg_set for item in recommendations)

# Apply the check function to each row
reorder_private_df2['check'] = reorder_private_df2.apply(all_recos_in_sust, axis=1)



In [164]:
def remove_duplicate_and_self_references_5(reorder_with_desc):
    """
    Sets duplicate recommendations and self-references to None in the DataFrame.
    
    Parameters:
        reorder_with_desc: A DataFrame containing columns for item codes and their recommendations.
    
    Returns:
        A DataFrame with duplicate recommendations and self-references set to None, keeping only the earliest occurrence.
    """
    recommendation_cols = [
        'Recommendation 1', 'Recommendation 2', 
        'Recommendation 3', 'Recommendation 4', 'Recommendation 5'
    ]

    def remove_duplicates_and_self_refs(row):
        seen = set()
        unique_recommendations = []
        item_cde = row['item_cde']
        
        # Collect unique recommendations
        for col in recommendation_cols:
            recommendation = row[col]
            if recommendation not in seen and recommendation != item_cde:
                seen.add(recommendation)
                unique_recommendations.append(recommendation)
        
        # Fill the row with unique recommendations and shift left if needed
        result = []
        for recommendation in unique_recommendations:
            result.append(recommendation)
        
        # Fill remaining slots with None
        while len(result) < len(recommendation_cols):
            result.append(None)
        
        return pd.Series(result, index=recommendation_cols)

    # Apply the function to each row
    reorder_with_desc[recommendation_cols] = reorder_with_desc.apply(remove_duplicates_and_self_refs, axis=1)

    return reorder_with_desc


In [165]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references_5(reorder_private_df2)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 936 rows


In [166]:
#Shift non empty recommendations to left
reorder_shift_left = shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 936 rows


# Transform for enable

In [167]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(reorder_shift_left)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 4680 rows, which is 5 times 936 (Prev length)


In [168]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 4600 rows


In [169]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10298253,10301632.0
1,10298253,10802527.0
2,10298253,10302519.0
3,10298253,10735816.0
4,10298253,10735815.0
...,...,...
4671,20046068,10298253.0
4672,20046068,10990077.0
4675,20046242,10735718.0
4676,20046242,10298253.0


In [170]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [171]:
nonempty_transformed_df.to_csv('results_mar25/print_sustainable.csv',index=False)