In [65]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_item_descriptions, fetch_private_label_data, fetch_sustainability_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets, create_baskets_365, create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_pair_frequency_matrix,add_freq
from modeling_2 import replace_low_values,get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map, replace_recommendations
from modeling_3 import filter_mfg_name, replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, filter_print_segment, reorder_recommendations, add_descriptions
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, copy_rows_with_0_to_2_recommendation, copy_rows_with_0_or_1_recommendation, add_recommendations2, minimum_three_recommendations, are_values_unique, remove_empty_related_items

In [66]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [67]:
#m2.replace_low_values()

In [68]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [69]:
# Load items 
df = pd.read_excel('data/AB_Scope_latest.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4284 e-commerce items with their item_cde


In [70]:
df

Unnamed: 0,Item Number
0,10735678
1,10755108
2,10735704
3,10735679
4,10735697
...,...
4279,20087303
4280,20087304
4281,20087154
4282,20088556


In [71]:
# Rename the columns
df.rename(columns={'Item Number': 'item_cde'}, 
          inplace=True)
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


In [72]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [73]:
# getting transaction data from db
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 1338033 rows


## If Manufacturer filteration is needed (when scope not given)

In [74]:
#trx_df_new = trx_df[trx_df['mfg_name'].str.contains('3M', na=False)]


In [75]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

In [76]:
#Get only the Print df entries
trx_df_nonpr = filter_print_segment(trx_df)
print(f">> Sixe of data for non print  data is Length: {len(trx_df)}")

>> Sixe of data for non print  data is Length: 1338033


In [77]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_df_nonpr)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 5174 rows


In [78]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 5174 rows


In [79]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 5174 baskets


In [80]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 82733


In [81]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 82733/82733 [00:01<00:00, 41758.54it/s]
100%|██████████| 557/557 [00:01<00:00, 402.64it/s]


>> Made item level recommendations for: 557 items


In [82]:
# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(flattened_basket_list_365)
print(f">> Created pair wise frequency matrix of Length: {len(pair_freq_matrix)}")

>> Created pair wise frequency matrix of Length: 563


In [83]:
#add freq
recommendation_df=add_freq(recommendation_df,pair_freq_matrix)
print(f">>Added freq to the df {len(recommendation_df)}")

>>Added freq to the df 557


In [84]:
recommendation_df['item_cde'][0]


'10298253'

In [85]:
#Replace recommendations with low basket count with the top cat 3 item
recommendation_df=replace_low_values(recommendation_df,100)
print(f">>Replaced items of baskets with low thresholds: {len(recommendation_df)} rows")

>>Replaced items of baskets with low thresholds: 557 rows


In [86]:
# Get the cat3 top 5 items
top_5_items_cat3=get_top_5_cat3_items(trx_df)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 246


In [87]:
# Get the cat1 top 5 items
top_5_items_cat1=get_top_5_cat1_items(trx_df)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 32


In [88]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [89]:
# getting cat3 data from db
conn = connect_db()

cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [90]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [91]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [92]:
# Map item to top item in cat3
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 4239 rows


In [93]:
item_to_cat3_top_map

{10735678: '10735718',
 10755108: '10755118',
 10735704: '10735718',
 10735679: '10735718',
 10735697: '10735718',
 10755124: '10755118',
 10755135: '10755118',
 10735680: '10735718',
 10735698: '10735718',
 10735681: '10735718',
 10738641: '10735784',
 10755136: '10755118',
 10755125: '10755118',
 10735699: '10735718',
 10735682: '10735718',
 10735683: '10735718',
 10755137: '10755118',
 10755126: '10755118',
 10735705: '10735718',
 10735684: '10735718',
 10755127: '10755118',
 10735706: '10735718',
 10735707: '10735718',
 10755138: '10755118',
 10755139: '10755118',
 10755140: '10755118',
 10735708: '10735718',
 10755141: '10755118',
 10735709: '10735718',
 10735710: '10735718',
 10735711: '10735718',
 10735712: '10735718',
 10735713: '10735718',
 10735714: '10735718',
 10735715: '10735718',
 10735716: '10735718',
 10735717: '10735718',
 10735718: '10735718',
 10735719: '10735718',
 10735720: '10735718',
 10735721: '10735718',
 10735723: '10735784',
 10735770: '10735784',
 10755167: 

In [94]:
# Replace items with low freq with top items

#new_recommendation_df=replace_recommendations(recommendation_df,item_to_cat3_top_map)
#print(f">> Replacde items with low freq with top items with {len(new_recommendation_df)} rows")

In [95]:
recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10298253,10301632,10802527,10302519,10735816,10735815,1303.0,154.0,692.0,285.0,159.0
1,10299390,10298253,10735814,10735815,10735817,10735816,281.0,CAT,CAT,CAT,CAT
2,10301632,10298253,10802527,10302519,10735817,10766056,1303.0,424.0,569.0,320.0,162.0
3,10302459,11136851,11136857,10735815,10802527,10735814,CAT,CAT,CAT,CAT,CAT
4,10302519,10298253,10301632,10802527,10299390,11136851,692.0,569.0,189.0,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
552,20099041,10766059,11142581,10735695,20015144,20099031,CAT,CAT,CAT,CAT,CAT
553,20099043,10735704,10735703,10735707,10735833,20099029,CAT,CAT,CAT,CAT,CAT
554,20099045,10755134,20099016,20099041,20099031,20099046,CAT,CAT,CAT,CAT,CAT
555,20099046,10735869,20099045,10735679,20099030,20099022,CAT,CAT,CAT,CAT,CAT


# Cat 3 operations

In [96]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 82733


In [97]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat3_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 82733/82733 [00:00<00:00, 432251.71it/s]
100%|██████████| 17/17 [00:00<00:00, 1082.65it/s]

>> Create co-occurance df for cat3 level df Length: 17





In [98]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(df, cat3_df, recommendation_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 897 rows


In [99]:
new_cat3_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10298253,10301632,10802527,10302519,10735816,10735815,1303.0,154.0,692.0,285.0,159.0
1,10299390,10298253,10735814,10735815,10735817,10735816,281.0,CAT,CAT,CAT,CAT
2,10301632,10298253,10802527,10302519,10735817,10766056,1303.0,424.0,569.0,320.0,162.0
3,10302459,11136851,11136857,10735815,10802527,10735814,CAT,CAT,CAT,CAT,CAT
4,10302519,10298253,10301632,10802527,10299390,11136851,692.0,569.0,189.0,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
892,20029846,10735784,10298253,10872574,10766056,10735718,,,,,
893,20029894,10735784,10298253,10872574,10766056,10735718,,,,,
894,20030362,10735784,10298253,10872574,10766056,10735718,,,,,
895,20030380,10735784,10298253,10872574,10766056,10735718,,,,,


In [100]:
#new_cat3_recommendation_df.to_csv('new_cat3_recommendation_df_with_item_cde.csv',index=False)

# Cat 1 Operations

In [101]:
# Make cat1 level baskets for co-occurance
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 82733


In [102]:
# Create co-occurance df for cat1 level
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat1_recommendation_df)}")

100%|██████████| 82733/82733 [00:00<00:00, 511279.37it/s]
100%|██████████| 7/7 [00:00<?, ?it/s]

>> Create co-occurance df for cat3 level df Length: 7





In [103]:
cat1_recommendation_df

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
Coated Freesheet,Uncoated Freesheet,Envelopes,Pressure Sensitive,Synthetics,Wide Format
Envelopes,Coated Freesheet,Uncoated Freesheet,Pressure Sensitive,Synthetics,Wide Format
Pressure Sensitive,Coated Freesheet,Uncoated Freesheet,Envelopes,Synthetics,Wide Format
Specialty,Coated Freesheet,Uncoated Freesheet,Pressure Sensitive,,
Synthetics,Coated Freesheet,Uncoated Freesheet,Envelopes,Pressure Sensitive,Wide Format
Uncoated Freesheet,Coated Freesheet,Envelopes,Pressure Sensitive,Synthetics,Wide Format
Wide Format,Uncoated Freesheet,Coated Freesheet,Envelopes,Pressure Sensitive,Synthetics


In [104]:
# Map cat1 and add cat3 level recos
new_cat1_recommendation_df = map_and_add_recommendations_cat1(df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 936 rows


In [105]:
new_cat1_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10298253,10301632,10802527,10302519,10735816,10735815,1303.0,154.0,692.0,285.0,159.0
1,10299390,10298253,10735814,10735815,10735817,10735816,281.0,CAT,CAT,CAT,CAT
2,10301632,10298253,10802527,10302519,10735817,10766056,1303.0,424.0,569.0,320.0,162.0
3,10302459,11136851,11136857,10735815,10802527,10735814,CAT,CAT,CAT,CAT,CAT
4,10302519,10298253,10301632,10802527,10299390,11136851,692.0,569.0,189.0,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
931,20014557,10735718,10298253,10990077,,,,,,,
932,20046242,10735718,10298253,10990077,,,,,,,
933,20046068,10735718,10298253,10990077,,,,,,,
934,20045610,10735718,10298253,10990077,,,,,,,


# Reorder private items

In [106]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [107]:
# Get private label information
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4284 rows


In [108]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    2568
Y    1715
Name: count, dtype: int64

In [109]:
# Re-order private items to top
reorder_private_df = reorder_recommendations(new_cat1_recommendation_df, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 936 rows


In [110]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [111]:
# Get item descriptions
item_desc_df=fetch_item_descriptions(conn,df)
print(f">>Fetched item descriptions data containing {len(reorder_private_df)} rows")

>>Fetched item descriptions data containing 936 rows


In [112]:
#Select columns
reorder_df=reorder_private_df[['item_cde','Recommendation 1','Recommendation 2','Recommendation 3','Recommendation 4','Recommendation 5']]

In [113]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references(reorder_df)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 936 rows


In [114]:
#Shift non empty recommendations to left
reorder_shift_left=shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 936 rows


# Atleast 2 recommendation (Katie's request)

In [115]:
#Check # of rows with less trhan 2 recommendation
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(reorder_shift_left)
print(f">>Items with less than 2 recommendations are Length: {len(rows_with_0_or_1_recommendation)} rows")

>>Items with less than 2 recommendations are Length: 2 rows


In [116]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(reorder_shift_left)
print(f">>Items with less than 3 recommendations are Length: {len(rows_with_0_to_2_recommendation)} rows")

>>Items with less than 3 recommendations are Length: 8 rows


In [117]:
#Make atleast 2 recommendations
#updated_reorder_df = add_recommendations2(reorder_shift_left, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
#print(f">>Atleast two recommendations added for data with {len(updated_reorder_df)} rows")

In [118]:
updated_reorder_df = minimum_three_recommendations(reorder_df, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
print(f">>Atleast three recommendations added for data with {len(updated_reorder_df)} rows")

>>Atleast three recommendations added for data with 936 rows


# Check if updated df has any less than 2 reco rows

In [119]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(updated_reorder_df)
print(f">>Length of data with less than 2 recommendations {len(rows_with_0_to_2_recommendation)} rows")

>>Length of data with less than 2 recommendations 0 rows


In [120]:
# Add unique check column to ensure every recommendation is unique in a given row
updated_reorder_df['unique_check'] = updated_reorder_df.apply(are_values_unique, axis=1)
print(f">>Unique check column added for data with {len(updated_reorder_df)} rows")

>>Unique check column added for data with 936 rows


In [121]:
#updated_reorder_df.to_csv('unique_updated_reorder_df_min2_recos.csv')

In [122]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(updated_reorder_df)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 4680 rows, which is 5 times 936 (Prev length)


In [123]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 4647 rows


In [124]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10298253,10301632
1,10298253,10802527
2,10298253,10302519
3,10298253,10735816
4,10298253,10735815
...,...,...
4671,20045610,10298253
4672,20045610,10990077
4675,20045342,10735718
4676,20045342,10298253


In [125]:
type(nonempty_transformed_df['Primary Item Number'][1])

str

In [126]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [127]:
def filter_primary_item_numbers(df, item_numbers):
    # Filter the DataFrame to include only rows where 'Primary Item Number' is in the item_numbers list
    filtered_df = df[df['Primary Item Number'].isin(item_numbers)]
    return filtered_df

# Define the list of item numbers
item_numbers = ['20121463', '20121758'


]

nonempty_transformed_filtered_df = filter_primary_item_numbers(nonempty_transformed_df, item_numbers)
print(nonempty_transformed_filtered_df)

Empty DataFrame
Columns: [Primary Item Number, Related Item Number]
Index: []


In [None]:
nonempty_transformed_filtered_df.to_csv('results_mar25/print_overall_results.csv',index=False)

In [75]:
# Add item descriptions
updated_reorder_df_with_desc=add_descriptions(item_desc_df,updated_reorder_df)
print(f">>Added item descriptions to data containing {len(updated_reorder_df_with_desc)} rows")

>>Added item descriptions to data containing 96 rows
