# Load

In [None]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_item_descriptions, fetch_private_label_data, fetch_mfg_name_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_pair_frequency_matrix,add_freq
from modeling_2 import replace_low_values,get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map, replace_recommendations
from modeling_3 import replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, filter_pkg_segment, reorder_recommendations, add_descriptions
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, copy_rows_with_0_to_2_recommendation, copy_rows_with_0_or_1_recommendation, add_recommendations2, minimum_three_recommendations, are_values_unique, remove_empty_related_items

In [198]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [199]:
#m2.replace_low_values()

In [200]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [201]:
# Load items 
df = pd.read_excel('data/AB_Scope_latest.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4284 e-commerce items with their item_cde


In [202]:
# Rename the columns
df.rename(columns={'Item Number': 'item_cde'}, 
          inplace=True)
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


In [203]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [204]:
# getting transaction data from db
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 1338033 rows


## If Manufacturer filteration is needed (when scope not given)

In [207]:
#trx_df_new = trx_df[trx_df['mfg_name'].str.contains('3M', na=False)]
#uniq=trx_df_new['item_cde'].value_counts()


In [143]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

In [None]:
#Get only the non Print df entries
trx_df_nonpr = filter_pkg_segment(trx_df)
print(f">> Sixe of data for non print  data is Length: {len(trx_df)}")

>> Sixe of data for non print  data is Length: 22366


In [145]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_df_nonpr)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 1550 rows


In [146]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 1550 rows


In [147]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 1550 baskets


In [148]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 14118


In [149]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 14118/14118 [00:00<00:00, 558154.64it/s]
100%|██████████| 118/118 [00:00<00:00, 7267.98it/s]

>> Made item level recommendations for: 118 items





In [150]:
# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(flattened_basket_list_365)
print(f">> Created pair wise frequency matrix of Length: {len(pair_freq_matrix)}")

>> Created pair wise frequency matrix of Length: 138


In [151]:
#add freq
recommendation_df=add_freq(recommendation_df,pair_freq_matrix)
print(f">>Added freq to the df {len(recommendation_df)}")

>>Added freq to the df 118


In [152]:
recommendation_df['item_cde'][0]


'10029049'

In [153]:
#Replace recommendations with low basket count with the top cat 3 item
recommendation_df=replace_low_values(recommendation_df,100)
print(f">>Replaced items of baskets with low thresholds: {len(recommendation_df)} rows")

>>Replaced items of baskets with low thresholds: 118 rows


In [154]:
# Get the cat3 top 5 items
top_5_items_cat3=get_top_5_cat3_items(trx_df_new)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 29


In [155]:
# Get the cat1 top 5 items
top_5_items_cat1=get_top_5_cat1_items(trx_df_new)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 8


In [156]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [157]:
# getting cat3 data from db
conn = connect_db()

cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [158]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [159]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [160]:
# Map item to top item in cat3
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 501 rows


In [161]:
item_to_cat3_top_map

{10863215: '10480459',
 10864461: '11152963',
 10563810: '10480459',
 10631239: '11152963',
 10540759: '11152963',
 11114344: '10480459',
 10261989: '10480459',
 10262110: '11152963',
 10542533: '10480459',
 11108998: '10480459',
 10533874: '10480459',
 10787413: '10480459',
 11153840: '10480459',
 10787426: '10480459',
 11079423: '10480459',
 10844688: '10480459',
 10844710: '10480459',
 10844729: '10480459',
 10735257: '10480459',
 10161608: '10480459',
 10059527: '10480459',
 10059531: '10480459',
 10043023: '10480459',
 11154822: '10480459',
 11136701: '10480459',
 11152450: '10480459',
 11152963: '11152963',
 10107339: '11152963',
 10733559: '10480459',
 11119999: '10480459',
 10089883: '11152963',
 10360972: '10480459',
 10999702: '10480459',
 10770803: '10480459',
 10770896: '10480459',
 10728849: '10480459',
 10734843: '10480459',
 10113721: '10480459',
 10731351: '11152963',
 10717864: '10480459',
 10715626: '11152963',
 10631935: '10480459',
 10094247: '10480459',
 10980780: 

In [162]:
# Replace items with low freq with top items

#new_recommendation_df=replace_recommendations(recommendation_df,item_to_cat3_top_map)
#print(f">> Replacde items with low freq with top items with {len(new_recommendation_df)} rows")

In [163]:
recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10029049,20021209,11090419,10036270,10036223,,CAT,CAT,CAT,CAT,CAT
1,10035501,10035502,10036223,20046533,10035503,10038836,CAT,CAT,CAT,CAT,CAT
2,10035502,10035501,10035504,20086846,10035503,10036270,CAT,CAT,CAT,CAT,CAT
3,10035503,10038136,10035501,10035502,10035504,10038137,CAT,CAT,CAT,CAT,CAT
4,10035504,10035502,10036270,10038135,10038137,10038134,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
113,20085935,10036223,,,,,CAT,CAT,CAT,CAT,CAT
114,20086839,20086846,10035502,10958533,,,CAT,CAT,CAT,CAT,CAT
115,20086846,10035502,20086839,,,,CAT,CAT,CAT,CAT,CAT
116,20094331,10304707,10688017,10772812,10585204,10943253,CAT,CAT,CAT,CAT,CAT


# Cat 3 operations

In [164]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 14118


In [165]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat3_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 14118/14118 [00:00<00:00, 535879.17it/s]
100%|██████████| 22/22 [00:00<00:00, 3109.09it/s]

>> Create co-occurance df for cat3 level df Length: 22





In [166]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(df, cat3_df, recommendation_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 339 rows


In [167]:
new_cat3_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10029049,20021209,11090419,10036270,10036223,,CAT,CAT,CAT,CAT,CAT
1,10035501,10035502,10036223,20046533,10035503,10038836,CAT,CAT,CAT,CAT,CAT
2,10035502,10035501,10035504,20086846,10035503,10036270,CAT,CAT,CAT,CAT,CAT
3,10035503,10038136,10035501,10035502,10035504,10038137,CAT,CAT,CAT,CAT,CAT
4,10035504,10035502,10036270,10038135,10038137,10038134,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
334,20117940,20021209,10036584,10958533,10038049,10035734,,,,,
335,20070165,10038031,10058192,10304707,10474083,10698827,,,,,
336,20076005,10038049,10036584,10058192,20021209,10037717,,,,,
337,11036846,10474083,10058192,10304707,20021209,10958533,,,,,


In [168]:
#new_cat3_recommendation_df.to_csv('new_cat3_recommendation_df_with_item_cde.csv',index=False)

# Cat 1 Operations

In [169]:
# Make cat1 level baskets for co-occurance
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 14118


In [170]:
# Create co-occurance df for cat1 level
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat1_recommendation_df)}")

100%|██████████| 14118/14118 [00:00<00:00, 865934.28it/s]
100%|██████████| 4/4 [00:00<?, ?it/s]

>> Create co-occurance df for cat3 level df Length: 4





In [171]:
cat1_recommendation_df

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
Ancillary Packaging,Tapes,Labeling / Identification,,,
Dispensers,Tapes,Labeling / Identification,,,
Labeling / Identification,Tapes,Dispensers,Ancillary Packaging,,
Tapes,Labeling / Identification,Dispensers,Ancillary Packaging,,


In [172]:
# Map cat1 and add cat3 level recos
new_cat1_recommendation_df = map_and_add_recommendations_cat1(df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 425 rows


In [173]:
new_cat1_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10029049,20021209,11090419,10036270,10036223,,CAT,CAT,CAT,CAT,CAT
1,10035501,10035502,10036223,20046533,10035503,10038836,CAT,CAT,CAT,CAT,CAT
2,10035502,10035501,10035504,20086846,10035503,10036270,CAT,CAT,CAT,CAT,CAT
3,10035503,10038136,10035501,10035502,10035504,10038137,CAT,CAT,CAT,CAT,CAT
4,10035504,10035502,10036270,10038135,10038137,10038134,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
420,20049301,10058192,10037717,,,,,,,,
421,20071471,10058192,10037717,,,,,,,,
422,20082716,10058192,10037717,,,,,,,,
423,10276747,10058192,10037717,,,,,,,,


# Reorder private items

In [174]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [None]:
# Get private label information
private_l_df = fetch_mfg_name_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4278 rows


In [None]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    2565
Y    1712
Name: count, dtype: int64

In [None]:
# Re-order private items to top
reorder_private_df = reorder_recommendations(new_cat1_recommendation_df, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 425 rows


In [178]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [179]:
# Get item descriptions
item_desc_df=fetch_item_descriptions(conn,df)
print(f">>Fetched item descriptions data containing {len(reorder_private_df)} rows")

>>Fetched item descriptions data containing 425 rows


In [180]:
#Select columns
reorder_df=reorder_private_df[['item_cde','Recommendation 1','Recommendation 2','Recommendation 3','Recommendation 4','Recommendation 5']]

In [181]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references(reorder_df)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 425 rows


In [182]:
#Shift non empty recommendations to left
reorder_shift_left=shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 425 rows


## Reorder Alliance partner

In [None]:
conn = connect_db()
print(">> Connected to database")

In [None]:
# Get private label information
alliance_df = fetch_mfg_name_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

In [None]:
# Re-order private items to top
reorder_mfg_df = reorder_recommendations(reorder_shift_left, alliance_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

# Atleast 2 recommendation (Katie's request)

In [183]:
#Check # of rows with less trhan 2 recommendation
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(reorder_shift_left)
print(f">>Items with less than 2 recommendations are Length: {len(rows_with_0_or_1_recommendation)} rows")

>>Items with less than 2 recommendations are Length: 28 rows


In [184]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(reorder_shift_left)
print(f">>Items with less than 3 recommendations are Length: {len(rows_with_0_to_2_recommendation)} rows")

>>Items with less than 3 recommendations are Length: 39 rows


In [185]:
#Make atleast 2 recommendations
#updated_reorder_df = add_recommendations2(reorder_shift_left, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
#print(f">>Atleast two recommendations added for data with {len(updated_reorder_df)} rows")

In [186]:
updated_reorder_df = minimum_three_recommendations(reorder_df, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
print(f">>Atleast three recommendations added for data with {len(updated_reorder_df)} rows")

>>Atleast three recommendations added for data with 425 rows


# Check if updated df has any less than 2 reco rows

In [187]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(updated_reorder_df)
print(f">>Length of data with less than 2 recommendations {len(rows_with_0_to_2_recommendation)} rows")

>>Length of data with less than 2 recommendations 2 rows


In [188]:
# Add unique check column to ensure every recommendation is unique in a given row
updated_reorder_df['unique_check'] = updated_reorder_df.apply(are_values_unique, axis=1)
print(f">>Unique check column added for data with {len(updated_reorder_df)} rows")

>>Unique check column added for data with 425 rows


In [189]:
#updated_reorder_df.to_csv('unique_updated_reorder_df_min2_recos.csv')

In [190]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(updated_reorder_df)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 2125 rows, which is 5 times 425 (Prev length)


In [191]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 1717 rows


In [192]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10029049,20021209
1,10029049,11090419
2,10029049,10036270
3,10029049,10036223
5,10035501,10035502
...,...,...
2115,10276747,10058192
2116,10276747,10037717
2120,10884711,10037717
2121,10884711,10530785


In [193]:
type(nonempty_transformed_df['Primary Item Number'][1])

str

In [194]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [195]:
def filter_primary_item_numbers(df, item_numbers):
    # Filter the DataFrame to include only rows where 'Primary Item Number' is in the item_numbers list
    filtered_df = df[df['Primary Item Number'].isin(item_numbers)]
    return filtered_df

# Define the list of item numbers
item_numbers = ['20121463', '20121758'


]

nonempty_transformed_filtered_df = filter_primary_item_numbers(nonempty_transformed_df, item_numbers)
print(nonempty_transformed_filtered_df)

Empty DataFrame
Columns: [Primary Item Number, Related Item Number]
Index: []


In [196]:
nonempty_transformed_df.to_csv('results_mar25/3m_pkg.csv',index=False)

In [75]:
# Add item descriptions
updated_reorder_df_with_desc=add_descriptions(item_desc_df,updated_reorder_df)
print(f">>Added item descriptions to data containing {len(updated_reorder_df_with_desc)} rows")

>>Added item descriptions to data containing 96 rows
