In [5]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_item_descriptions, fetch_private_label_data, fetch_sustainability_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets, create_baskets_365, create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_pair_frequency_matrix,add_freq
from modeling_2 import replace_low_values,get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map, replace_recommendations
from modeling_3 import filter_mfg_name, replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, filter_fs_segment, reorder_recommendations, add_descriptions
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, copy_rows_with_0_to_2_recommendation, copy_rows_with_0_or_1_recommendation, add_recommendations2, minimum_three_recommendations, are_values_unique, remove_empty_related_items

In [6]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [7]:
#m2.replace_low_values()

In [8]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [9]:
# Load items 
df = pd.read_excel('data/AB_Scope_latest.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4284 e-commerce items with their item_cde


In [10]:
df

Unnamed: 0,Item Number
0,10735678
1,10755108
2,10735704
3,10735679
4,10735697
...,...
4279,20087303
4280,20087304
4281,20087154
4282,20088556


In [11]:
# Rename the columns
df.rename(columns={'Item Number': 'item_cde'}, 
          inplace=True)
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


In [12]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [13]:
# getting transaction data from db
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 1338033 rows


## If Manufacturer filteration is needed (when scope not given)

In [14]:
trx_df_new = trx_df[trx_df['mfg_name'].str.contains('3M', na=False)]


In [15]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

In [16]:
#Get only the non Print df entries
trx_df_nonpr = filter_fs_segment(trx_df_new)
print(f">> Sixe of data for non print  data is Length: {len(trx_df_nonpr)}")

>> Sixe of data for non print  data is Length: 4976


In [17]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_df_nonpr)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 182 rows


In [18]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 182 rows


In [19]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 182 baskets


In [20]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 3056


In [21]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 3056/3056 [00:00<00:00, 525513.22it/s]
100%|██████████| 8/8 [00:00<00:00, 2687.58it/s]

>> Made item level recommendations for: 8 items





In [22]:
# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(flattened_basket_list_365)
print(f">> Created pair wise frequency matrix of Length: {len(pair_freq_matrix)}")

>> Created pair wise frequency matrix of Length: 11


In [23]:
#add freq
recommendation_df=add_freq(recommendation_df,pair_freq_matrix)
print(f">>Added freq to the df {len(recommendation_df)}")

>>Added freq to the df 8


In [24]:
recommendation_df['item_cde'][0]


'10261989'

In [25]:
#Replace recommendations with low basket count with the top cat 3 item
recommendation_df=replace_low_values(recommendation_df,100)
print(f">>Replaced items of baskets with low thresholds: {len(recommendation_df)} rows")

>>Replaced items of baskets with low thresholds: 8 rows


## Top cat3 and cat1 items within trx

In [26]:
# Get the cat3 top 5 items
top_5_items_cat3=get_top_5_cat3_items(trx_df_new)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 29


In [27]:
# Get the cat1 top 5 items
top_5_items_cat1=get_top_5_cat1_items(trx_df_new)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 8


In [28]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [29]:
# getting cat3 data from db
conn = connect_db()

cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [30]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [31]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [32]:
# Map item to top item in cat3
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 501 rows


In [33]:
item_to_cat3_top_map

{10863215: '10480459',
 10864461: '11152963',
 10563810: '10480459',
 10631239: '11152963',
 10540759: '11152963',
 11114344: '10480459',
 10261989: '10480459',
 10262110: '11152963',
 10542533: '10480459',
 11108998: '10480459',
 10533874: '10480459',
 10787413: '10480459',
 11153840: '10480459',
 10787426: '10480459',
 11079423: '10480459',
 10844688: '10480459',
 10844710: '10480459',
 10844729: '10480459',
 10735257: '10480459',
 10161608: '10480459',
 10059527: '10480459',
 10059531: '10480459',
 10043023: '10480459',
 11154822: '10480459',
 11136701: '10480459',
 11152450: '10480459',
 11152963: '11152963',
 10107339: '11152963',
 10733559: '10480459',
 11119999: '10480459',
 10089883: '11152963',
 10360972: '10480459',
 10999702: '10480459',
 10770803: '10480459',
 10770896: '10480459',
 10728849: '10480459',
 10734843: '10480459',
 10113721: '10480459',
 10731351: '11152963',
 10717864: '10480459',
 10715626: '11152963',
 10631935: '10480459',
 10094247: '10480459',
 10980780: 

In [34]:
# Replace items with low freq with top items

#new_recommendation_df=replace_recommendations(recommendation_df,item_to_cat3_top_map)
#print(f">> Replacde items with low freq with top items with {len(new_recommendation_df)} rows")

In [35]:
recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10261989,10480459,10771731.0,,,,CAT,CAT,CAT,CAT,CAT
1,10262110,10480459,10771731.0,,,,CAT,CAT,CAT,CAT,CAT
2,10480459,10771731,10262110.0,10261989.0,,,CAT,CAT,CAT,CAT,CAT
3,10771731,11152963,10480459.0,10771735.0,10261989.0,10262110.0,508.0,CAT,CAT,CAT,CAT
4,10771735,10771731,11152963.0,,,,CAT,CAT,CAT,CAT,CAT
5,10774827,11140473,,,,,CAT,CAT,CAT,CAT,CAT
6,11140473,10774827,,,,,CAT,CAT,CAT,CAT,CAT
7,11152963,10771731,10771735.0,,,,508.0,CAT,CAT,CAT,CAT


# Cat 3 operations

In [36]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 3056


In [37]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat3_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 3056/3056 [00:00<00:00, 248349.09it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]

>> Create co-occurance df for cat3 level df Length: 3





In [38]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(df, cat3_df, recommendation_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 145 rows


In [39]:
new_cat3_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10261989,10480459,10771731,,,,CAT,CAT,CAT,CAT,CAT
1,10262110,10480459,10771731,,,,CAT,CAT,CAT,CAT,CAT
2,10480459,10771731,10262110,10261989,,,CAT,CAT,CAT,CAT,CAT
3,10771731,11152963,10480459,10771735,10261989,10262110,508.0,CAT,CAT,CAT,CAT
4,10771735,10771731,11152963,,,,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
140,10820264,10771731,11152963,,,,,,,,
141,10854780,10771731,11152963,,,,,,,,
142,10818067,10771731,11152963,,,,,,,,
143,10817092,10771731,10480459,,,,,,,,


In [40]:
#new_cat3_recommendation_df.to_csv('new_cat3_recommendation_df_with_item_cde.csv',index=False)

# Cat 1 Operations

In [41]:
# Make cat1 level baskets for co-occurance
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 3056


In [42]:
# Create co-occurance df for cat1 level
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat1_recommendation_df)}")

100%|██████████| 3056/3056 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 123.84it/s]

>> Create co-occurance df for cat3 level df Length: 2





In [43]:
cat1_recommendation_df

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
Chemicals,Janitorial Supplies,,,,
Janitorial Supplies,Chemicals,,,,


In [44]:
# Map cat1 and add cat3 level recos
new_cat1_recommendation_df = map_and_add_recommendations_cat1(df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 343 rows


In [45]:
new_cat1_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10261989,10480459,10771731,,,,CAT,CAT,CAT,CAT,CAT
1,10262110,10480459,10771731,,,,CAT,CAT,CAT,CAT,CAT
2,10480459,10771731,10262110,10261989,,,CAT,CAT,CAT,CAT,CAT
3,10771731,11152963,10480459,10771735,10261989,10262110,508.0,CAT,CAT,CAT,CAT
4,10771735,10771731,11152963,,,,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
338,20074842,10771731,,,,,,,,,
339,20074877,11152963,,,,,,,,,
340,20078371,10771731,,,,,,,,,
341,20034349,11152963,,,,,,,,,


# Reorder private items

In [46]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [47]:
# Get private label information
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4284 rows


In [48]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    2568
Y    1715
Name: count, dtype: int64

In [49]:
# Re-order private items to top
reorder_private_df = reorder_recommendations(new_cat1_recommendation_df, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 343 rows


In [50]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [51]:
# Get item descriptions
item_desc_df=fetch_item_descriptions(conn,df)
print(f">>Fetched item descriptions data containing {len(reorder_private_df)} rows")

>>Fetched item descriptions data containing 343 rows


In [52]:
#Select columns
reorder_df=reorder_private_df[['item_cde','Recommendation 1','Recommendation 2','Recommendation 3','Recommendation 4','Recommendation 5']]

In [53]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references(reorder_df)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 343 rows


In [54]:
#Shift non empty recommendations to left
reorder_shift_left=shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 343 rows


# Atleast 2 recommendation (Katie's request)

In [55]:
#Check # of rows with less trhan 2 recommendation
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(reorder_shift_left)
print(f">>Items with less than 2 recommendations are Length: {len(rows_with_0_or_1_recommendation)} rows")

>>Items with less than 2 recommendations are Length: 2 rows


In [56]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(reorder_shift_left)
print(f">>Items with less than 3 recommendations are Length: {len(rows_with_0_to_2_recommendation)} rows")

>>Items with less than 3 recommendations are Length: 204 rows


In [57]:
#Make atleast 2 recommendations
#updated_reorder_df = add_recommendations2(reorder_shift_left, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
#print(f">>Atleast two recommendations added for data with {len(updated_reorder_df)} rows")

In [58]:
updated_reorder_df = minimum_three_recommendations(reorder_df, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
print(f">>Atleast three recommendations added for data with {len(updated_reorder_df)} rows")

>>Atleast three recommendations added for data with 343 rows


# Check if updated df has any less than 2 reco rows

In [59]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(updated_reorder_df)
print(f">>Length of data with less than 2 recommendations {len(rows_with_0_to_2_recommendation)} rows")

>>Length of data with less than 2 recommendations 0 rows


In [60]:
# Add unique check column to ensure every recommendation is unique in a given row
updated_reorder_df['unique_check'] = updated_reorder_df.apply(are_values_unique, axis=1)
print(f">>Unique check column added for data with {len(updated_reorder_df)} rows")

>>Unique check column added for data with 343 rows


In [61]:
#updated_reorder_df.to_csv('unique_updated_reorder_df_min2_recos.csv')

In [62]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(updated_reorder_df)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 1715 rows, which is 5 times 343 (Prev length)


In [63]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 696 rows


In [64]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10261989,10480459
1,10261989,10771731
2,10261989,11152963
5,10262110,10480459
6,10262110,10771731
...,...,...
1702,20078371,11152963
1705,20034349,11152963
1707,20034349,10771731
1710,20087154,11152963


In [65]:
type(nonempty_transformed_df['Primary Item Number'][1])

str

In [66]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [67]:
def filter_primary_item_numbers(df, item_numbers):
    # Filter the DataFrame to include only rows where 'Primary Item Number' is in the item_numbers list
    filtered_df = df[df['Primary Item Number'].isin(item_numbers)]
    return filtered_df

# Define the list of item numbers
item_numbers = ['20121463', '20121758'


]

nonempty_transformed_filtered_df = filter_primary_item_numbers(nonempty_transformed_df, item_numbers)
print(nonempty_transformed_filtered_df)

Empty DataFrame
Columns: [Primary Item Number, Related Item Number]
Index: []


In [69]:
nonempty_transformed_df.to_csv('results_mar25/3m_fs.csv',index=False)

In [75]:
# Add item descriptions
updated_reorder_df_with_desc=add_descriptions(item_desc_df,updated_reorder_df)
print(f">>Added item descriptions to data containing {len(updated_reorder_df_with_desc)} rows")

>>Added item descriptions to data containing 96 rows
