In [1]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_item_descriptions, fetch_private_label_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets, create_baskets_365, create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_pair_frequency_matrix,add_freq
from modeling_2 import replace_low_values,get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map, replace_recommendations
from modeling_3 import filter_mfg_name, replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, filter_non_print_segment, reorder_recommendations, add_descriptions
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, copy_rows_with_0_to_2_recommendation, copy_rows_with_0_or_1_recommendation, add_recommendations2, minimum_three_recommendations, are_values_unique, remove_empty_related_items

In [2]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [3]:
#m2.replace_low_values()

In [4]:
import warnings

# Suppress all warnings for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and
# SettingWithCopy-style warnings that the pipeline helpers may emit —
# consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [5]:
# Load items 
df = pd.read_excel('data/GOJO AB Product Scope.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 97 e-commerce items with their item_cde


In [6]:
df

Unnamed: 0,Item Number
0,10570540
1,10561388
2,10102473
3,10664064
4,10561435
...,...
92,20090396
93,20090403
94,20090407
95,20090409


In [7]:
# Rename the columns.
# Reassign the result instead of using inplace=True, so this cell does not
# mutate the frame object that the previous cell displayed.
df = df.rename(columns={'Item Number': 'item_cde'})
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


In [8]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [9]:
# getting transaction data from db
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 70645 rows


## If manufacturer filtering is needed (when no item scope is given)

In [10]:
#trx_df_new = trx_df[trx_df['mfg_name'].str.contains('3M', na=False)]


In [11]:
# OPTIONAL: Apply if mfg filter needed
#mfg_df=filter_mfg_name(trx_df, '')

In [12]:
# Keep only the non-print segment transactions
trx_df_nonpr = filter_non_print_segment(trx_df)
# Report the size of the FILTERED frame. The original printed len(trx_df),
# i.e. the unfiltered row count, so the effect of the filter was never visible.
print(f">> Size of data for non print  data is Length: {len(trx_df_nonpr)}")

>> Sixe of data for non print  data is Length: 70645


In [13]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_df_nonpr)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 1995 rows


In [14]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 1995 rows


In [15]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 1995 baskets


In [16]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 22094


In [17]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 22094/22094 [00:00<00:00, 458110.85it/s]
100%|██████████| 89/89 [00:00<00:00, 3315.60it/s]

>> Made item level recommendations for: 89 items





In [19]:
# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(flattened_basket_list_365)
print(f">> Created pair wise frequency matrix of Length: {len(pair_freq_matrix)}")

>> Created pair wise frequency matrix of Length: 90


In [21]:
#add freq
recommendation_df=add_freq(recommendation_df,pair_freq_matrix)
print(f">>Added freq to the df {len(recommendation_df)}")

>>Added freq to the df 89


In [22]:
recommendation_df['item_cde'][0]


'10043489'

In [23]:
# Replace recommendations with low basket count with the top cat 3 item.
# 100 is the threshold handed to replace_low_values; in the displayed frame the
# affected frequency cells read 'CAT' — presumably the marker for values below
# the threshold (TODO confirm against modeling_2.replace_low_values).
# NOTE(review): the cell rebinds recommendation_df to its own output, so
# re-running it applies the replacement to already-replaced data.
recommendation_df=replace_low_values(recommendation_df,100)
print(f">>Replaced items of baskets with low thresholds: {len(recommendation_df)} rows")

>>Replaced items of baskets with low thresholds: 89 rows


In [24]:
# Get the cat3 top 5 items
# NOTE(review): this reads the unfiltered trx_df, whereas the baskets above are
# built from trx_df_nonpr (non-print segment only) — confirm that ranking top
# items over all segments is intended.
top_5_items_cat3=get_top_5_cat3_items(trx_df)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 8


In [25]:
# Get the cat1 top 5 items
# NOTE(review): this reads the unfiltered trx_df, whereas the baskets above are
# built from trx_df_nonpr (non-print segment only) — confirm that ranking top
# items over all segments is intended.
top_5_items_cat1=get_top_5_cat1_items(trx_df)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 4


In [26]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [27]:
# getting cat3 data from db
# Reuse the connection opened by the preceding cell. The extra connect_db()
# call that used to sit here rebound `conn` and dropped the connection that
# had just been opened without ever using it.
cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
>> Fetched the cat3 data containing 97 rows


In [28]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [29]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
>> Fetched the cat1 data containing 97 rows


In [31]:
# Map item to top item in cat3
# NOTE(review): in the displayed result the keys are ints (taken from df's
# item_cde) while the values are strings, and recommendation_df holds item_cde
# as str — a lookup with a str key would miss. Confirm how consumers of this
# map normalise the key type.
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 97 rows


In [32]:
item_to_cat3_top_map

{10570540: '10568358',
 10561388: '10943252',
 10102473: '10568358',
 10664064: '10568358',
 10561435: '10568358',
 10561454: '10943252',
 10561422: '10943252',
 11157487: '10568358',
 11160268: '10568358',
 11141927: '10943252',
 11157483: '10568358',
 10562613: '10943252',
 10559422: '10943252',
 10559699: '10943252',
 11148094: '11148094',
 11109702: '10568358',
 10787413: '10844710',
 11126675: '11126675',
 10787426: '10844710',
 11153989: '10943252',
 10844688: '10844710',
 10844710: '10844710',
 10844729: '10844710',
 10499244: '10943252',
 10576587: '10568358',
 10105196: '10748903',
 10583076: '10568358',
 11100207: '10943252',
 11155222: '10943252',
 11155231: '10568358',
 11155237: '10568358',
 10580144: '10943252',
 11133601: '10568358',
 10466265: '10568358',
 11048293: '10568358',
 11133627: '20017743',
 11134604: '20017743',
 10495917: '10943252',
 11108424: '10568358',
 10431273: '10568358',
 10843315: '10568358',
 10601450: '10943252',
 10346499: '10568358',
 10729064: 

In [31]:
# Replace items with low freq with top items

#new_recommendation_df=replace_recommendations(recommendation_df,item_to_cat3_top_map)
#print(f">> Replacde items with low freq with top items with {len(new_recommendation_df)} rows")

In [33]:
recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10043489,11157483,10466265,10062551,10943252,10559422,CAT,CAT,CAT,CAT,CAT
1,10062528,10355644,10062551,11155222,10043489,10572500,CAT,CAT,CAT,CAT,CAT
2,10062551,10942965,10844710,11157483,11160268,10466265,CAT,CAT,109.0,CAT,CAT
3,10076998,10572500,10572501,10580144,11133627,10729064,1606.0,544.0,573.0,571.0,234.0
4,10102473,11157483,11157487,11160268,10431273,10118393,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
84,20090387,20090317,20090322,10573468,10561388,10568358,CAT,CAT,CAT,CAT,CAT
85,20090394,20090212,20090387,20090307,10062551,20090322,CAT,CAT,CAT,CAT,CAT
86,20090403,20090322,20090387,10943252,,,CAT,CAT,CAT,CAT,CAT
87,20090407,20090387,20090317,20090380,20090322,10943252,CAT,CAT,CAT,CAT,CAT


# Cat 3 operations

In [34]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 22094


In [35]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat3_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 22094/22094 [00:00<00:00, 681695.12it/s]
100%|██████████| 8/8 [00:00<00:00, 2634.20it/s]

>> Create co-occurance df for cat3 level df Length: 8





In [36]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(df, cat3_df, recommendation_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 97 rows


In [40]:
new_cat3_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10043489,11157483,10466265,10062551,10943252,10559422,CAT,CAT,CAT,CAT,CAT
1,10062528,10355644,10062551,11155222,10043489,10572500,CAT,CAT,CAT,CAT,CAT
2,10062551,10942965,10844710,11157483,11160268,10466265,CAT,CAT,109.0,CAT,CAT
3,10076998,10572500,10572501,10580144,11133627,10729064,1606.0,544.0,573.0,571.0,234.0
4,10102473,11157483,11157487,11160268,10431273,10118393,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
92,20090335,10568358,10943252,10748903,10844710,11148094,,,,,
93,20090386,10568358,10748903,10844710,20017743,11148094,,,,,
94,20090391,10568358,10748903,10844710,20017743,11148094,,,,,
95,20090396,10568358,10943252,10748903,10844710,11148094,,,,,


In [98]:
#new_cat3_recommendation_df.to_csv('new_cat3_recommendation_df_with_item_cde.csv',index=False)

# Cat 1 Operations

In [41]:
# Make cat1 level baskets for co-occurrence
# The cat3-named helper is reused here with cat1_df as the category frame;
# presumably it is generic over the category level — TODO confirm it does not
# rely on cat3-specific column names.
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 22094


In [42]:
# Create co-occurrence df for cat1 level (top 5 co-occurring categories per cat1)
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
# The message was copy-pasted from the cat3 cell and wrongly said "cat3 level";
# this frame is the cat1-level result.
print(f">> Create co-occurance df for cat1 level df Length: {len(cat1_recommendation_df)}")

100%|██████████| 22094/22094 [00:00<00:00, 965190.99it/s]
100%|██████████| 4/4 [00:00<00:00, 2424.10it/s]

>> Create co-occurance df for cat3 level df Length: 4





In [43]:
cat1_recommendation_df

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
Chemicals,Skincare,Wipers,,,
Personal Hygiene,Skincare,,,,
Skincare,Chemicals,Wipers,Personal Hygiene,,
Wipers,Skincare,Chemicals,,,


In [46]:
# Map cat1 and add cat1 level recos on top of the product+cat3 level frame
# (inputs: new_cat3_recommendation_df plus the cat1 co-occurrence result and
# the cat1 top-5 items).
new_cat1_recommendation_df = map_and_add_recommendations_cat1(df, cat1_df, new_cat3_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 97 rows


In [47]:
new_cat1_recommendation_df

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,frequency 1,frequency 2,frequency 3,frequency 4,frequency 5
0,10043489,11157483,10466265,10062551,10943252,10559422,CAT,CAT,CAT,CAT,CAT
1,10062528,10355644,10062551,11155222,10043489,10572500,CAT,CAT,CAT,CAT,CAT
2,10062551,10942965,10844710,11157483,11160268,10466265,CAT,CAT,109.0,CAT,CAT
3,10076998,10572500,10572501,10580144,11133627,10729064,1606.0,544.0,573.0,571.0,234.0
4,10102473,11157483,11157487,11160268,10431273,10118393,CAT,CAT,CAT,CAT,CAT
...,...,...,...,...,...,...,...,...,...,...,...
92,20090335,10568358,10943252,10748903,10844710,11148094,,,,,
93,20090386,10568358,10748903,10844710,20017743,11148094,,,,,
94,20090391,10568358,10748903,10844710,20017743,11148094,,,,,
95,20090396,10568358,10943252,10748903,10844710,11148094,,,,,


# Reorder private items

In [39]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [40]:
# Get private label information
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 97 rows


In [41]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    97
Name: count, dtype: int64

In [42]:
# Re-order private items to top
reorder_private_df = reorder_recommendations(new_cat1_recommendation_df, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 97 rows


In [43]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [44]:
# Get item descriptions
item_desc_df=fetch_item_descriptions(conn,df)
# Report the size of the frame that was just fetched. The original printed
# len(reorder_private_df), an unrelated frame from an earlier step.
print(f">>Fetched item descriptions data containing {len(item_desc_df)} rows")

>>Fetched item descriptions data containing 97 rows


In [45]:
# Keep only the item code and the five recommendation columns
reorder_df = reorder_private_df.loc[:, [
    'item_cde',
    'Recommendation 1',
    'Recommendation 2',
    'Recommendation 3',
    'Recommendation 4',
    'Recommendation 5',
]]

In [46]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references(reorder_df)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 97 rows


In [47]:
#Shift non empty recommendations to left
reorder_shift_left=shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 97 rows


In [108]:
# NOTE(review): ad-hoc inspection cell. The row indexer handed to .iloc is a
# one-element list wrapping the whole 'Recommendation 3' Series, which is not a
# positional indexer. The saved output (execution count 108) also shows a
# 'unique_check' column that is only added further down, so it looks stale.
# Confirm the intent — e.g. a boolean mask on 'Recommendation 3' used with
# .loc — or drop the cell.
reorder_shift_left.iloc[[reorder_shift_left['Recommendation 3']], ]

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,unique_check
0,10043489,10568358,10943252,10076998,,,True
1,10062528,10943252,10568358,,,,True
2,10062551,10568358,10844710,11157483,,,True
3,10076998,10572500,10572501,10580144,11133627,10729064,True
4,10102473,10568358,10076998,10943261,,,True
...,...,...,...,...,...,...,...
92,20090335,10844710,11148094,10062528,,,True
93,20090386,10844710,11148094,10062528,,,True
94,20090391,10844710,11148094,10062528,,,True
95,20090396,10844710,11148094,10062528,,,True


# At least 2 recommendations (Katie's request)

In [92]:
# Check the number of rows with fewer than 2 recommendations
# (rows holding 0 or 1 recommendation, as selected by the helper).
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(reorder_shift_left)
print(f">>Items with less than 2 recommendations are Length: {len(rows_with_0_or_1_recommendation)} rows")

>>Items with less than 2 recommendations are Length: 0 rows


In [49]:
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(reorder_shift_left)
print(f">>Items with less than 3 recommendations are Length: {len(rows_with_0_to_2_recommendation)} rows")

>>Items with less than 3 recommendations are Length: 34 rows


In [90]:
#Make atleast 2 recommendations
#updated_reorder_df = add_recommendations2(reorder_shift_left, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
#print(f">>Atleast two recommendations added for data with {len(updated_reorder_df)} rows")

In [51]:
# Top up every item to at least three recommendations using the cat3 / cat1
# frames and their top-5 items.
# NOTE(review): the input is reorder_df (the plain column selection), not
# reorder_shift_left (duplicates/self-references removed and shifted left),
# which is what the commented-out add_recommendations2 call in the cell above
# used. Confirm that bypassing the cleaned frame is intended.
updated_reorder_df = minimum_three_recommendations(reorder_df, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
print(f">>Atleast three recommendations added for data with {len(updated_reorder_df)} rows")

>>Atleast three recommendations added for data with 97 rows


# Check whether the updated df still has any rows with fewer than 3 recommendations

In [52]:
# Re-run the "0 to 2 recommendations" check on the topped-up frame to see how
# many items still fall short of three recommendations.
rows_with_0_to_2_recommendation = copy_rows_with_0_to_2_recommendation(updated_reorder_df)
# The helper selects rows with 0-2 recommendations, i.e. fewer than 3; the
# original message said "less than 2", which misreported what was counted.
print(f">>Length of data with less than 3 recommendations {len(rows_with_0_to_2_recommendation)} rows")

>>Length of data with less than 2 recommendations 1 rows


In [62]:
updated_reorder_df[updated_reorder_df['item_cde']=='20121738']

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,unique_check
96,20121738,10943252,10844710,,,,True


In [53]:
# Add unique check column to ensure every recommendation is unique in a given row.
# are_values_unique is applied row-wise (axis=1) over every column of the frame.
# NOTE(review): the column is added to updated_reorder_df in place, so on a
# re-run the existing 'unique_check' value is part of each row passed to
# are_values_unique — confirm the helper tolerates that, or compute the check
# on the recommendation columns only.
updated_reorder_df['unique_check'] = updated_reorder_df.apply(are_values_unique, axis=1)
print(f">>Unique check column added for data with {len(updated_reorder_df)} rows")

>>Unique check column added for data with 97 rows


In [194]:
#updated_reorder_df.to_csv('unique_updated_reorder_df_min2_recos.csv')

In [54]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(updated_reorder_df)
# Compare against the frame that was actually transformed. The original took
# the "previous length" from new_cat1_recommendation_df, an earlier frame in
# the lineage, which only matches by coincidence of row counts.
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(updated_reorder_df)} (Prev length)")

>> Transform the recommendations, new Length 485 rows, which is 5 times 97 (Prev length)


In [55]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 330 rows


In [56]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10043489,10568358
1,10043489,10943252
2,10043489,10076998
5,10062528,10943252
6,10062528,10568358
...,...,...
475,20090396,10844710
476,20090396,11148094
477,20090396,10062528
480,20121738,10943252


In [198]:
type(nonempty_transformed_df['Primary Item Number'][1])

str

In [199]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [57]:
def filter_primary_item_numbers(df, item_numbers):
    """Return the rows of ``df`` whose 'Primary Item Number' is in ``item_numbers``.

    Args:
        df: DataFrame with a 'Primary Item Number' column.
        item_numbers: Collection of item numbers to keep. Matching is by exact
            value, so the element type must match the column's values.

    Returns:
        The matching rows of ``df``, with original index labels and column
        order preserved.
    """
    is_wanted = df['Primary Item Number'].isin(item_numbers)
    return df.loc[is_wanted]

# Primary items for which results are required
item_numbers = ["20121738"]

# Restrict the long-format recommendations to those primary items and show them
nonempty_transformed_filtered_df = filter_primary_item_numbers(
    nonempty_transformed_df,
    item_numbers,
)
print(nonempty_transformed_filtered_df)

    Primary Item Number Related Item Number
480            20121738            10943252
481            20121738            10844710


In [136]:
nonempty_transformed_filtered_df.to_csv('Gojo_results_2.csv',index=False)

In [75]:
# Add item descriptions
# NOTE(review): the saved output reports 96 rows, one fewer than the 97 rows
# reported for updated_reorder_df — presumably an item without a description
# is dropped inside add_descriptions (TODO confirm which item and whether the
# loss is acceptable). This frame is produced after the CSV export above and
# is not written anywhere in the visible cells.
updated_reorder_df_with_desc=add_descriptions(item_desc_df,updated_reorder_df)
print(f">>Added item descriptions to data containing {len(updated_reorder_df_with_desc)} rows")

>>Added item descriptions to data containing 96 rows
