In [6]:
import pandas as pd
from data_retrieval import connect_db, fetch_trx_data, fetch_cat_data, fetch_cat1_data, fetch_item_descriptions, fetch_private_label_data
from data_processing import multi_aggregate_data, apply_custom_calculations
from data_analysis import create_baskets, create_baskets_365, create_baskets_365_qty, flatten_baskets
from modelling import create_cooccurrence_matrix_with_recommendations, create_pair_frequency_matrix,add_freq
from modeling_2 import replace_low_values,get_top_5_cat3_items, get_top_5_cat1_items, create_cat3_to_top_item_map, replace_recommendations
from modeling_3 import replace_item_cde_with_cat3_set,map_and_add_recommendations, map_and_add_recommendations_cat1, transform_recommendations, filter_non_print_segment, reorder_recommendations, add_descriptions
from modeling_4 import remove_duplicate_and_self_references, shift_recommendations_left, copy_rows_with_0_or_1_recommendation, add_recommendations2, are_values_unique, remove_empty_related_items

In [7]:
# Sittun's code review edit suggestions

#import modelling as m1
#import modeling_2 as m2

In [8]:
#m2.replace_low_values()

In [9]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [10]:
# Load items 
df = pd.read_excel('data/AB Scope 5.22.xlsx', usecols=['Item Number'])
print(f">> Loaded {len(df)} e-commerce items with their item_cde")

>> Loaded 4068 e-commerce items with their item_cde


In [11]:
# Rename the columns
df.rename(columns={'Item Number': 'item_cde'}, 
          inplace=True)
print(f">> Renamed the columns to {df.columns.tolist()}")

>> Renamed the columns to ['item_cde']


In [12]:
# connecting to db
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [13]:
# getting transaction data from db
trx_df = fetch_trx_data(conn, df)
print(f">> Fetched the transaction data containing {len(trx_df)} rows")

>> Fetched the transaction data containing 1308581 rows


In [14]:
#Get only the non Print df entries
trx_df = filter_non_print_segment(trx_df)
print(f">> Sixe of data for non print  data is Length: {len(trx_df)}")

>> Sixe of data for non print  data is Length: 1124971


In [15]:
# aggregate data at bill-to
aggregated_df = multi_aggregate_data(trx_df)
print(f">> Aggregated data at bill-to level.Length: {len(aggregated_df)} rows")

>> Aggregated data at bill-to level.Length: 15370 rows


In [16]:
# custom column calculation for avg time interval
custom_calculated_df = apply_custom_calculations(aggregated_df)
print(f">> Calculated custom columns.Length: {len(custom_calculated_df)} rows")

>> Calculated custom columns.Length: 15370 rows


In [17]:
# make baskets for all possible starting date
baskets_df_365_qty = create_baskets_365_qty(custom_calculated_df)
print(f">> Made baskets with size: {len(baskets_df_365_qty)} baskets")

>> Made baskets with size: 15370 baskets


In [18]:
# flatten the baskets
flattened_basket_list_365 = flatten_baskets(baskets_df_365_qty)
print(f">> Flattened baskets. Length: {len(flattened_basket_list_365)}")

>> Flattened baskets. Length: 273009


In [19]:
# make item level recommendations
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(flattened_basket_list_365)
print(f">> Made item level recommendations for: {len(recommendation_df)} items")

100%|██████████| 273009/273009 [00:12<00:00, 21788.65it/s]
100%|██████████| 2963/2963 [00:06<00:00, 459.55it/s] 


>> Made item level recommendations for: 2963 items


In [20]:
# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(flattened_basket_list_365)
print(f">> Created pair wise frequency matrix of Length: {len(pair_freq_matrix)}")

>> Created pair wise frequency matrix of Length: 3061


In [21]:
#add freq
recommendation_df=add_freq(recommendation_df,pair_freq_matrix)
print(f">>Added freq to the df {len(recommendation_df)}")

>>Added freq to the df 2963


In [22]:
#Replace recommendations with low basket count with the top cat 3 item
recommendation_df=replace_low_values(recommendation_df,100)
print(f">>Replaced items of baskets with low thresholds: {len(recommendation_df)} rows")

>>Replaced items of baskets with low thresholds: 2963 rows


In [23]:
# Get the cat3 top 5 items
top_5_items_cat3=get_top_5_cat3_items(trx_df)
print(f">> Got cat3 top 5 items: {len(top_5_items_cat3)}")

>> Got cat3 top 5 items: 231


In [24]:
# Get the cat1 top 5 items
top_5_items_cat1=get_top_5_cat1_items(trx_df)
print(f">> Got cat1 top 5 items: {len(top_5_items_cat1)}")

>> Got cat1 top 5 items: 25


In [25]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [26]:
# getting cat3 data from db
cat3_df = fetch_cat_data(conn, df)
print(f">> Fetched the cat3 data containing {len(cat3_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [27]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [28]:
# getting cat1 data from db
cat1_df = fetch_cat1_data(conn, df)
print(f">> Fetched the cat1 data containing {len(cat1_df)} rows")

%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 

In [29]:
# Map item to top item in cat3
item_to_cat3_top_map= create_cat3_to_top_item_map(df, cat3_df, top_5_items_cat3)
print(f">> Map items from item to top items in cat3 {len(item_to_cat3_top_map)} rows")

>> Map items from item to top items in cat3 3492 rows


In [30]:
# Replace items with low freq with top items
new_recommendation_df=replace_recommendations(recommendation_df,item_to_cat3_top_map)
print(f">> Replacde items with low freq with top items with {len(new_recommendation_df)} rows")

>> Replacde items with low freq with top items with 2963 rows


# Cat 3 operations

In [31]:
# Make cat3 level baskets for co-occurance
cat3_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat3_df)
print(f">> Made cat3 level baskets for co-occurance size: {len(cat3_basket_365)}")

>> Made cat3 level baskets for co-occurance size: 273009


In [32]:
# Create co-occurance df for cat3 level
cat3_co_occurrence_matrix, cat3_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat3_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat3_recommendation_df)}")

100%|██████████| 273009/273009 [00:04<00:00, 66259.72it/s]
100%|██████████| 227/227 [00:00<00:00, 1064.15it/s]

>> Create co-occurance df for cat3 level df Length: 227





In [33]:
# Map cat3 and add cat3 level recos
new_cat3_recommendation_df = map_and_add_recommendations(df, cat3_df, new_recommendation_df, cat3_recommendation_df,top_5_items_cat3)
print(f">> Product+cat3 level df of Length: {len(new_cat3_recommendation_df)} rows")

>> Product+cat3 level df of Length: 3485 rows


In [34]:
#new_cat3_recommendation_df.to_csv('new_cat3_recommendation_df_with_item_cde.csv',index=False)

# Cat 1 Operations

In [35]:
# Make cat1 level baskets for co-occurance
cat1_basket_365=replace_item_cde_with_cat3_set(flattened_basket_list_365, cat1_df)
print(f">> Made cat1 level baskets for co-occurance size: {len(cat1_basket_365)}")

>> Made cat1 level baskets for co-occurance size: 273009


In [36]:
# Create co-occurance df for cat1 level
cat1_co_occurrence_matrix, cat1_recommendation_df=create_cooccurrence_matrix_with_recommendations(cat1_basket_365, top_n=5)
print(f">> Create co-occurance df for cat3 level df Length: {len(cat1_recommendation_df)}")

  0%|          | 0/273009 [00:00<?, ?it/s]

100%|██████████| 273009/273009 [00:01<00:00, 187148.60it/s]
100%|██████████| 25/25 [00:00<00:00, 1596.47it/s]

>> Create co-occurance df for cat3 level df Length: 25





In [37]:
# Map cat1 and add cat3 level recos
new_cat1_recommendation_df = map_and_add_recommendations_cat1(df, cat1_df, new_recommendation_df, cat1_recommendation_df, top_5_items_cat1)
print(f">> Product+cat1+cat3 level df of Length: {len(new_cat1_recommendation_df)} rows")

>> Product+cat1+cat3 level df of Length: 3502 rows


In [38]:
#transformed_df.to_csv('Transformed_result_non_print.csv', index=False)

# Reorder private items

In [39]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [40]:
# Get private label information
private_l_df = fetch_private_label_data(conn, df)
print(f">> Fetched the private label data containing {len(private_l_df)} rows")

>> Fetched the private label data containing 4057 rows


In [41]:
private_l_df['private_label_sw'].value_counts ()

private_label_sw
N    2690
Y    1367
Name: count, dtype: int64

In [42]:
# Re-order private items to top
reorder_private_df = reorder_recommendations(new_cat1_recommendation_df, private_l_df)
print(f">> Re-ordered the private label data containing {len(reorder_private_df)} rows")

>> Re-ordered the private label data containing 3502 rows


In [43]:
conn = connect_db()
print(">> Connected to database")

>> Connected to database


In [44]:
# Get item descriptions
item_desc_df=fetch_item_descriptions(conn,df)
print(f">>Fetched item descriptions data containing {len(reorder_private_df)} rows")

>>Fetched item descriptions data containing 3502 rows


In [45]:
#Select columns
reorder_df=reorder_private_df[['item_cde','Recommendation 1','Recommendation 2','Recommendation 3','Recommendation 4','Recommendation 5']]

In [46]:
# Remove duplicate recommendations and self-references
reorder_df_cleaned = remove_duplicate_and_self_references(reorder_df)
print(f">>Removed duplicate recommendations and self-references data containing {len(reorder_df_cleaned)} rows")

>>Removed duplicate recommendations and self-references data containing 3502 rows


In [47]:
#Shift non empty recommendations to left
reorder_shift_left=shift_recommendations_left(reorder_df_cleaned)
print(f">>Shift non empty recommendations data containing {len(reorder_shift_left)} rows")

>>Shift non empty recommendations data containing 3502 rows


# Atleast 2 recommendation (Katie's request)

In [48]:
#Check # of rows with less trhan 2 recommendation
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(reorder_shift_left)
print(f">>Items with less than 2 recommendations are Length: {len(rows_with_0_or_1_recommendation)} rows")

>>Items with less than 2 recommendations are Length: 125 rows


In [49]:
rows_with_0_or_1_recommendation

Unnamed: 0,item_cde,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
175,10069277,10112665,,,,
232,10094247,10943393,,,,
234,10095252,10061675,,,,
242,10095914,10746874,,,,
325,10128916,10775665,,,,
...,...,...,...,...,...,...
2906,20079294,,,,,
2909,20079976,20064066,,,,
2943,20092990,10807250,,,,
2945,20092995,10058192,,,,


In [50]:
#Make atleast 2 recommendations
updated_reorder_df = add_recommendations2(reorder_shift_left, cat3_df, top_5_items_cat3, cat1_df, top_5_items_cat1)
print(f">>Atleast two recommendations added for data with {len(updated_reorder_df)} rows")

>>Atleast two recommendations added for data with 3502 rows


# Check if updated df has any less than 2 reco rows

In [51]:
rows_with_0_or_1_recommendation = copy_rows_with_0_or_1_recommendation(updated_reorder_df)
print(f">>Length of data with less than 2 recommendations {len(rows_with_0_or_1_recommendation)} rows")

>>Length of data with less than 2 recommendations 0 rows


In [52]:
# Add unique check column to ensure every recommendation is unique in a given row
updated_reorder_df['unique_check'] = updated_reorder_df.apply(are_values_unique, axis=1)
print(f">>Unique check column added for data with {len(updated_reorder_df)} rows")

>>Unique check column added for data with 3502 rows


In [53]:
#updated_reorder_df.to_csv('unique_updated_reorder_df_min2_recos.csv')

In [54]:
# Transform the recommendations row-wise
transformed_df = transform_recommendations(updated_reorder_df)
print(f">> Transform the recommendations, new Length {len(transformed_df)} rows, which is 5 times {len(new_cat1_recommendation_df)} (Prev length)")

>> Transform the recommendations, new Length 17510 rows, which is 5 times 3502 (Prev length)


In [55]:
# Remove empty items from related items column
nonempty_transformed_df=remove_empty_related_items(transformed_df)
print(f">>Data after removing empty related items have {len(nonempty_transformed_df)} rows")

>>Data after removing empty related items have 15100 rows


In [56]:
nonempty_transformed_df

Unnamed: 0,Primary Item Number,Related Item Number
0,10000633,20017743
1,10000633,10568358
5,10012415,10012427
6,10012415,10357365
7,10012415,10098323
...,...,...
17505,20103409,10769252
17506,20103409,20076005
17507,20103409,10058192
17508,20103409,10553070


In [57]:
type(nonempty_transformed_df['Primary Item Number'][1])

str

In [58]:
#nonempty_transformed_df.to_csv('nonprint_minimum2_recommendations.csv', index=False)

## Result needed for new items only

In [59]:
def filter_primary_item_numbers(df, item_numbers):
    # Filter the DataFrame to include only rows where 'Primary Item Number' is in the item_numbers list
    filtered_df = df[df['Primary Item Number'].isin(item_numbers)]
    return filtered_df

# Define the list of item numbers
item_numbers = ["11148094", "10787426", "10844688", "10844729", "20103861", "10844710", "20003326", "11141927", "20003476", "20003479", "10645366", "10645073", "20009689", "20059769", "20059771", "20059773", "10280339", "10523012", "10667701", "0032881", "10683337", "20023036", "20030097", "110957719", "10805248", "10733145", "10903857", "20046951", "20096682", "20096689", "20096700", "20096711", "20103409", "20071296", "11114160", "10790906", "10391637", "10911305", "10911952", "10845703", "10696366", "10569852", "11147726", "710996009", "10620638", "10355009", "10833658", "10501355", "20037784", "20014557", "20046242", "20046068", "20045610", "20045342", "111126675", "10878068"]
nonempty_transformed_filtered_df = filter_primary_item_numbers(nonempty_transformed_df, item_numbers)
print(nonempty_transformed_filtered_df)

      Primary Item Number Related Item Number
2285             10280339            10099538
2286             10280339            10389081
2287             10280339            10099522
2288             10280339            10032876
2289             10280339            10631239
...                   ...                 ...
17505            20103409            10769252
17506            20103409            20076005
17507            20103409            10058192
17508            20103409            10553070
17509            20103409            10061675

[207 rows x 2 columns]


In [60]:
nonempty_transformed_filtered_df.to_csv('result_new_items_11_4.csv',index=False)

In [None]:
# Add item descriptions
updated_reorder_df_with_desc=add_descriptions(item_desc_df,updated_reorder_df)
print(f">>Added item descriptions to data containing {len(updated_reorder_df_with_desc)} rows")

>>Added item descriptions to data containing 3394 rows
