In [1]:
import pandas as pd
all_sales_desc=pd.read_csv("all_sales_202401251719.csv")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  all_sales_desc=pd.read_csv("all_sales_202401251719.csv")


# Function for baskets grouping and product-wise association

In [2]:
def calculate_avg_time_interval(row):
    if row['so_key_frequency'] == 1:
        return 1  # Set avg_time_interval = 1 for customers who ordered only 1 item
    else:
        return row['time_span_min_max'] / row['so_key_frequency']

In [3]:
def create_date_to_item_dict(row):
    return dict(zip(row['invc_date'], row['item_cde']))

In [34]:
from datetime import timedelta

def create_baskets(row):
    baskets = []
    dates_items = row['invc_date_TO_item_cde']
    avg_interval = row['avg_interval']

    # Extract and sort the dates to ensure they are processed in chronological order
    dates = sorted(dates_items.keys())
    
    i = 0
    while i < len(dates):
        current_date = dates[i]
        current_basket = set(dates_items[current_date])  # Use a set to avoid duplicates
        
        # Calculate the end of the current interval
        interval_end = current_date + timedelta(days=avg_interval)
        
        # Move to the next date to check if it falls within the current interval
        i += 1
        while i < len(dates) and dates[i] <= interval_end:
            # Add items to the current basket since they are within the range
            current_basket.update(dates_items[dates[i]])
            i += 1  # Move to the next date
        
        # Add the current basket to the list of baskets, converting it back to a list
        baskets.append(list(current_basket))
    
    return baskets

In [32]:
def create_baskets_365(row):
    baskets = []
    dates_items = row['invc_date_TO_item_cde']
    avg_interval = round(row['avg_interval'])  # Round to nearest whole number

    # Convert datetime dates to day of year (1-365)
    day_to_items = {}
    for date, items in dates_items.items():
        day_of_year = date.timetuple().tm_yday
        if day_of_year in day_to_items:
            day_to_items[day_of_year].update(items)
        else:
            day_to_items[day_of_year] = set(items)

    # Iterate through all 365 days
    for day in range(1, 366):
        if day in day_to_items:
            current_basket = day_to_items[day]
        else:
            continue  # Skip days without purchases

        # Look ahead within the interval, considering wrap around
        for delta in range(1, avg_interval + 1):
            next_day = (day + delta - 1) % 365 + 1  # Wrap around if exceeds 365
            if next_day in day_to_items:
                current_basket.update(day_to_items[next_day])

        if current_basket:  # Only add non-empty baskets
            baskets.append(list(current_basket))  # Converting it back to a list

    return baskets

In [31]:
def recommend_items(item, rules): # given an item and association rules, generate recommendation
    item_rules = rules[rules['antecedents'].apply(lambda x: item in set(x))]
    recommended_items = item_rules['consequents'].explode().unique()
    return recommended_items

In [2]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
def product_grouped_association(data,all_sales):

    #Merege saLes data and Ecomm data
    data_inter = pd.merge(data, all_sales, left_on='Item Number', right_on='item_cde')
    data_inter['cat_old'] = data_inter['productcategory_1'] + '_' + data_inter['productcategory_2']
    data_inter['cat'] = data_inter['cat_old']
    data_inter.loc[~data_inter['productcategory_3'].isna(), 'cat'] += '_' +data_inter['productcategory_3']

    #Groupby to gent day-wise baskets
    billto_itemfrequency = data_inter.groupby(['bill_to', 'invc_date', 'item_cde', 'so_key']).agg({'qty_ship': 'sum'}).reset_index()
    billto_itemfrequency['invc_date'] = pd.to_datetime(billto_itemfrequency['invc_date'])

    billto_itemfrequency = billto_itemfrequency.groupby(['bill_to', 'invc_date']).agg(
    item_cde=('item_cde', list),
    so_key=('so_key', list),
    qty_ship=('qty_ship', list),
    so_key_frequency=('so_key', lambda x: len(set(x)))  # Count unique 'so_key' values
    ).reset_index()

    transactions_grpby_bill_to = billto_itemfrequency.groupby(['bill_to']).agg(
    invc_date=('invc_date', list),
    item_cde=('item_cde', list),
    so_key=('so_key', list),
    qty_ship=('qty_ship', list),
    so_key_frequency=('so_key_frequency', 'sum')
    ).reset_index()


    #Find customer's average interval b/w purchases
    transactions_grpby_bill_to['invc_date_TO_item_cde'] = transactions_grpby_bill_to.apply(create_date_to_item_dict, axis=1)

    transactions_grpby_bill_to['time_span_min_max'] = transactions_grpby_bill_to['invc_date'].apply(lambda x: (max(x) - min(x)).days)
    transactions_grpby_bill_to['avg_interval'] = transactions_grpby_bill_to.apply(calculate_avg_time_interval, axis=1)

    transactions_grpby_bill_to['avg_interval'] = transactions_grpby_bill_to['avg_interval'].apply(lambda x: x if x >= 7 else 7)
    transactions_basket = transactions_grpby_bill_to.copy()

    # Create baskets using cyclic resampling to overcome non-uniform distribution issue
    transactions_basket['baskets'] = transactions_basket.apply(create_baskets_365, axis=1)

    flatten_basket_list=[item for sublist in list(transactions_basket['baskets']) for item in sublist]
    te = TransactionEncoder()
    te_ary = te.fit(flatten_basket_list).transform(flatten_basket_list)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Use Apriori algorithm to find frequent itemsets
    frequent_itemsets = apriori(df, min_support=0.0001, use_colnames=True)

    # Generate association rules
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

    #Write the top 5 recommendation to the items dataframe
    
    all_3700_items=list(data['Item Number'])

    all_recos=[]
    all_recos_nonzero=[]
    items=[]
    
    column_names=['Level','reco1', 'title1', 'reco2', 'title2', 'reco3', 'title3', 'reco4', 'title4', 'reco5', 'title5']

    for col_name in column_names:
        data[col_name] = None
    
    i=0
    for ite in all_3700_items:
        reco=recommend_items(ite, rules)
        all_recos.append(reco)
        if len(reco)>0:
            items.append(ite)
            j=1
            data.loc[data['Item Number']==ite, 'Level'] = 'Product'
            for r in reco:
                if j<6:   # Limit to top 5 recommendations
                    t=data.loc[data['Item Number'] == r, 'Product Title'].values[0]
                    prod_col=f"reco{j}"
                    tit_col=f"title{j}"
                    reco_to_add= {prod_col: r, tit_col: t }

                    for column, value in reco_to_add.items():
                        data.loc[data['Item Number']==ite, column] = value
                j+=1
        
            i+=1
            all_recos_nonzero.append(reco)
        

    

# Function for baskets combined cat association

In [48]:
# Use daywise cat 3 level baskets to form association and write top 5 recommendations to the
def combined_cat_association(df_inter,data):

    bill_to_items = df_inter.groupby(['bill_to', 'invc_date','Category']).agg({'qty_ship': sum}).reset_index()
    billto_items_qty= bill_to_items.groupby(['bill_to', 'invc_date']).agg({'Category': list,'qty_ship': list}).reset_index()
    item_pairs=list(billto_items_qty['Category'])


    te = TransactionEncoder()
    te_ary = te.fit(item_pairs).transform(item_pairs)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df, min_support=0.00005, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)


    all_cat=set(data['Category'])
    i=0
    for ite in all_cat:
        reco=recommend_items(ite, rules)        
        if len(reco)>0:
              
            data.loc[data['Item Number']==ite, 'Level'] = 'Cat3'
            #m1=data['reco1'].isnull()
            j=5
            for r in reco:
                if j>=1:
                    prod_col=f"reco{j}"
                    mask = (data['Category'] == ite) & (data['reco1'].isnull())
                    data.loc[mask, prod_col] = r
                j-=1        
            i+=1


# Function for baskets cat1 association

In [11]:
def cat1_association(df_inter,data):
    bill_to_items = df_inter.groupby(['bill_to', 'invc_date','productcategory_1']).agg({'qty_ship': sum}).reset_index()
    billto_items_qty= bill_to_items.groupby(['bill_to', 'invc_date']).agg({'productcategory_1': list,'qty_ship': list}).reset_index()
    item_pairs=list(billto_items_qty['productcategory_1'])


    te = TransactionEncoder()
    te_ary = te.fit(item_pairs).transform(item_pairs)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df, min_support=0.00005, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)



    all_cat=set(df_inter['productcategory_1'])
    i=0
    for ite in all_cat:
        reco=recommend_items(ite, rules)        
        if len(reco)>0:
            data.loc[data['Item Number']==ite, 'Level'] = 'Cat1'
            j=1
            for r in reco:
                if j<6:
                    prod_col=f"reco{j}"
                     
                    mask = (df_inter['productcategory_1'] == ite) & (data['reco1'].isnull())  # Only taking the most associated cat.
                    data.loc[mask, prod_col] = r
                j+=1        
            i+=1


# get items of interest custom input 

In [None]:
data_ecom=pd.read_excel('test_category_column.xlsx')


In [3]:
#Create a copy of ecom items, in order to 
data_ecom1=data_ecom.copy()
product_grouped_association(data_ecom1,all_sales_desc)



NameError: name 'data_ecom' is not defined

# Items with product level recommendations

In [51]:
filtered_values = data_ecom1.loc[~data_ecom1['reco1'].isnull(), 'Product Title']
filtered_values

7       Wypall® X70 White Medium Duty Cloth (8.34 in. ...
11      Oxivir® White Ready-To-Use Disinfectant Cleani...
14                PURELL HEALTHY SOAP™ Gentle & Free Foam
15      Ecolab® Clear Oxycide Daily Disinfectant Clean...
18                PURELL HEALTHY SOAP™ Gentle & Free Foam
                              ...                        
3659    Reliable Brand® 1-Ply White Paper Hardwound To...
3660    Reliable Brand® 2-Ply White Kitchen Paper Towe...
3663    Reliable Brand® 2-Ply White Jumbo JRT Bath Tis...
3665    Reliable Brand® 2-Ply White STD Bath Tissue Ro...
3725    Fresh Products Orange 30-Day Wave Urinal Scree...
Name: Product Title, Length: 379, dtype: object

In [53]:
#If any item has no recommendation, execute category-3 level association

if data_ecom1['reco1'].isnull().any():
    data_inter = pd.merge(data_ecom1, all_sales_desc, left_on='Item Number', right_on='item_cde')

    combined_cat_association(data_inter,data_ecom1)


  bill_to_items = df_inter.groupby(['bill_to', 'invc_date','Category']).agg({'qty_ship': sum}).reset_index()


In [55]:
filtered_values_2 = data_ecom1.loc[~data_ecom1['reco5'].isnull(), 'Product Title']
filtered_values_2

0       Wypall® X70 White Medium Duty Cloth (14.875 in...
1       Oxivir® TB White Disinfect Deodor Wipes (6 in....
2       Wypall® White Wypall® X70 Hydroknit Wiper with...
3       Oxivir® Clear Five Disinfectant Cleaner (1 gal...
4       Oxivir® Clear Five Disinfectant Cleaner (84.5 ...
                              ...                        
3769    VGuard® Yellow Latex Flock Lined Chemical-Resi...
3770    VGuard® Yellow Latex Flock Lined Chemical-Resi...
3771    VGuard® 16-mil Yellow Latex Flock Lined Chemic...
3772    VGuard® 13-mil Natural Latex Chemical-Resistan...
3773    VGuard® 13-mil Natural Latex Chemical-Resistan...
Name: Product Title, Length: 2920, dtype: object

In [54]:
data_ecom1.to_csv('data_ecom_with_recos_1.csv')