In [2]:
import numpy as np
import pandas as pd
import pickle
import time
from collections import defaultdict

from tqdm.notebook import tqdm
pd.set_option('future.no_silent_downcasting', True)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
file_path = 'flattened_basket_list_365_qty.pkl'
with open(file_path, 'rb') as f:
    baskets_flattened = pickle.load(f)

In [4]:
baskets_flattened

[{'10728601': Decimal('80.0000000000'),
  '10036270': Decimal('540.0000000000'),
  '10248951': Decimal('2.0000000000'),
  '10506457': Decimal('700.0000000000')},
 {'10036270': Decimal('540.0000000000'),
  '10248951': Decimal('2.0000000000'),
  '10506457': Decimal('700.0000000000')},
 {'10506457': Decimal('700.0000000000'),
  '10728306': Decimal('4.0000000000'),
  '10248951': Decimal('1.0000000000')},
 {'10728306': Decimal('4.0000000000'),
  '10248951': Decimal('1.0000000000'),
  '10728643': Decimal('40.0000000000')},
 {'10248951': Decimal('1.0000000000'), '10728643': Decimal('40.0000000000')},
 {'10728643': Decimal('40.0000000000')},
 {'10248951': Decimal('2.0000000000')},
 {'10248951': Decimal('1.0000000000')},
 {'10347391': Decimal('10.0000000000')},
 {'10347391': Decimal('8.0000000000')},
 {'10248951': Decimal('1.0000000000')},
 {'10024244': Decimal('1.0000000000'),
  '10472119': Decimal('1.0000000000'),
  '10248951': Decimal('1.0000000000')},
 {'10248951': Decimal('1.0000000000'),


# only cooccurance

In [5]:
# ITEMS WITH QTY

import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from decimal import Decimal

def create_cooccurrence_matrix(baskets):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(lambda: defaultdict(Decimal))

    # Loop through each basket
    for basket in tqdm(baskets):
        # Extract items and their quantities from the basket
        items = list(basket.keys())
        quantities = list(basket.values())
        
        # Sort the items to ensure that each pair is counted once
        sorted_items = sorted(items)
        
        # Count each pair in the basket
        for i in range(len(sorted_items)):
            for j in range(i + 1, len(sorted_items)):
                item1, item2 = sorted_items[i], sorted_items[j]
                quantity1, quantity2 = quantities[items.index(item1)], quantities[items.index(item2)]
                
                # Increment the co-occurrence count by the minimum quantity of the two items
                co_occurrence[item1][item2] += min(quantity1, quantity2)
                co_occurrence[item2][item1] += min(quantity1, quantity2)

    # Convert to DataFrame
    # Extract items and sort them to ensure DataFrame columns and rows are aligned
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count

    return df

start_time = time.time()
cooccurrence_matrix = create_cooccurrence_matrix(baskets_flattened)
display(cooccurrence_matrix.head(3))
end_time = time.time()
elapsed_time = round((end_time - start_time), 2)
print(f"Time Taken = {elapsed_time} seconds")

  0%|          | 0/345958 [00:00<?, ?it/s]

 60%|█████▉    | 206715/345958 [00:09<00:06, 22331.57it/s]


KeyboardInterrupt: 

In [None]:
# ONLY ITEMS
def create_cooccurrence_matrix(baskets):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(lambda: defaultdict(int))

    # Loop through each basket
    for basket in tqdm(baskets):
        # Sort the basket to ensure that each pair is counted once
        sorted_basket = sorted(basket)
        # Count each pair in the basket
        for i in range(len(sorted_basket)):
            for j in range(i + 1, len(sorted_basket)):
                item1, item2 = sorted_basket[i], sorted_basket[j]
                co_occurrence[item1][item2] += 1
                co_occurrence[item2][item1] += 1

    # Convert to DataFrame
    # Extract items and sort them to ensure DataFrame columns and rows are aligned
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count

    return df

start_time = time.time()
cooccurrence_matrix = create_cooccurrence_matrix(baskets_flattened)
display(cooccurrence_matrix.head(3))
end_time = time.time()
elapsed_time = round((end_time - start_time), 2)
print(f"Time Taken = {elapsed_time} seconds")


  0%|          | 0/345958 [00:00<?, ?it/s]

  0%|          | 0/3367 [00:00<?, ?it/s]

Unnamed: 0,10012415,10012416,10012418,10012422,10012427,10012428,10012628,10012665,10012712,10012724,...,20080830,20080840,20080849,20080888,20080891,20082513,20082920,20083097,20085123,20092466
10012415,0,435,79,213,606,435,9,10,394,408,...,0,0,0,204,0,0,0,0,0,0
10012416,435,0,0,67,409,376,0,1,164,469,...,0,0,0,235,0,0,0,0,0,0
10012418,79,0,0,126,63,106,79,0,77,54,...,0,0,0,0,0,0,0,0,0,0


Time Taken = 10.35 seconds


# co-occur with reco (top 5)

In [14]:
def create_cooccurrence_matrix_with_recommendations(baskets, top_n=5):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(lambda: defaultdict(int))
    recommendations = defaultdict(lambda: defaultdict(int))

    # Loop through each basket
    for basket in tqdm(baskets):
        # Extract items and their quantities from the basket
        items = list(basket.keys())
        quantities = list(basket.values())
        
        # Sort the items to ensure that each pair is counted once
        sorted_items = sorted(items)
        
        # Count each pair in the basket
        for i in range(len(sorted_items)):
            for j in range(i + 1, len(sorted_items)):
                item1, item2 = sorted_items[i], sorted_items[j]
                #quantity1, quantity2 = quantities[items.index(item1)], quantities[items.index(item2)]
                
                # Increment the co-occurrence count by the minimum quantity of the two items
                co_occurrence[item1][item2] += 1
                co_occurrence[item2][item1] += 1

    # Convert to DataFrame
    # Extract items and sort them to ensure DataFrame columns and rows are aligned
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame and make recommendations
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count
            recommendations[item1][item2] = count
            recommendations[item2][item1] = count
    
    # Sort the recommendations based on count and take top-N
    rec_df = pd.DataFrame(index=items, columns=[f"Recommendation {i+1}" for i in range(top_n)]+ [f"prop {i+1}" for i in range(top_n)])
    for item, recs in recommendations.items():
        # Remove the queried item from recommendations
        recs.pop(item, None)
        # Sort by count and take top-N
        sorted_recs = sorted(recs.items(), key=lambda x: x[1], reverse=True)
        total_count = sum(count for _, count in sorted_recs)
        #print(item)
        #print(sorted_recs)
        for i, (rec_item, fre) in enumerate(sorted_recs[:top_n]):
            rec_df.at[item, f"Recommendation {i+1}"] = rec_item
            rec_df.at[item, f"prop {i+1}"] = fre/total_count

    return df, rec_df


In [23]:
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations(baskets_flattened)

# print("Co-occurrence matrix:")
# print(co_occurrence_matrix)
print("\nRecommendation DataFrame:")
print(recommendation_df)

100%|██████████| 345958/345958 [00:03<00:00, 109692.07it/s]
100%|██████████| 3367/3367 [00:07<00:00, 469.08it/s] 



Recommendation DataFrame:
         Recommendation 1 Recommendation 2 Recommendation 3 Recommendation 4  \
10012415         10012427         10941576         10098323         10012428   
10012416         10012724         10012415         10012427         10012428   
10012418         11083647         10941576         10770896         10050648   
10012422         10098323         20037330         20037325         10624692   
10012427         10012712         10012415         10012724         10012422   
...                   ...              ...              ...              ...   
20082513         10941576         20064070         20064071         10396065   
20082920         20064071         20064073         20064074         10366590   
20083097         20037330         11097301         10957701         10805431   
20085123         11079694         10941576         11136317         10360972   
20092466         10728601         10049675         10766745         10961173   

         Rec

In [24]:
recommendation_df = recommendation_df.rename_axis('item_cde')
recommendation_df.reset_index(inplace=True)

for i in range(1, 6):
    col_name = f'Recommendation {i}'  # Recommendation column name
    desc_col_name = f'Description {i}'  # Description column name
    # Insert description column next to recommendation column
    recommendation_df.insert(recommendation_df.columns.get_loc(col_name) + 1, desc_col_name, map_item_descriptions(recommendation_df[col_name], ecom_df))

#df_cooccur=pd.read_csv('cooccur_top_5_reco_with_description.csv')
#df_cooccur_qty=pd.read_csv('cooccur_top_5_reco_with_description_3_prop.csv')

recommendation_df['item_cde'] = recommendation_df['item_cde'].astype(str)
# #df_cooccur_qty['item_cde'] = df_cooccur_qty['item_cde'].astype(str)


recommendation_df.insert(recommendation_df.columns.get_loc('item_cde') + 1, 'item description', map_item_descriptions(df_cooccur['item_cde'], ecom_df))
# #df_cooccur_qty.insert(df_cooccur_qty.columns.get_loc('item_cde') + 1, 'item description', map_item_descriptions(df_cooccur_qty['item_cde'], ecom_df))

#recommendation_df.to_csv('results_nonweighted_4_withprop')


In [25]:
import pandas as pd
from collections import defaultdict
from itertools import combinations

def create_pair_frequency_matrix(baskets):
    # Extract unique products from all baskets
    unique_products = set()
    for basket in baskets:
        unique_products.update(basket.keys())
    unique_products = sorted(unique_products)
    num_products = len(unique_products)
    
    # Initialize nested dictionary to store pair frequencies
    pair_freq_dict = defaultdict(lambda: defaultdict(int))
    
    # Iterate over each basket
    for basket in baskets:
        products = sorted(basket.keys())
        
        # Get all pairs of items in the basket
        pairs = list(combinations(products, 2))
        
        # Update frequencies in the pair frequency dictionary
        for pair in pairs:
            pair_freq_dict[pair[0]][pair[1]] += 1
            pair_freq_dict[pair[1]][pair[0]] += 1
    
    # Convert nested dictionary to DataFrame
    pair_freq_matrix = pd.DataFrame(pair_freq_dict).reindex(index=unique_products, columns=unique_products).fillna(0)
    
    return pair_freq_matrix


# Get the pair frequency matrix
pair_freq_matrix = create_pair_frequency_matrix(baskets_flattened)

#pair_freq_matrix.to_csv('pair_freq_matrix.csv', index=True)


In [26]:
# Find the maximum frequency
max_frequency = pair_freq_matrix.max().max()

# Count the number of frequencies greater than 200
num_frequencies_gt_200 = (pair_freq_matrix > 200).sum().sum()

print("Maximum frequency:", max_frequency)

Maximum frequency: 6339.0


In [27]:
# Extract upper triangular part of the matrix (excluding diagonal)
upper_triangular_matrix = np.triu(pair_freq_matrix, k=1)

# Count the number of frequencies greater than 200
num_frequencies_gt_200 = np.sum(upper_triangular_matrix > 200)

print("Number of frequencies > 200 (excluding double counting):", num_frequencies_gt_200)


Number of frequencies > 200 (excluding double counting): 9092


In [30]:
num_frequencies_eq_0 = np.sum(upper_triangular_matrix > 100)

print("Number of frequencies = 0  :", num_frequencies_eq_0/((len(pair_freq_matrix)-1)*(len(pair_freq_matrix)-1)) )

Number of frequencies = 0  : 0.0019375041311388724


# Plot takes long (200+min)

In [None]:
import seaborn as sns

# Extract upper triangular part of the matrix (excluding diagonal)
upper_triangular_matrix = pair_freq_matrix.where(np.triu(np.ones(pair_freq_matrix.shape), k=1).astype(bool))

# Plot the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(upper_triangular_matrix, cmap='viridis', annot=True, fmt='g')
plt.title('Pair Frequency Matrix (Upper Triangular)')
plt.xlabel('Products')
plt.ylabel('Products')
plt.xticks(ticks=range(len(pair_freq_matrix.columns)), labels=pair_freq_matrix.columns, rotation=45)
plt.yticks(ticks=range(len(pair_freq_matrix.index)), labels=pair_freq_matrix.index)
plt.tight_layout()
plt.show()


# add freq columns to the weighted df

In [49]:
result_with_prop=pd.read_csv('results_weighted_4_withprop_old.csv')

In [38]:
result_with_prop.dtypes


Unnamed: 0            int64
item_cde              int64
item description     object
Recommendation 1      int64
Description 1        object
Recommendation 2    float64
Description 2        object
Recommendation 3    float64
Description 3        object
Recommendation 4    float64
Description 4        object
Recommendation 5    float64
Description 5        object
freq 1              float64
freq 2              float64
freq 3              float64
freq 4              float64
freq 5              float64
dtype: object

In [50]:
# Convert 'item_cde' column to string type
result_with_prop['item_cde'] = result_with_prop['item_cde'].astype(str)

# Change data type of recommendation columns to string
result_with_prop[['Recommendation 1', 'Recommendation 2', 'Recommendation 3', 'Recommendation 4', 'Recommendation 5']] = result_with_prop[['Recommendation 1', 'Recommendation 2', 'Recommendation 3', 'Recommendation 4', 'Recommendation 5']].astype(str)

# Now, iterate over each recommendation column
for i in range(1, 6):
    recommendation_col = 'Recommendation {}'.format(i)
    freq_col = 'frequency {}'.format(i)
    result_with_prop[recommendation_col] = result_with_prop[recommendation_col].apply(lambda x: str(x).replace('.0', '') if pd.notnull(x) else x)
    
    # Compute frequencies using the pair frequency matrix
    result_with_prop[freq_col] = result_with_prop.apply(
        lambda row: pair_freq_matrix.get(str(row['item_cde']), {}).get(row[recommendation_col], 0), axis=1)

# Display the updated DataFrame
print(result_with_prop)


      Unnamed: 0  item_cde                                   item description  \
0              0  10012415  8J8 FOAM CUP 8OZ SMALL INSULATED EPS HOT/COLD ...   
1              1  10012416  12J16 FOAM CUP 12OZ J CUP INSULATED EPS HOT/CO...   
2              2  10012418  10J10 CUP 10 OUNCE HOT OR COLD INSULATED FOAM ...   
3              3  10012422  12J12 CUP 12 OUNCE HOT OR COLD INSULATED FOAM ...   
4              4  10012427  16J16 FOAM CUP 16OZ J CUP INSULATED EPS HOT/CO...   
...          ...       ...                                                ...   
3362        3362  20082513  MWHWT-4035 CLEANING SYSTEM 72X52X60 MYSTICWASH...   
3363        3363  20082920  9022386 FLOOR SCRUBBER 18IN TENNANT I-MOP XL P...   
3364        3364  20083097  53800 CARTON SEALING TAPE 48MMX1500M 1.9MIL 3M...   
3365        3365  20085123  D725332 AUTONOMOUS VACUUM ROBOT 15IN TASKI GS ...   
3366        3366  20092466  8 1/2X11 10M 20# WHITE NATURAL CHOICE MULTIPUR...   

     Recommendation 1      

In [51]:
print(result_with_prop['Recommendation 2'])


0       10659222
1       20012592
2       10012724
3       10781406
4       10012415
          ...   
3362    20064070
3363    20064073
3364    10728236
3365    10941576
3366    10735775
Name: Recommendation 2, Length: 3367, dtype: object


In [52]:
result_with_prop.to_csv('result_with_freq_3.csv')

In [100]:
co_occurrence_matrix.to_csv('count_matrix_1.csv')

# weighted

In [167]:
def create_cooccurrence_matrix_with_recommendations_2(baskets, top_n=5):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(lambda: defaultdict(int))
    recommendations = defaultdict(lambda: defaultdict(int))

    # Loop through each basket
    for basket in tqdm(baskets):
        # Extract items and their quantities from the basket
        items = list(basket.keys())
        quantities = list(basket.values())
        
        # Sort the items to ensure that each pair is counted once
        sorted_items = sorted(items)
        
        # Count each pair in the basket
        for i in range(len(sorted_items)):
            for j in range(i + 1, len(sorted_items)):
                item1, item2 = sorted_items[i], sorted_items[j]
                quantity1, quantity2 = quantities[items.index(item1)], quantities[items.index(item2)]
                
                # Increment the co-occurrence count by the minimum quantity of the two items
                co_occurrence[item1][item2] += min(quantity1, quantity2)
                co_occurrence[item2][item1] += min(quantity1, quantity2)

    # Convert to DataFrame
    # Extract items and sort them to ensure DataFrame columns and rows are aligned
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame and make recommendations
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count
            recommendations[item1][item2] = count
            recommendations[item2][item1] = count
    
    # Sort the recommendations based on count and take top-N
    rec_df = pd.DataFrame(index=items, columns=[f"Recommendation {i+1}" for i in range(top_n)]+ [f"freq {i+1}" for i in range(top_n)])
    for item, recs in recommendations.items():
        # Remove the queried item from recommendations
        recs.pop(item, None)
        # Sort by count and take top-N
        sorted_recs = sorted(recs.items(), key=lambda x: x[1], reverse=True)
        #print(item)
        #print(sorted_recs)
        total_count = sum(count for _, count in sorted_recs)
        for i, (rec_item, fre) in enumerate(sorted_recs[:top_n]):
            rec_df.at[item, f"Recommendation {i+1}"] = rec_item
            rec_df.at[item, f"freq {i+1}"] = fre/total_count

    return df, rec_df


In [168]:
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations_2(baskets_flattened)

# print("Co-occurrence matrix:")
# print(co_occurrence_matrix)
print("\nRecommendation DataFrame:")
print(recommendation_df)

  0%|          | 0/345958 [00:00<?, ?it/s]

100%|██████████| 345958/345958 [00:13<00:00, 25541.82it/s]
100%|██████████| 3367/3367 [00:06<00:00, 482.24it/s] 



Recommendation DataFrame:
         Recommendation 1 Recommendation 2 Recommendation 3 Recommendation 4  \
10012415         10012427         10659222         10098323         10357365   
10012416         10012724         20012592         10357365         10659222   
10012418         10012428         10012724         10941576         10050648   
10012422         10098323         10781406         10049812         10928677   
10012427         10012724         10012415         10012416         10357365   
...                   ...              ...              ...              ...   
20082513         10941576         20064070         20064071         10396065   
20082920         20064071         20064073         20064074         10366590   
20083097         10036584         10728236         10040884         10059435   
20085123         11079694         10941576         11136317         10360972   
20092466         10735777         10735775         10802527         10766029   

         Rec

In [169]:
recommendation_df = recommendation_df.rename_axis('item_cde')
recommendation_df.reset_index(inplace=True)

for i in range(1, 6):
    col_name = f'Recommendation {i}'  # Recommendation column name
    desc_col_name = f'Description {i}'  # Description column name
    # Insert description column next to recommendation column
    recommendation_df.insert(recommendation_df.columns.get_loc(col_name) + 1, desc_col_name, map_item_descriptions(recommendation_df[col_name], ecom_df))

recommendation_df['item_cde'] = recommendation_df['item_cde'].astype(str)



recommendation_df.insert(recommendation_df.columns.get_loc('item_cde') + 1, 'item description', map_item_descriptions(df_cooccur['item_cde'], ecom_df))

recommendation_df.to_csv('results_weighted_4_withprop')

# With proportions calculated (error in the number counts)

weighted method with quantities

In [112]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

def create_cooccurrence_matrix_with_recommendations_2_normalize(baskets, top_n=5):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(dict)
    item_counts = defaultdict(int)

    # Loop through each basket
    for basket in tqdm(baskets):
        # Extract items and their quantities from the basket
        items = list(basket.keys())
        
        # Update item counts
        for item in items:
            item_counts[item] += 1
        
        # Sort the items to ensure that each pair is counted once
        sorted_items = sorted(items)
        
        # Count each pair in the basket
        for i in range(len(sorted_items)):
            for j in range(i + 1, len(sorted_items)):
                item1, item2 = sorted_items[i], sorted_items[j]
                
                # Increment the co-occurrence count by the minimum quantity of the two items
                co_occurrence[item1][item2] = co_occurrence[item1].get(item2, 0) + 1
                co_occurrence[item2][item1] = co_occurrence[item2].get(item1, 0) + 1

    # Convert to DataFrame
    # Extract items and sort them to ensure DataFrame columns and rows are aligned
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame and make recommendations
    rec_df = pd.DataFrame(index=items, columns=[f"Recommendation {i+1}" for i in range(top_n)])
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count
            df.at[item2, item1] = count  # Ensure symmetry
            # Store recommendation
            co_occurrence[item1][item2] = count
            co_occurrence[item2][item1] = count
    
    # Sort the recommendations based on count and take top-N
    for item, recs in co_occurrence.items():
        # Remove the queried item from recommendations
        recs.pop(item, None)
        # Sort by count and take top-N
        sorted_recs = sorted(recs.items(), key=lambda x: x[1], reverse=True)[:top_n]
        for i, (rec_item, count) in enumerate(sorted_recs):
            proportion = count / item_counts[rec_item]
            rec_df.at[item, f"Recommendation {i+1}"] = rec_item
            rec_df.at[item, f"Proportion {i+1}"] = proportion

    return df, rec_df


non-weighted method

In [140]:
def create_cooccurrence_matrix_with_recommendations_3_noqty(baskets, top_n=5):
    # Initialize the dictionary of dictionaries
    co_occurrence = defaultdict(lambda: defaultdict(int))
    recommendations = defaultdict(lambda: defaultdict(int))
    item_counts = defaultdict(int)

    # Loop through each basket
    for basket in tqdm(baskets):
        # Extract items from the basket
        items = list(basket.keys())
        
        # Update item counts
        for item in items:
            item_counts[item] += 1
        
        # Sort the items to ensure that each pair is counted once
        sorted_items = sorted(items)
        
        # Count each pair in the basket
        for i in range(len(sorted_items)):
            for j in range(i + 1, len(sorted_items)):
                item1, item2 = sorted_items[i], sorted_items[j]
                
                # Increment the co-occurrence count
                co_occurrence[item1][item2] += 1
                co_occurrence[item2][item1] += 1

    # Convert co-occurrence counts to DataFrame
    items = sorted(co_occurrence.keys())
    df = pd.DataFrame(index=items, columns=items).fillna(0)
    
    # Fill the DataFrame and make recommendations
    for item1, neighbors in tqdm(co_occurrence.items()):
        for item2, count in neighbors.items():
            df.at[item1, item2] = count
            recommendations[item1][item2] = count
            recommendations[item2][item1] = count
    
    # Sort the recommendations based on count and take top-N
    rec_df = pd.DataFrame(index=items, columns=[f"Recommendation {i+1}" for i in range(top_n)])
    for item, recs in recommendations.items():
        # Remove the queried item from recommendations
        recs.pop(item, None)
        # Sort by count and take top-N
        sorted_recs = sorted(recs.items(), key=lambda x: x[1], reverse=True)[:top_n]
        for i, (rec_item, count) in enumerate(sorted_recs):
            proportion = count / item_counts[rec_item]
            rec_df.at[item, f"Recommendation {i+1}"] = rec_item
            rec_df.at[item, f"Proportion {i+1}"] = proportion

    return df, rec_df

In [141]:
co_occurrence_matrix, recommendation_df = create_cooccurrence_matrix_with_recommendations_3_noqty(baskets_flattened)

# print("Co-occurrence matrix:")
# print(co_occurrence_matrix)
print("\nRecommendation DataFrame:")
print(recommendation_df)

100%|██████████| 345958/345958 [00:03<00:00, 107203.31it/s]
100%|██████████| 3367/3367 [00:06<00:00, 492.59it/s] 



Recommendation DataFrame:
         Recommendation 1 Recommendation 2 Recommendation 3 Recommendation 4  \
10012415         10012427         10941576         10098323         10012428   
10012416         10012724         10012415         10012427         10012428   
10012418         11083647         10941576         10770896         10050648   
10012422         10098323         20037330         20037325         10624692   
10012427         10012712         10012415         10012724         10012422   
...                   ...              ...              ...              ...   
20082513         10941576         20064070         20064071         10396065   
20082920         20064071         20064073         20064074         10366590   
20083097         20037330         11097301         10957701         10805431   
20085123         11079694         10941576         11136317         10360972   
20092466         10728601         10049675         10766745         10961173   

         Rec

In [131]:
recommendation_df = recommendation_df.rename_axis('item_cde')


In [132]:
recommendation_df

Unnamed: 0_level_0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5,Proportion 1,Proportion 2,Proportion 3,Proportion 4,Proportion 5
item_cde,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10012415,10012427,10941576,10098323,10012416,10012428,0.254515,0.027842,0.039326,0.553435,0.380245
10012416,10012724,10012415,10012427,10012428,10504765,0.686676,0.135599,0.171777,0.328671,0.620278
10012418,11083647,10941576,10770896,10050648,10624692,0.094693,0.021225,0.164581,0.598246,0.072380
10012422,10098323,20037330,20037325,10624692,10012427,0.061811,0.039243,0.071789,0.103952,0.175556
10012427,10012712,10012415,10012724,10012422,10012416,0.615672,0.188903,0.644217,0.116825,0.520356
...,...,...,...,...,...,...,...,...,...,...
20082513,10042222,10042733,10042792,10043274,10048312,0.005181,0.002312,0.002435,0.002464,0.001868
20082920,20064071,20064073,20064074,10366590,10396076,0.001397,0.001420,0.002009,0.001099,0.002482
20083097,20037330,10805431,10957701,11097301,10728010,0.002934,0.042674,0.015568,0.014111,0.110266
20085123,10042222,10042733,10042792,10042795,10043274,0.005181,0.002312,0.002435,0.002017,0.002464


In [116]:
recommendation_df.to_csv('only_itemcde_3.csv')

In [117]:
recommendation_df.to_csv('co_occurrence_matrix_results_4_prop.csv')

In [17]:
ecom_df=pd.read_excel('digital_commerce_items.xlsx')

In [18]:
ecom_df['Item Number'] = ecom_df['Item Number'].astype(str)


In [120]:
type(ecom_df['Item Number'][0])

str

In [122]:
for i in range(1, 6):
    col_name = f'Recommendation {i}'  # Recommendation column name
    desc_col_name = f'Description {i}'  # Description column name
    # Insert description column next to recommendation column
    recommendation_df.insert(recommendation_df.columns.get_loc(col_name) + 1, desc_col_name, map_item_descriptions(recommendation_df[col_name], ecom_df))

print(recommendation_df)

         Recommendation 1                                      Description 1  \
item_cde                                                                       
10012415         10012427  16J16 FOAM CUP 16OZ J CUP INSULATED EPS HOT/CO...   
10012416         10012724  16FTLS LID SNAP-TIGHT FOAM CUP LIFT N LOCK W/S...   
10012418         11083647  429791 URINAL SCREEN 4OZ RELIABLE BRAND NON-PA...   
10012422         10098323  07006 TOILET TISSUE 3.78X1150FT 2 PLY SCOTT CO...   
10012427         10012712  16SL LID DART SNAP-TIGHT STRAW SLOTTED FITS 12...   
...                   ...                                                ...   
20082513         10042222  07223 TOILET TISSUE 3.55X2000FT 1PLY KC SCOTT ...   
20082920         20064071  A11A12 EXAM GLOVE MED 3.5MIL VGUARD NITRILE DI...   
20083097         20037330  V00351 BATH TISSUE 3.3X1000FT 2PLY RELIABLE BR...   
20085123         10042222  07223 TOILET TISSUE 3.55X2000FT 1PLY KC SCOTT ...   
20092466         10766745  V00304 CT SEA

In [123]:
recommendation_df.to_csv('cooccur_top_5_reco_with_description_3_prop.csv')

In [22]:
df_cooccur=pd.read_csv('cooccur_top_5_reco_with_description.csv')
#df_cooccur_qty=pd.read_csv('cooccur_top_5_reco_with_description_3_prop.csv')

df_cooccur['item_cde'] = df_cooccur['item_cde'].astype(str)
#df_cooccur_qty['item_cde'] = df_cooccur_qty['item_cde'].astype(str)


df_cooccur.insert(df_cooccur.columns.get_loc('item_cde') + 1, 'item description', map_item_descriptions(df_cooccur['item_cde'], ecom_df))
#df_cooccur_qty.insert(df_cooccur_qty.columns.get_loc('item_cde') + 1, 'item description', map_item_descriptions(df_cooccur_qty['item_cde'], ecom_df))

In [126]:
df_cooccur.to_csv('cooccur_results_2_prop_no weight.csv')
#df_cooccur_qty.to_csv('cooccur_results_4_prop.csv')

In [31]:
ass_result=pd.read_csv('results_apriori_s_0.002_c_0.3.csv')
ass_result_product=ass_result[ass_result['Level'] == 'Product']

# Rearrange for comparison w apriori result

In [35]:

# Assuming ass_result_product is your DataFrame containing 164 unique item_cde
# Assuming cooccur_top_5_reco_with_description is your DataFrame to be rearranged

# Step 1: Get the 164 unique item_cde
unique_item_cde = ass_result_product['item_cde'].unique()

# Step 2: Merge cooccur_top_5_reco_with_description with ass_result_product
rearranged_df = pd.merge(ass_result_product[['item_cde']], top_5_reco_df, on='item_cde', how='left')

# Step 3: Concatenate the remaining rows from cooccur_top_5_reco_with_description
remaining_df = top_5_reco_df[~top_5_reco_df['item_cde'].isin(unique_item_cde)]
rearranged_df = pd.concat([rearranged_df, remaining_df])

# Now rearranged_df contains the DataFrame with 164 unique item_cde brought to the top, maintaining the same order as in ass_result_product


In [38]:
rearranged_df.to_csv('rearranged_top5_df_cooccur.csv')