In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the order export (swap in the path to your own dataset)
data = pd.read_csv('Orders_New_query_2024_07_01.csv')

# Trim surrounding whitespace and upper-case the state names so that spelling
# variants of the same state share a single label
data['billing_address_state'] = (
    data['billing_address_state']
    .str.strip()
    .str.upper()
)

# Area-item matrix: one row per state, one column per SKU, each cell holding the
# summed ordered_quantity; state/SKU pairs with no value are set to 0
area_item_matrix = data.pivot_table(
    values='ordered_quantity',
    index='billing_address_state',
    columns='sku_id',
    aggfunc='sum',
).fillna(0)

# Item-item cosine similarity: transpose so each SKU becomes a row vector over
# states, then label both axes of the result with the SKU ids
item_similarity_matrix = pd.DataFrame(
    data=cosine_similarity(area_item_matrix.T),
    index=area_item_matrix.columns,
    columns=area_item_matrix.columns,
)

# Function to recommend items based on area and calculate similarity
def recommend_items_with_similarity(area, item, area_item_matrix, item_similarity_matrix, n_recommendations=5):
    """Recommend the most-ordered items in an area, scored by similarity to `item`.

    Parameters
    ----------
    area : str
        Area name. It is stripped and upper-cased before being looked up in
        ``area_item_matrix.index``.
    item : hashable
        Label of the reference item (a column of ``area_item_matrix``).
    area_item_matrix : pandas.DataFrame
        Rows are areas, columns are items, values are ordered quantities.
    item_similarity_matrix : pandas.DataFrame
        Item-by-item similarity table whose index and columns are item labels.
    n_recommendations : int, default 5
        Maximum number of items to return.

    Returns
    -------
    pandas.DataFrame or str
        On success, a frame with columns ``'Recommended Item'``,
        ``'Ordered Quantity'`` and ``'Similarity Score'``, sorted by similarity
        (highest first). It has fewer than ``n_recommendations`` rows when the
        area has not ordered that many other items. If the area is unknown, or
        the item has not been bought in the area, a human-readable message
        string is returned instead.
    """
    # Convert area to uppercase to match the format in the dataset
    area = area.strip().upper()

    # Check if the area is in the matrix
    if area not in area_item_matrix.index:
        return f"Area '{area}' not found in the dataset."

    # Get items bought in this area and sort by popularity
    items_in_area = area_item_matrix.loc[area].sort_values(ascending=False)

    # If the input item is not in the area or its quantity is zero, return a message
    if item not in items_in_area or items_in_area[item] == 0:
        return f"The item '{item}' has not been bought in area '{area}'. Popular items:\n{items_in_area.head(n_recommendations)}"

    # Candidates are the *other* items that were actually ordered in this area.
    # Without the quantity filter, an area with fewer than n_recommendations
    # purchased items would have its list padded with never-bought (0) items.
    is_candidate = (items_in_area > 0) & (items_in_area.index != item)
    recommendations = items_in_area[is_candidate].head(n_recommendations)

    # Get similarity scores for recommended items compared to the input item
    similarity_scores = item_similarity_matrix.loc[item, recommendations.index]

    # Combine recommendations with similarity scores
    recommendations_with_similarity = pd.DataFrame({
        'Recommended Item': recommendations.index,
        'Ordered Quantity': recommendations.values,
        'Similarity Score': similarity_scores.values
    }).sort_values(by='Similarity Score', ascending=False)

    return recommendations_with_similarity

# Example usage:
area = "TAMIL NADU"  # Replace with the desired area
input_item = 'amazon_B0CQM5CFS4'  # Replace with the SKU ID of the item input by the user
recommendations_with_similarity = recommend_items_with_similarity(area, input_item, area_item_matrix, item_similarity_matrix, 20)

# The function returns a plain message string (instead of a DataFrame) when the
# area is unknown or the item was not bought there; show that message as-is
# rather than labelling it as a list of recommendations.
if isinstance(recommendations_with_similarity, str):
    print(recommendations_with_similarity)
else:
    # Print recommendations with similarity scores
    print(f"Recommended items for area '{area}' excluding item '{input_item}':\n", recommendations_with_similarity)



Recommended items for area 'TAMIL NADU' excluding item 'amazon_B0CQM5CFS4':
                         Recommended Item  Ordered Quantity  Similarity Score
15                     amazon_B0CQK9T3B6              17.0          0.876930
10  shopify_9315034366241_48921914212641              22.0          0.876479
6                      amazon_B0CQK7KQPW              27.0          0.873644
13                     amazon_B0CTQ1NHQQ              18.0          0.869687
5                      amazon_B0CQK7NG2H              28.0          0.868324
0                      amazon_B0CTMQPVJ2              67.0          0.853492
9                      amazon_B0CQK69CQ5              22.0          0.853333
16                     amazon_B0CQM62MKW              16.0          0.850397
17                     amazon_B0CQM7CT8Z              15.0          0.839548
4                      amazon_B0CTMQF324              36.0          0.832995
7                      amazon_B0CTQBR3S3              26.0          0.829679

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load your dataset
df = pd.read_csv('Orders_New_query_2024_07_01.csv')

# Step 1: Preprocess Dataset
# Fill missing values with 0 in every column of the frame (in place).
# NOTE(review): this is not limited to the numeric columns, so a missing value in
# 'sku_id' or in one of the categorical columns below also becomes the literal 0 --
# confirm that is intended.
df.fillna(0, inplace=True)

# Scaler and the list of numeric columns it will be applied to
scaler = MinMaxScaler()
numerical_features = ['gross_merchandise_value', 'net_sales_before_tax', 'gift_wrap_expense',
                      'packaging_expense', 'handling_expense', 'shipping_expense']

# Convert to numeric and coerce errors (values that cannot be parsed become NaN)
for col in numerical_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with NaN values in numerical features. Missing values were already
# filled with 0 above, so the NaNs removed here are the ones introduced by the
# coercion step.
df.dropna(subset=numerical_features, inplace=True)

# Normalize numerical features: rescale each column with MinMaxScaler (default settings)
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# One-hot encode categorical variables
# NOTE(review): the product profiles below are built from numerical_features only,
# so these indicator columns do not feed into the similarity matrix in this cell.
categorical_features = ['source', 'refund_status', 'rto_status', 'cancellation_status',
                        'order_status', 'payment_status', 'billing_address_state']
df = pd.get_dummies(df, columns=categorical_features)

# Step 2: Create Product Profiles
# One row per sku_id holding the mean of each (scaled) numeric feature
product_profiles = df.groupby('sku_id')[numerical_features].mean().reset_index()

# Step 3: Calculate Similarity Matrix
# Exclude 'sku_id' from feature set for similarity calculation
feature_matrix = product_profiles.drop('sku_id', axis=1)
# Pairwise cosine similarity between the rows of feature_matrix (one row per SKU)
similarity_matrix = cosine_similarity(feature_matrix)

# Map SKU IDs to similarity matrix indices (the SKU's row position in product_profiles)
sku_id_to_index = {sku_id: index for index, sku_id in enumerate(product_profiles['sku_id'])}

# Step 4: Define Recommendation Function with Similarity Scores
def get_recommendations(sku_id, top_n=20):
    """Return the SKUs most similar to `sku_id`, with their similarity scores.

    Reads the module-level ``sku_id_to_index``, ``similarity_matrix`` and
    ``product_profiles`` built earlier in this cell.

    Parameters
    ----------
    sku_id : hashable
        SKU to find neighbours for.
    top_n : int, default 20
        Maximum number of recommendations; values below 0 are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Recommended SKU'`` and ``'Similarity Score'``, ordered from
        most to least similar. The frame is empty (same columns) when `sku_id`
        is not present in ``sku_id_to_index``.
    """
    if sku_id not in sku_id_to_index:
        return pd.DataFrame(columns=['Recommended SKU', 'Similarity Score'])

    own_index = sku_id_to_index[sku_id]

    # Exclude the query SKU by its matrix index rather than by assuming it sorts
    # first: another SKU whose profile is proportional to this one also scores
    # 1.0, and because the sort is stable it can land ahead of the query SKU.
    # Skipping "position 0" would then drop that neighbour and recommend the
    # query SKU to itself.
    candidates = [
        (i, score)
        for i, score in enumerate(similarity_matrix[own_index])
        if i != own_index
    ]

    # Sort by similarity score in descending order (ties keep matrix order)
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the top N; clamp so a negative top_n yields no rows instead of an
    # accidental "all but the last few" slice
    top_candidates = candidates[:max(top_n, 0)]
    top_sku_indices = [i for i, _ in top_candidates]
    top_scores = [score for _, score in top_candidates]

    # Translate matrix positions back to SKU ids
    recommended_skus = product_profiles.iloc[top_sku_indices]['sku_id'].tolist()

    # Create a DataFrame to display recommendations along with similarity scores
    recommendation_df = pd.DataFrame({
        'Recommended SKU': recommended_skus,
        'Similarity Score': top_scores
    })
    return recommendation_df

# Demo: fetch the ten closest SKUs for a single product
input_sku = 'amazon_B0CQM5CFS4'
recommendations_df = get_recommendations(sku_id=input_sku, top_n=10)

# One print call: header (ending in a newline), a blank line, then the table
print(
    f"Recommended Products for SKU '{input_sku}' with Similarity Scores:\n",
    recommendations_df,
    sep="\n",
)


Recommended Products for SKU 'amazon_B0CQM5CFS4' with Similarity Scores:

                        Recommended SKU  Similarity Score
0  shopify_9315474571553_48922644250913          0.999972
1  shopify_8878160478497_47580711223585          0.999934
2  shopify_8878160478497_47580711387425          0.999877
3  shopify_8878160478497_47580711321889          0.999861
4  shopify_9317907464481_48927605096737          0.999842
5  shopify_9319430717729_48933176705313          0.999841
6  shopify_9362407489825_49057674133793          0.999834
7  shopify_9362411716897_49057687961889          0.999830
8  shopify_9485180371233_49635563176225          0.999755
9  shopify_8848983261473_47478863069473          0.999732
