In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
from rank_bm25 import BM25Okapi

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [6]:
# Load your dataset
try:
    df = pd.read_csv('/Users/eliduba/Documents/GitHub/edmrec/backend/recommendations/testing/cleaned_for_experiment.csv')  # Replace with your actual file path
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: The file 'datasets.csv' was not found. Please check the file path and try again.")
    import sys
    sys.exit()
except pd.errors.EmptyDataError:
    print("Error: The file 'datasets.csv' is empty.")
    sys.exit()
except pd.errors.ParserError:
    print("Error: The file 'datasets.csv' is malformed or contains parsing errors.")
    sys.exit()

# Display the first few rows to verify
print(df.head())


Dataset loaded successfully.
     id                                       title  \
0     1                             E-Commerce Data   
1    10  Sales of summer clothes in E-commerce Wish   
2  1000         AV JanataHack Cross-Sell Prediction   
3  1000                             E-Commerce Data   
4  1000                             E-Commerce Data   

                                         description  \
0               Actual transactions from UK retailer   
1  Top products with ratings and sales performanc...   
2                             Janata Hack Cross-Sell   
3               Actual transactions from UK retailer   
4               Actual transactions from UK retailer   

                                                 url   size format  \
0      https://www.kaggle.com/carrie1/ecommerce-data    7MB    csv   
1  https://www.kaggle.com/jmmvutu/summer-products...  406KB    csv   
2  https://www.kaggle.com/jinxzed/av-janatahack-c...    6MB    csv   
3      https://www.kagg

In [7]:
# Combine processed title and description for TF-IDF
df['combined_text'] = df['title'] + ' ' + df['description']


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer with stopword removal
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the combined text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Convert to dense array for similarity computations if needed
# (Note: For large datasets, keep it sparse)
tfidf_embeddings = tfidf_matrix.toarray()

print("TF-IDF vectorization complete.")


TF-IDF vectorization complete.


In [10]:
from rank_bm25 import BM25Okapi

# Prepare documents for BM25 (tokenized)
documents = [doc.split() for doc in df['combined_text']]

# Initialize BM25
bm25 = BM25Okapi(documents)

print("BM25 indexing complete.")


BM25 indexing complete.


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_datasets(query, top_k=10):
    """
    Generate dataset recommendations based on the input query using BM25 and TF-IDF.
    
    Parameters:
        query (str): The user query.
        top_k (int): Number of top recommendations to return.
    
    Returns:
        pd.DataFrame: DataFrame containing the recommended datasets with 'id', 'title', 'description', and 'relevance_score'.
    """
  
    
    # BM25 scores
    bm25_scores = bm25.get_scores(query)
    
    # TF-IDF similarity
    query_tfidf = tfidf_vectorizer.transform([query]).toarray()
    tfidf_sim = cosine_similarity(query_tfidf, tfidf_embeddings)[0]
    
    # Normalize BM25 scores and TF-IDF similarities
    if np.max(bm25_scores) != 0:
        bm25_norm = bm25_scores / np.max(bm25_scores)
    else:
        bm25_norm = bm25_scores
    
    if np.max(tfidf_sim) != 0:
        tfidf_norm = tfidf_sim / np.max(tfidf_sim)
    else:
        tfidf_norm = tfidf_sim
    
    # Combined relevance score (equal weights)
    combined_score = (bm25_norm + tfidf_norm) / 2.0
    
    # Get top_k indices
    top_indices = combined_score.argsort()[-top_k:][::-1]
    
    # Retrieve the top_k datasets
    top_datasets = df.iloc[top_indices].copy()
    top_datasets['relevance_score'] = combined_score[top_indices]
    
    return top_datasets[['id', 'title', 'description', 'relevance_score']]


In [12]:
# Example query
user_query = "customer purchase behavior analysis"

# Generate recommendations
top_k = 10
recommendations = recommend_datasets(user_query, top_k=top_k * 2)
unique_recommendations = recommendations.drop_duplicates(subset=['id']).head(top_k)

# Display the recommendations
print(unique_recommendations)


       id                                             title  \
146  1134  Avaliações em Português - Amazon e Mercado Livre   
42   1013                e-Commerce (Walmart) Sales Dataset   
343    14                e-Commerce (Walmart) Sales Dataset   
268    13              E-commerce Customer Behavior Dataset   
38   1012              E-commerce Customer Behavior Dataset   
62   1024                          Sales data of e commerce   
67   1031                               Etailers DATA India   
424  1519                           Customer_buying_dataset   
390  1477                    BigBasket Descriptive Analysis   
261  1290                               HPE servers dataset   

                                           description  relevance_score  
146   Avaliações de produtos da amazon e mercado livre         0.500000  
42         Customer Purchase Patterns and Demographics         0.500000  
343        Customer Purchase Patterns and Demographics         0.500000  
268  Explo

In [13]:
def precision_at_k(actual, predicted, k):
    """
    Calculate Precision@K.
    """
    predicted_at_k = predicted[:k]
    relevant = set(actual)
    recommended = set(predicted_at_k)
    return len(relevant & recommended) / k if k > 0 else 0

def recall_at_k(actual, predicted, k):
    """
    Calculate Recall@K.
    """
    predicted_at_k = predicted[:k]
    relevant = set(actual)
    recommended = set(predicted_at_k)
    return len(relevant & recommended) / len(relevant) if len(relevant) > 0 else 0

def f1_score_at_k(actual, predicted, k):
    """
    Calculate F1 Score@K.
    """
    precision = precision_at_k(actual, predicted, k)
    recall = recall_at_k(actual, predicted, k)
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

def average_precision_at_k(actual, predicted, k):
    """
    Compute Average Precision@K.
    """
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def dcg_at_k(actual, predicted, k):
    """
    Compute Discounted Cumulative Gain@K.
    """
    dcg = 0.0
    for i, p in enumerate(predicted[:k]):
        if p in actual:
            dcg += 1 / np.log2(i + 2)
    return dcg

def idcg_at_k(actual, k):
    """
    Compute Ideal Discounted Cumulative Gain@K.
    """
    idcg = 0.0
    for i in range(min(len(actual), k)):
        idcg += 1 / np.log2(i + 2)
    return idcg

def ndcg_at_k(actual, predicted, k):
    """
    Compute Normalized Discounted Cumulative Gain@K.
    """
    dcg = dcg_at_k(actual, predicted, k)
    idcg = idcg_at_k(actual, k)
    return dcg / idcg if idcg > 0 else 0


In [14]:
def evaluate_recommendations(ground_truth_df, recommendation_func, k=10):
    """
    Evaluate the recommendation system using various metrics.
    
    Parameters:
        ground_truth_df (pd.DataFrame): DataFrame containing 'query' and 'relevant' columns.
        recommendation_func (function): Function that takes a query string and returns recommended datasets.
        k (int): Number of top recommendations to consider.
        
    Returns:
        pd.DataFrame: Evaluation results with metrics for each query.
    """
    evaluation_results = []
    
    for index, row in ground_truth_df.iterrows():
        query = row['query']
        true_relevant_ids = row['relevant']
        
        # Generate recommendations
        recommended_datasets = recommendation_func(query, top_k=k)
        
        if recommended_datasets.empty or 'id' not in recommended_datasets.columns:
            print(f"No recommendations returned for query: {query}")
            predicted_relevant_ids = []
        else:
            predicted_relevant_ids = recommended_datasets['id'].tolist()
        
        # Calculate metrics
        precision = precision_at_k(true_relevant_ids, predicted_relevant_ids, k)
        recall = recall_at_k(true_relevant_ids, predicted_relevant_ids, k)
        f1 = f1_score_at_k(true_relevant_ids, predicted_relevant_ids, k)
        ap = average_precision_at_k(true_relevant_ids, predicted_relevant_ids, k)
        ndcg = ndcg_at_k(true_relevant_ids, predicted_relevant_ids, k)
        
        # Store the results
        evaluation_results.append({
            'query': query,
            f'precision@{k}': precision,
            f'recall@{k}': recall,
            f'f1_score@{k}': f1,
            f'average_precision@{k}': ap,
            f'ndcg@{k}': ndcg
        })
    
    # Convert to DataFrame
    evaluation_df = pd.DataFrame(evaluation_results)
    
    # Display the evaluation results
    print("Evaluation Results:")
    display(evaluation_df)
    
    return evaluation_df


In [2]:
def evaluate_recommendations_rank_based(ground_truth_df, recommendation_func, k=10):
    """
    Evaluate the recommendation system using Precision@K, Recall@K, F1 Score@K, Average Precision@K, and NDCG@K.
    
    Parameters:
        ground_truth_df (pd.DataFrame): DataFrame containing 'query' and 'relevant' columns.
        recommendation_func (function): Function that takes a query string and returns recommended datasets.
        k (int): Number of top recommendations to consider.
        
    Returns:
        pd.DataFrame: Evaluation results with metrics for each query.
    """
    evaluation_results = []
    
    for index, row in ground_truth_df.iterrows():
        query = row['query']
        true_relevant_ids = row['relevant']
        
        # Generate recommendations using the provided recommendation function
        recommended_datasets = recommendation_func(query, top_k=k)
        
        # Check if recommended_datasets is not empty and has 'id' column
        if recommended_datasets.empty or 'id' not in recommended_datasets.columns:
            print(f"No recommendations returned for query: {query}")
            predicted_relevant_ids = []
        else:
            # Extract the recommended dataset IDs
            predicted_relevant_ids = recommended_datasets['id'].tolist()
        
        # Calculate all metrics
        precision = precision_at_k(true_relevant_ids, predicted_relevant_ids, k)
        recall = recall_at_k(true_relevant_ids, predicted_relevant_ids, k)
        f1 = f1_score_at_k(true_relevant_ids, predicted_relevant_ids, k)
        ap = average_precision_at_k(true_relevant_ids, predicted_relevant_ids, k)
        ndcg = ndcg_at_k(true_relevant_ids, predicted_relevant_ids, k)
        
        # Store the results
        evaluation_results.append({
            'query': query,
            f'precision@{k}': precision,
            f'recall@{k}': recall,
            f'f1_score@{k}': f1,
            f'average_precision@{k}': ap,
            f'ndcg@{k}': ndcg
        })
    
    # Convert the evaluation results to a DataFrame
    evaluation_df = pd.DataFrame(evaluation_results)
    
    # Display the evaluation results
    print("Evaluation Results:")
    display(evaluation_df)
    
    return evaluation_df


In [None]:
# # Set the desired value for K
# k = 10

# # Evaluate the Recommendation System
# evaluation_df = evaluate_recommendations(
#     ground_truth_df, 
#     recommend_datasets,  # This is your recommendation function
#     k=k
# )

# # Optionally, save the evaluation results
# evaluation_df.to_csv('evaluation_results.csv', index=False)


In [None]:
# ============================
# Import Necessary Libraries
# ============================

import pandas as pd
import numpy as np
import string

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ttest_rel

import matplotlib.pyplot as plt
import seaborn as sns

# ============================
# Load and Clean the Dataset
# ============================

# Path to your dataset CSV file
DATASET_PATH = '/Users/eliduba/Documents/GitHub/edmrec/backend/recommendations/testing/cleaned_for_experiment.csv'  # Replace with your actual file path

# Load the dataset
try:
    df = pd.read_csv(DATASET_PATH)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file '{DATASET_PATH}' was not found.")
    exit()
except pd.errors.EmptyDataError:
    print(f"Error: The file '{DATASET_PATH}' is empty.")
    exit()
except pd.errors.ParserError:
    print(f"Error: The file '{DATASET_PATH}' is malformed.")
    exit()

# Display the first few rows to verify
print("\nFirst few rows of the dataset:")
print(df.head())

# Handle missing data by filling with empty strings
df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')

# Remove duplicates based on 'id' to ensure uniqueness
df = df.drop_duplicates(subset=['id'], keep='first').copy()
print("\nDataset after removing duplicates:")
print(df.head())

# ============================
# Define Queries and Associated Keywords
# ============================

# Define the queries and associated keywords
queries = {
    'customer purchase behavior analysis': ['customer', 'purchase', 'behavior', 'analysis', 'buying patterns', 'consumer behavior'],
    'product pricing trends': ['product', 'pricing', 'price trends', 'cost analysis', 'market trends', 'price changes'],
    'inventory management optimization': ['inventory', 'management', 'optimization', 'stock control', 'supply management', 'inventory levels'],
    'e-commerce website traffic analysis': ['e-commerce', 'website', 'traffic', 'site visitors', 'web analytics', 'user traffic'],
    'online sales forecasting': ['online', 'sales', 'forecasting', 'prediction', 'future sales', 'sales trends'],
    'customer segmentation in e-commerce': ['customer', 'segmentation', 'e-commerce', 'grouping', 'market segments', 'consumer categories'],
    'digital marketing effectiveness': ['digital marketing', 'effectiveness', 'ROI', 'campaign performance', 'online advertising', 'marketing impact'],
    'product return rates': ['product', 'return rates', 'returns', 'customer returns', 'return analysis', 'refunds'],
    'seasonal sales patterns': ['seasonal', 'sales', 'patterns', 'seasonal trends', 'holiday sales', 'time-based sales'],
    'supply chain management in e-commerce': ['supply chain', 'management', 'e-commerce', 'logistics', 'distribution', 'supply network'],
    'customer loyalty programs': ['customer', 'loyalty', 'programs', 'loyalty rewards', 'customer retention', 'loyalty schemes'],
    'cross-selling strategies in e-commerce': ['cross-selling', 'strategies', 'e-commerce', 'upselling', 'additional sales', 'related products'],
    'social media impact on e-commerce sales': ['social media', 'impact', 'e-commerce', 'sales', 'influence', 'social commerce'],
    'e-commerce fraud detection': ['e-commerce', 'fraud', 'detection', 'fraud prevention', 'scam', 'online fraud'],
    'multi-channel retail strategy': ['multi-channel', 'retail', 'strategy', 'omnichannel', 'sales channels', 'customer touchpoints'],
    'personalized product recommendations': ['personalized', 'product', 'recommendations', 'customized', 'recommendation engine', 'suggested products'],
    'user experience design for e-commerce websites': ['user experience', 'design', 'e-commerce', 'websites', 'UX', 'interface design'],
    'impact of mobile commerce': ['impact', 'mobile commerce', 'm-commerce', 'mobile sales', 'smartphone shopping', 'mobile influence'],
    'data-driven decision making in e-commerce': ['data-driven', 'decision making', 'e-commerce', 'analytics', 'data analysis', 'business intelligence'],
    'customer churn prediction in e-commerce': ['customer churn', 'prediction', 'e-commerce', 'attrition', 'churn analysis', 'customer loss'],
}

# ============================
# Create Ground Truth Data
# ============================

# Create an empty list to store the ground truth
ground_truth = []

# Function to preprocess text (consistent with embeddings)
def preprocess_text(text: str) -> str:
    """
    Preprocess text by lowercasing. Extend this function as needed.
    
    Parameters:
        text (str): Text to preprocess.
    
    Returns:
        str: Preprocessed text.
    """
    return text.lower()

# Iterate over each query and its associated keywords to build ground truth
for query, keywords in queries.items():
    relevant_datasets = []
    
    # Iterate through all datasets to find relevant ones
    for idx, row in df.iterrows():
        title = row['title']
        description = row['description']
        dataset_id = row['id']
        
        # Preprocess title and description
        title_lower = preprocess_text(title)
        description_lower = preprocess_text(description)
        
        # Check if any keyword is in title or description
        if any(keyword.lower() in title_lower or keyword.lower() in description_lower for keyword in keywords):
            relevant_datasets.append(dataset_id)
    
    # Remove duplicates
    relevant_datasets = list(set(relevant_datasets))
    
    # Append to ground truth
    ground_truth.append({
        'query': query,
        'relevant': relevant_datasets
    })

# Convert ground truth to DataFrame
ground_truth_df = pd.DataFrame(ground_truth)

# Display the ground truth DataFrame
print("\nGround Truth DataFrame:")
print(ground_truth_df)

# ============================
# Load or Define Other System's Recommendations
# ============================

# Example: Load recommendations from a CSV or define them manually
# For demonstration, we'll define them manually as per your initial example
# Replace this with your actual other system's recommendations

other_system_recommendations = {
    'customer purchase behavior analysis': pd.DataFrame({
        'id': [305, 270, 304, 386, 857, 438, 1519, 286, 999, 888],
        'title': [
            "🛒 E-commerce Customer Data For Behavior Analysis",
            "Customer Purchases Behaviour Dataset",
            "E-commerce Customer Behavior Dataset",
            "Digital Marketing | E-Commerce | Customer Behavior",
            "E commerce product purchase data",
            "E-Commerce Customer Dataset",
            "Customer_buying_dataset",
            "E-commerce Customer Behavior Dataset",
            "Additional Dataset 1",
            "Additional Dataset 2"
        ],
        'description': [
            "Explore Customer Shopping Habits, Churn, and Purchase Patterns.",
            "Simulated Dataset of Customer Purchase Behavior.",
            "Synthetic Customer Behavior Dataset for E-commerce Analysis.",
            "Customer behavior data on e-commerce for churn analysis.",
            "Detailed product purchase data from various e-commerce platforms.",
            "E-Commerce Customer Purchase and Interaction Data.",
            "Customer Segmentation and Buying Patterns Analysis.",
            "Exploring Customer Engagement and Purchasing Patterns.",
            "Extra dataset description 1.",
            "Extra dataset description 2."
        ],
        'relevance_score': [0.655477, 0.639732, 0.616212, 0.561961, 0.523510, 0.498224, 0.489399, 0.468744, 0.450000, 0.440000]
    }),
    'product pricing trends': pd.DataFrame({
        'id': [1013, 14, 13, 1012, 1024],
        'title': [
            "e-Commerce (Walmart) Sales Dataset",
            "e-Commerce (Walmart) Sales Dataset",
            "E-commerce Customer Behavior Dataset",
            "E-commerce Customer Behavior Dataset",
            "Sales data of e commerce"
        ],
        'description': [
            "Customer Purchase Patterns and Demographics.",
            "Customer Purchase Patterns and Demographics.",
            "Exploring Customer Engagement and Purchasing Patterns.",
            "Exploring Customer Engagement and Purchasing Patterns.",
            "ECOMMERCE SALES DATASHEET"
        ],
        'relevance_score': [0.500000, 0.500000, 0.483099, 0.483099, 0.455674]
    }),
    # Add more query recommendations as needed
}

# ============================
# Define Evaluation Metrics
# ============================

def precision_at_k(actual, predicted, k):
    """
    Calculate Precision@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: Precision@K score.
    """
    predicted_at_k = predicted[:k]
    relevant = set(actual)
    recommended = set(predicted_at_k)
    return len(relevant & recommended) / k if k > 0 else 0

def recall_at_k(actual, predicted, k):
    """
    Calculate Recall@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: Recall@K score.
    """
    predicted_at_k = predicted[:k]
    relevant = set(actual)
    recommended = set(predicted_at_k)
    return len(relevant & recommended) / len(relevant) if len(relevant) > 0 else 0

def f1_score_at_k(actual, predicted, k):
    """
    Calculate F1 Score@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: F1 Score@K.
    """
    precision = precision_at_k(actual, predicted, k)
    recall = recall_at_k(actual, predicted, k)
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

def average_precision_at_k(actual, predicted, k):
    """
    Compute Average Precision@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: Average Precision@K.
    """
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def dcg_at_k(actual, predicted, k):
    """
    Compute Discounted Cumulative Gain@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: DCG@K.
    """
    dcg = 0.0
    for i, p in enumerate(predicted[:k]):
        if p in actual:
            dcg += 1 / np.log2(i + 2)
    return dcg

def idcg_at_k(actual, k):
    """
    Compute Ideal Discounted Cumulative Gain@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: IDCG@K.
    """
    idcg = 0.0
    for i in range(min(len(actual), k)):
        idcg += 1 / np.log2(i + 2)
    return idcg

def ndcg_at_k(actual, predicted, k):
    """
    Compute Normalized Discounted Cumulative Gain@K.
    
    Parameters:
        actual (list): List of relevant dataset IDs.
        predicted (list): List of recommended dataset IDs.
        k (int): Number of top recommendations to consider.
    
    Returns:
        float: NDCG@K.
    """
    dcg = dcg_at_k(actual, predicted, k)
    idcg = idcg_at_k(actual, k)
    return dcg / idcg if idcg > 0 else 0

# ============================
# Evaluate the Other Recommendation System
# ============================

def evaluate_other_system(ground_truth_df, other_system_recs, top_k=10):
    """
    Evaluate the other recommendation system against the ground truth.
    
    Parameters:
        ground_truth_df (pd.DataFrame): DataFrame with 'query' and 'relevant' columns.
        other_system_recs (dict): Dictionary with queries as keys and DataFrames of recommendations as values.
        top_k (int): Number of top recommendations to consider.
    
    Returns:
        pd.DataFrame: Evaluation metrics for each query.
    """
    evaluation_results = []
    
    for index, row in ground_truth_df.iterrows():
        query = row['query']
        true_relevant = row['relevant']
        
        # Get recommendations for the query
        if query in other_system_recs:
            recs_df = other_system_recs[query]
            recommended_ids = recs_df['id'].tolist()
        else:
            print(f"Warning: No recommendations found for query '{query}'.")
            recommended_ids = []
        
        # Calculate metrics
        prec = precision_at_k(true_relevant, recommended_ids, top_k)
        rec = recall_at_k(true_relevant, recommended_ids, top_k)
        f1 = f1_score_at_k(true_relevant, recommended_ids, top_k)
        ap = average_precision_at_k(true_relevant, recommended_ids, top_k)
        ndcg = ndcg_at_k(true_relevant, recommended_ids, top_k)
        
        # Append results
        evaluation_results.append({
            'query': query,
            'precision@{}'.format(top_k): prec,
            'recall@{}'.format(top_k): rec,
            'f1_score@{}'.format(top_k): f1,
            'average_precision@{}'.format(top_k): ap,
            'ndcg@{}'.format(top_k): ndcg
        })
    
    # Convert to DataFrame
    evaluation_df = pd.DataFrame(evaluation_results)
    
    return evaluation_df

# ============================
# Run the Evaluation
# ============================

# Perform evaluation
evaluation_df_other_system = evaluate_other_system(
    ground_truth_df=ground_truth_df,
    other_system_recs=other_system_recommendations,
    top_k=10
)

# Display Evaluation Results
print("\nEvaluation Results for Other System:")
print(evaluation_df_other_system)

# ============================
# Aggregate Metrics
# ============================

# # Calculate average metrics across all queries
# aggregated_metrics = evaluation_df_other_system.mean().to_frame().T
# aggregated_metrics.insert(0, 'system', 'Other System')

# print("\nAggregated Evaluation Metrics for Other System:")
# print(aggregated_metrics)

# # ============================
# # Visualize the Evaluation Metrics
# # ============================

# def visualize_metrics(evaluation_df, aggregated_metrics, system_name='Other System'):
#     """
#     Visualize the evaluation metrics.
    
#     Parameters:
#         evaluation_df (pd.DataFrame): DataFrame with per-query evaluation metrics.
#         aggregated_metrics (pd.DataFrame): DataFrame with aggregated metrics.
#         system_name (str): Name of the system being visualized.
#     """
#     # Melt the evaluation_df for easier plotting
#     melted_df = evaluation_df.melt(id_vars=['query'], 
#                                    value_vars=['precision@10', 'recall@10', 'f1_score@10', 'average_precision@10', 'ndcg@10'],
#                                    var_name='metric', 
#                                    value_name='score')
    
#     # Set plot style
#     sns.set(style="whitegrid")
    
#     # Create a bar plot for per-query metrics
#     plt.figure(figsize=(14, 8))
#     sns.barplot(x='metric', y='score', data=melted_df, palette='viridis')
#     plt.title(f'Evaluation Metrics for {system_name}')
#     plt.xlabel('Metric')
#     plt.ylabel('Score')
#     plt.ylim(0, 1)
#     plt.xticks(rotation=45)
#     plt.show()
    
#     # Create a bar plot for aggregated metrics
#     melted_agg = aggregated_metrics.melt(id_vars=['system'], 
#                                          value_vars=['precision@10', 'recall@10', 'f1_score@10', 'average_precision@10', 'ndcg@10'],
#                                          var_name='metric', 
#                                          value_name='score')
    
#     plt.figure(figsize=(14, 8))
#     sns.barplot(x='metric', y='score', data=melted_agg, palette='magma')
#     plt.title(f'Aggregated Evaluation Metrics for {system_name}')
#     plt.xlabel('Metric')
#     plt.ylabel('Average Score')
#     plt.ylim(0, 1)
#     plt.xticks(rotation=45)
#     plt.show()

# # Visualize the metrics for the other system
# visualize_metrics(evaluation_df_other_system, aggregated_metrics, system_name='Other System')

# ============================
# Optional: Statistical Significance Testing
# ============================

# If you have evaluation metrics from your system, you can perform a paired t-test to determine if the differences are statistically significant.
# Here, we'll outline the steps, but you'll need to provide your system's evaluation DataFrame.

# Example:
# Suppose you have your system's evaluation metrics in `evaluation_df_your_system`
# Ensure that both DataFrames have the same queries in the same order.

# Uncomment and modify the following code as needed.

# # Example: Define your system's evaluation DataFrame
# evaluation_df_your_system = pd.DataFrame([
#     {'query': 'customer purchase behavior analysis', 'precision@10': 0.80, 'recall@10': 0.60, 'f1_score@10': 0.69, 'average_precision@10': 0.70, 'ndcg@10': 0.75},
#     {'query': 'product pricing trends', 'precision@10': 0.65, 'recall@10': 0.50, 'f1_score@10': 0.57, 'average_precision@10': 0.60, 'ndcg@10': 0.65},
#     # Add more queries
# ])

# # Merge the two evaluation DataFrames on 'query'
# merged_df = pd.merge(
#     evaluation_df_your_system[['query', 'f1_score@10']],
#     evaluation_df_other_system[['query', 'f1_score@10']],
#     on='query',
#     suffixes=('_your_system', '_other_system')
# )

# # Perform paired t-test on F1 scores
# t_stat, p_value = ttest_rel(merged_df['f1_score@10_your_system'], merged_df['f1_score@10_other_system'])

# print(f"\nPaired t-test results: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")

# if p_value < 0.05:
#     print("The difference in F1 scores is statistically significant.")
# else:
#     print("The difference in F1 scores is not statistically significant.")


Dataset loaded successfully.

First few rows of the dataset:
     id                                       title  \
0     1                             E-Commerce Data   
1    10  Sales of summer clothes in E-commerce Wish   
2  1000         AV JanataHack Cross-Sell Prediction   
3  1000                             E-Commerce Data   
4  1000                             E-Commerce Data   

                                         description  \
0               Actual transactions from UK retailer   
1  Top products with ratings and sales performanc...   
2                             Janata Hack Cross-Sell   
3               Actual transactions from UK retailer   
4               Actual transactions from UK retailer   

                                                 url   size format  \
0      https://www.kaggle.com/carrie1/ecommerce-data    7MB    csv   
1  https://www.kaggle.com/jmmvutu/summer-products...  406KB    csv   
2  https://www.kaggle.com/jinxzed/av-janatahack-c...    6MB  

TypeError: Could not convert ['customer purchase behavior analysisproduct pricing trendsinventory management optimizatione-commerce website traffic analysisonline sales forecastingcustomer segmentation in e-commercedigital marketing effectivenessproduct return ratesseasonal sales patternssupply chain management in e-commercecustomer loyalty programscross-selling strategies in e-commercesocial media impact on e-commerce salese-commerce fraud detectionmulti-channel retail strategypersonalized product recommendationsuser experience design for e-commerce websitesimpact of mobile commercedata-driven decision making in e-commercecustomer churn prediction in e-commerce'] to numeric