# Task 1

In [None]:
# Imports:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Read the G2 product overview export and preview its first rows
csv_path = "../data/G2 software product overview.csv"
csv = pd.read_csv(csv_path)
df = pd.DataFrame(csv)  # the frame the following cells work from
df.head()

In [None]:
# Show the column labels of the loaded frame
df.columns

### Data Cleaning

In [None]:
def relevant_attributes(df):
    """Return only the columns this notebook works with.

    The result holds 'seller', 'product_name', 'Features', 'categories',
    'rating' and 'main_category', in that order.
    """
    kept_columns = [
        'seller',
        'product_name',
        'Features',
        'categories',
        'rating',
        'main_category',
    ]
    return df.loc[:, kept_columns]

# Project a copy of the raw frame onto the relevant columns, then preview it
vendors_data = df.copy().pipe(relevant_attributes)
vendors_data.head()

In [None]:
def clean_data(vendors_data):
    """Return `vendors_data` without the rows that have no 'Features' value."""
    required_columns = ['Features']
    return vendors_data.dropna(subset=required_columns)

# Drop the rows without features from a copy of the selected columns, then preview
vendors_data_clean = vendors_data.copy().pipe(clean_data)
vendors_data_clean.head()

In [None]:
# Decode the JSON-encoded 'Features' and 'categories' columns into Python objects
def _parse_json_cell(value):
    """Decode a JSON string; return any other value unchanged.

    Passing non-strings through keeps this cell safe to re-run (values that
    were already decoded stay as they are) and stops a missing value (NaN)
    from raising TypeError inside json.loads.
    """
    if isinstance(value, (str, bytes, bytearray)):
        return json.loads(value)
    return value


vendors_data_clean["Features"] = vendors_data_clean["Features"].apply(_parse_json_cell)
vendors_data_clean["categories"] = vendors_data_clean["categories"].apply(_parse_json_cell)

In [None]:
# Collect the feature names of every row: each 'Features' value is a list of
# groups, and every group keeps its own entries under the 'features' key.
extracted_features = [
    [entry['name'] for group in feature_groups for entry in group['features']]
    for feature_groups in vendors_data_clean["Features"]
]
vendors_data_clean["feature_names"] = extracted_features


def _join_categories(value):
    """Join a list of categories with spaces; anything that is not a list gives ''."""
    if isinstance(value, list):
        return " ".join(value)
    return ""


# Flatten each list of categories into a single space-separated string
vendors_data_clean["categories_text"] = vendors_data_clean["categories"].apply(_join_categories)

vendors_data_clean

### Generate Embeddings

In [147]:
# Load a pre-trained sentence embedding model ("all-MiniLM-L6-v2").
# `model` is a module-level name that get_embedding() reads.
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text):
    """Encode `text` with the module-level sentence transformer `model`.

    Returns the result of `model.encode` called with `convert_to_tensor=True`.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

def _embed_each(feature_names):
    """Return one embedding per entry of `feature_names`, in the same order."""
    return [get_embedding(name) for name in feature_names]


# Embed both category texts of every row ...
for source_column, embedding_column in (
    ("main_category", "main_category_embedding"),
    ("categories_text", "categories_text_embedding"),
):
    vendors_data_clean[embedding_column] = vendors_data_clean[source_column].apply(get_embedding)

# ... and each feature name individually
vendors_data_clean["feature_embeddings"] = vendors_data_clean["feature_names"].apply(_embed_each)



In [178]:
def compute_feature_similarities(capability_embeddings, vendor_feature_embeddings):
    """Compute pairwise similarity scores between user capabilities and vendor features.

    Args:
        capability_embeddings: list of equally-shaped embedding tensors, one
            per requested capability.
        vendor_feature_embeddings: list of equally-shaped embedding tensors,
            one per vendor feature.

    Returns:
        A list of lists in which row i holds the cosine similarity of
        capability i to each vendor feature, in order. An empty list is
        returned when either input is empty.
    """
    # torch.stack raises on an empty sequence, so both sides are checked
    # before stacking. With no capabilities the score matrix has zero rows,
    # which is the same empty list returned when there are no features.
    if len(capability_embeddings) == 0 or not vendor_feature_embeddings:
        return []

    capability_matrix = torch.stack(capability_embeddings)
    feature_matrix = torch.stack(vendor_feature_embeddings)
    similarity_matrix = util.pytorch_cos_sim(capability_matrix, feature_matrix)

    return similarity_matrix.tolist()  # Keeping as list of lists for now

def compute_similarity(input_embedding, vendor_embeddings):
    """
    Compute cosine similarity between software_category and (main_category + categories_text)

    Args:
        input_embedding: embedding tensor of the query.
        vendor_embeddings: tensor holding the vendor embeddings to compare against.

    Returns:
        The squeezed similarity scores converted with `.tolist()`. A result
        that squeezes down to a single value is returned as a one-element
        list rather than a bare float.
    """
    similarity_scores = util.pytorch_cos_sim(input_embedding, vendor_embeddings).squeeze()
    if similarity_scores.dim() == 0:
        # One query against one vendor embedding squeezes to a 0-d tensor,
        # whose .tolist() is a plain float. Restore one dimension so callers
        # can still iterate over the result or take max() of it.
        similarity_scores = similarity_scores.unsqueeze(0)
    return similarity_scores.tolist()

# Example query: the software category and the capabilities being asked for
software_category = "Project Management Software"
capabilities = ["Task Scheduling", "Time Tracking"]

# Embed the query once up front
software_category_embedding = get_embedding(software_category)  # single tensor
capability_embeddings = list(map(get_embedding, capabilities))  # list of tensors


def _best_category_match(row):
    """Return the larger of the query's similarities to the row's two category embeddings."""
    candidate_embeddings = torch.stack(
        [row["main_category_embedding"], row["categories_text_embedding"]]
    )
    return max(compute_similarity(software_category_embedding, candidate_embeddings))


# Category similarity: best match against main_category or categories_text
vendors_data_clean["category_similarity"] = vendors_data_clean.apply(_best_category_match, axis=1)

# Feature similarity: one capability-by-feature score matrix (list of lists) per vendor
vendors_data_clean["feature_similarities"] = vendors_data_clean["feature_embeddings"].apply(
    lambda vendor_embeddings: compute_feature_similarities(capability_embeddings, vendor_embeddings)
)


In [196]:

def _has_strong_match(scores):
    """Return True when any value in the nested score lists is 0.6 or higher."""
    for row in scores:
        for score in row:
            if score >= 0.6:
                return True
    return False


# Keep the vendors with at least one capability/feature score of 0.6 or more
strong_match_mask = vendors_data_clean["feature_similarities"].apply(_has_strong_match)
filtered_vendors = vendors_data_clean[strong_match_mask].copy()

In [198]:
# (rows, columns) of the frame left after the similarity filter
filtered_vendors.shape

(427, 16)

In [201]:
# Ranking parameters, named once so they can be tuned in a single place
SIMILARITY_THRESHOLD = 0.6  # minimum capability/feature score for a vendor to qualify
FEATURE_WEIGHT = 0.7        # share of the final score taken from feature similarity
RATING_WEIGHT = 0.3         # share of the final score taken from the normalized rating
TOP_N = 10                  # maximum number of vendors to print


def _flatten_scores(scores):
    """Flatten the nested capability-by-feature score lists into one flat list."""
    return [score for row in scores for score in row]


def _mean_similarity(scores):
    """Return the plain average of all scores, or 0 when there are none to average."""
    flat_scores = _flatten_scores(scores)
    return sum(flat_scores) / len(flat_scores) if flat_scores else 0


# Step 1: Keep vendors with at least one feature similarity score >= SIMILARITY_THRESHOLD
filtered_vendors = vendors_data_clean[
    vendors_data_clean["feature_similarities"].apply(
        lambda scores: any(score >= SIMILARITY_THRESHOLD for score in _flatten_scores(scores))
    )
].copy()  # Copy to avoid modifying the original DataFrame

# Step 2: Rank on a separate copy so filtered_vendors itself stays unchanged
ranked_vendors = filtered_vendors.copy()

# Step 3: Average similarity per vendor. The column keeps its historical name
# "weighted_feature_similarity", but the value is an unweighted mean over
# every capability/feature pair.
ranked_vendors["weighted_feature_similarity"] = (
    ranked_vendors["feature_similarities"].apply(_mean_similarity)
)

# Step 4: Min-max normalize vendor ratings to the range 0..1.
# When the ratings do not span a range (all equal, all missing, or no rows at
# all) the comparison below is False and every vendor gets 0, which also
# avoids a division by zero.
rating = ranked_vendors["rating"]
min_rating = rating.min()
max_rating = rating.max()
if max_rating > min_rating:
    ranked_vendors["normalized_rating"] = (rating - min_rating) / (max_rating - min_rating)
else:
    ranked_vendors["normalized_rating"] = 0.0

# Step 5: Final ranking score (FEATURE_WEIGHT feature similarity + RATING_WEIGHT rating)
ranked_vendors["final_score"] = (
    FEATURE_WEIGHT * ranked_vendors["weighted_feature_similarity"]
    + RATING_WEIGHT * ranked_vendors["normalized_rating"]
)

# Step 6: Sort vendors by final ranking score (descending order)
ranked_vendors = ranked_vendors.sort_values(by="final_score", ascending=False)

# Step 7: Select relevant columns for output
top_vendors = ranked_vendors[["seller", "final_score", "weighted_feature_similarity", "category_similarity", "rating"]]

# Step 8: Display top vendors (only if they exist)
if not top_vendors.empty:
    print(top_vendors.head(TOP_N))  # head() already copes with fewer than TOP_N rows
else:
    print("No vendors met the similarity threshold.")


                     seller  final_score  weighted_feature_similarity  \
916                     QAD     0.525779                     0.331113   
174                 CAST AI     0.472296                     0.263279   
988                 Fullbay     0.468945                     0.249921   
363                  Intuit     0.465330                     0.279043   
894           Take 44, Inc.     0.460226                     0.246037   
765  Willo Technologies Ltd     0.458308                     0.243297   
683                AlignOps     0.458302                     0.243288   
334              Pocketstop     0.457726                     0.242465   
535                  Deputy     0.456907                     0.258438   
873      Contractor Foreman     0.456373                     0.266247   

     category_similarity  rating  
916             0.288011     4.9  
174             0.413441     4.8  
988             0.382038     4.9  
363             0.393246     4.5  
894             0.382

In [203]:
# Feature names of the row with index label 916
vendors_data_clean.loc[916, 'feature_names']

['Templates',
 'Workflows',
 'Procedures',
 'Planning',
 'Scheduling',
 'Collaboration',
 'Monitoring',
 'KPIs',
 'Optimization']