# Task 1

In [152]:
# Imports:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Load the G2 product overview CSV into a DataFrame.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is
# needed, and no temporary named `csv` (which shadows the stdlib module name).
df = pd.read_csv("../data/G2 software product overview.csv")
df.head()

Unnamed: 0,url,product_name,rating,description,product_url,seller,ownership,seller_website,headquarters,total_revenue,...,full_pricing_page,badge,what_is_description,main_category,main_subject,Features,region,country_code,software_product_id,overview_provided_by
0,https://www.g2.com/products/newforma-project-c...,Newforma Project Center,4.0,Newforma PIM solution an integrated solution f...,https://www.newforma.com/newforma-project-center/,Newforma,,https://www.newforma.com/,"Manchester, NH",,...,https://www.g2.com/products/newforma-project-c...,https://images.g2crowd.com/uploads/report_meda...,,Construction Software,Home>Construction Software>Construction Projec...,"[{""Category"":""Library"",""features"":[{""descripti...",,US,newforma-project-center,Henry Auger
1,https://www.g2.com/products/nitro-pro/reviews,Nitro Pro,4.3,Nitro deliver trusted PDF & eSign software for...,https://www.gonitro.com/pricing,"Nitro, Inc",,https://www.gonitro.com/,"San Francisco, CA",,...,https://www.g2.com/products/nitro-pro/pricing,https://images.g2crowd.com/uploads/report_meda...,,Document Creation Software,Home>Document Creation Software>Nitro Pro>Nitr...,"[{""Category"":""Platform"",""features"":[{""descript...",,US,nitro-pro,Jaclyn Core
2,https://www.g2.com/products/netmera/reviews,Netmera,4.2,"Netmera enables marketers to create, schedule,...",https://www.netmera.com/mobile-marketing-autom...,Netmera,,https://netmera.com/,"İstanbul, TR",,...,https://www.g2.com/products/netmera/pricing,https://images.g2crowd.com/uploads/report_meda...,,Mobile Marketing Software,Home>Mobile Marketing Software>Netmera>Netmera...,"[{""Category"":""Integration"",""features"":[{""descr...",AS,TR,netmera,Irem BaylanNetmera şirketinde Product Marketin...
3,https://www.g2.com/products/netlify/reviews,Netlify,4.5,Netlify provides a full-featured CDN hosting s...,https://www.netlify.com/features/,Netlify,,https://www.netlify.com/,"San Francisco, CA",,...,https://www.g2.com/products/netlify/pricing,https://images.g2crowd.com/uploads/report_meda...,,WebOps Platforms,Home>WebOps Platforms>Netlify>Netlify Reviews,"[{""Category"":""Content"",""features"":[{""descripti...",,US,netlify,Lisa Kretsch
4,https://www.g2.com/products/openbuildings-desi...,OpenBuildings Designer,4.3,OpenBuildings Designer is a single building in...,https://www.g2.com/products/openbuildings-desi...,Bentley Systems,NASDAQ: BSY,https://www.bentley.com/,"Exton, PA",,...,https://www.g2.com/products/openbuildings-desi...,,,CAD Software,Home>CAD Software>Building Design and Building...,"[{""Category"":""Design"",""features"":[{""descriptio...",,US,openbuildings-designer,Prathamesh Gawde


### Data Cleaning

In [130]:
def relevant_attributes(df):
    """Return only the columns used for vendor matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least 'product_name', 'Features', 'rating'
        and 'main_category'; a missing column raises KeyError.

    Returns
    -------
    pandas.DataFrame
        The four columns above, in that order, for every row of `df`.
    """
    keep_columns = ['product_name', 'Features', 'rating', 'main_category']
    return df.loc[:, keep_columns]

# Reduce the raw frame to the matching-relevant columns; the function is handed
# a copy of `df`, so the raw frame itself is not passed along.
vendors_data = relevant_attributes(df.copy())
vendors_data.head()

Unnamed: 0,product_name,Features,rating,main_category
0,Newforma Project Center,"[{'Category': 'Library', 'features': [{'descri...",4.0,Construction Software
1,Nitro Pro,"[{'Category': 'Platform', 'features': [{'descr...",4.3,Document Creation Software
2,Netmera,"[{'Category': 'Integration', 'features': [{'de...",4.2,Mobile Marketing Software
3,Netlify,"[{'Category': 'Content', 'features': [{'descri...",4.5,WebOps Platforms
4,OpenBuildings Designer,"[{'Category': 'Design', 'features': [{'descrip...",4.3,CAD Software


In [132]:
def clean_data(vendors_data):
    """Drop vendors that have no feature information.

    Parameters
    ----------
    vendors_data : pandas.DataFrame
        Frame with a 'Features' column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows whose 'Features' value is missing;
        missing values in other columns are left alone and the original
        index labels are preserved.
    """
    required_columns = ['Features']
    return vendors_data.dropna(subset=required_columns)

# Remove vendors without a 'Features' value; clean_data is handed a copy of
# `vendors_data`, so that frame is not passed along directly.
vendors_data_clean = clean_data(vendors_data.copy())
vendors_data_clean.head()

Unnamed: 0,product_name,Features,rating,main_category
0,Newforma Project Center,"[{'Category': 'Library', 'features': [{'descri...",4.0,Construction Software
1,Nitro Pro,"[{'Category': 'Platform', 'features': [{'descr...",4.3,Document Creation Software
2,Netmera,"[{'Category': 'Integration', 'features': [{'de...",4.2,Mobile Marketing Software
3,Netlify,"[{'Category': 'Content', 'features': [{'descri...",4.5,WebOps Platforms
4,OpenBuildings Designer,"[{'Category': 'Design', 'features': [{'descrip...",4.3,CAD Software


In [133]:
# Extract Features
def _feature_names(raw_features):
    """Return the flat list of feature names for one vendor.

    `raw_features` is either the JSON text as read from the CSV or the
    already-parsed structure: a list of category groups, each holding a
    'features' list whose items carry a 'name'.
    """
    # Straight from pd.read_csv the column holds JSON *strings*; iterating a
    # string would walk its characters and fail, so parse it first.
    if isinstance(raw_features, str):
        raw_features = json.loads(raw_features)
    return [
        feature['name']
        for group in raw_features
        for feature in group['features']
    ]


# One list of feature names per vendor, in row order
extracted_features = [
    _feature_names(raw_features)
    for raw_features in vendors_data_clean["Features"]
]

# Convert the list of features into a concatenated string
vendors_data_clean["Features"] = [" ".join(names) for names in extracted_features]

    

In [134]:
# Inspect the frame after 'Features' was replaced by space-joined feature names
vendors_data_clean

Unnamed: 0,product_name,Features,rating,main_category
0,Newforma Project Center,Objects Materials Textures Shading Lighting Ex...,4.0,Construction Software
1,Nitro Pro,"Custom Branding User, Role, and Access Managem...",4.3,Document Creation Software
2,Netmera,Data Import & Export Tools Integration APIs Br...,4.2,Mobile Marketing Software
3,Netlify,Static Content Caching Dynamic Content Routing...,4.5,WebOps Platforms
4,OpenBuildings Designer,Visualizing Rendering Drawing Editing Sequence...,4.3,CAD Software
...,...,...,...,...
995,Securiti,Data Modelling Recommendations Workflow Manage...,4.8,Data Privacy Management Software
996,SentinelOne Singularity,Issue Tracking Detection Rate False Positives ...,4.7,Endpoint Protection Software
997,Semrush,Social Analytics Social Publishing Social Enga...,4.5,SEO Tools
998,SAP Business ByDesign,Journal Entries Tags / Dimensions Audit Trail ...,4.0,ERP Systems


In [135]:
# Build one text blob per vendor: "<main_category> <product_name> <Features>"
vendors_data_clean['all_features_text'] = [
    f"{category} {product} {features}".strip()
    for category, product, features in zip(
        vendors_data_clean['main_category'],
        vendors_data_clean['product_name'],
        vendors_data_clean['Features'],
    )
]

vendors_data_clean.head()

Unnamed: 0,product_name,Features,rating,main_category,all_features_text
0,Newforma Project Center,Objects Materials Textures Shading Lighting Ex...,4.0,Construction Software,Construction Software Newforma Project Center ...
1,Nitro Pro,"Custom Branding User, Role, and Access Managem...",4.3,Document Creation Software,Document Creation Software Nitro Pro Custom Br...
2,Netmera,Data Import & Export Tools Integration APIs Br...,4.2,Mobile Marketing Software,Mobile Marketing Software Netmera Data Import ...
3,Netlify,Static Content Caching Dynamic Content Routing...,4.5,WebOps Platforms,WebOps Platforms Netlify Static Content Cachin...
4,OpenBuildings Designer,Visualizing Rendering Drawing Editing Sequence...,4.3,CAD Software,CAD Software OpenBuildings Designer Visualizin...


In [151]:
# Describe what the user is looking for
user_query = {
    "software_category": "Accounting & Finance Software",
    "capabilities": ["Budgeting"]
}

# Flatten the query into a single string: capabilities first, then the category.
# Capabilities that are not given as a list contribute an empty string.
capabilities = user_query['capabilities']
capabilities_text = ' '.join(capabilities) if isinstance(capabilities, list) else ''
input_text = f"{capabilities_text} {user_query['software_category']}"
input_text

'Budgeting Accounting & Finance Software'

In [153]:
# Pre-trained sentence-embedding model used for both vendors and the query
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every vendor text blob and the single user query
vendor_texts = vendors_data_clean['all_features_text'].tolist()
vendor_embeddings = model.encode(vendor_texts)
user_query_embedding = model.encode([input_text])

# Cosine similarity of the query against all vendors, flattened to a 1-D array
similarity_matrix = cosine_similarity(user_query_embedding, vendor_embeddings)
similarity_scores = similarity_matrix.flatten()


In [154]:
# # Initialize TF-IDF Vectorizer
# vectorizer = TfidfVectorizer()

# # Combine all vendor features into a list for vectorization
# vendor_texts = vendors_data_clean['all_features_text'].tolist()

# # Fit and transform the vendor features
# vendor_vectors = vectorizer.fit_transform(vendor_texts)

# # Vectorize the user query
# user_query_vector = vectorizer.transform([input_text])

In [155]:
# Calculate the cosine similarity between user query and vendor features
# similarity_scores = cosine_similarity(user_query_vector, vendor_vectors).flatten()

In [156]:
# Add similarity scores to the DataFrame (one score per vendor row)
vendors_data_clean['similarity_score'] = similarity_scores

# Keep only vendors whose text is similar enough to the user query
SIMILARITY_THRESHOLD = 0.6
is_match = vendors_data_clean['similarity_score'] >= SIMILARITY_THRESHOLD

# Take an explicit copy: adding a column to a boolean-mask slice is what raises
# pandas' SettingWithCopyWarning.
filtered_vendors = vendors_data_clean.loc[is_match].copy()

# Weight the similarity by the product rating; a missing rating yields NaN
filtered_vendors['weighted_score'] = filtered_vendors['similarity_score'] * filtered_vendors['rating']

# Sort by the weighted score in descending order
ranked_vendors = filtered_vendors.sort_values(by='weighted_score', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_vendors['weighted_score'] = filtered_vendors['similarity_score'] * filtered_vendors['rating']


In [None]:
# Inspect all vendors together with their similarity_score column
vendors_data_clean

Unnamed: 0,product_name,Features,rating,main_category,all_features_text,similarity_score
0,Newforma Project Center,Objects Materials Textures Shading Lighting Ex...,4.0,Construction Software,Construction Software Newforma Project Center ...,0.364281
1,Nitro Pro,"Custom Branding User, Role, and Access Managem...",4.3,Document Creation Software,Document Creation Software Nitro Pro Custom Br...,0.342178
2,Netmera,Data Import & Export Tools Integration APIs Br...,4.2,Mobile Marketing Software,Mobile Marketing Software Netmera Data Import ...,0.264292
3,Netlify,Static Content Caching Dynamic Content Routing...,4.5,WebOps Platforms,WebOps Platforms Netlify Static Content Cachin...,0.174473
4,OpenBuildings Designer,Visualizing Rendering Drawing Editing Sequence...,4.3,CAD Software,CAD Software OpenBuildings Designer Visualizin...,0.248202
...,...,...,...,...,...,...
995,Securiti,Data Modelling Recommendations Workflow Manage...,4.8,Data Privacy Management Software,Data Privacy Management Software Securiti Data...,0.229752
996,SentinelOne Singularity,Issue Tracking Detection Rate False Positives ...,4.7,Endpoint Protection Software,Endpoint Protection Software SentinelOne Singu...,0.198783
997,Semrush,Social Analytics Social Publishing Social Enga...,4.5,SEO Tools,SEO Tools Semrush Social Analytics Social Publ...,0.224864
998,SAP Business ByDesign,Journal Entries Tags / Dimensions Audit Trail ...,4.0,ERP Systems,ERP Systems SAP Business ByDesign Journal Entr...,0.352138


In [158]:

# Show the ranked vendors with the scores that produced the ranking
print(ranked_vendors[['product_name', 'similarity_score', 'rating', 'weighted_score']])


                product_name  similarity_score  rating  weighted_score
685                   Budgyt          0.625701     4.8        3.003367
399  OneStream - Unified EPM          0.638680     4.6        2.937926
589              Mosaic Tech          0.619359     4.7        2.910986
166                   Causal          0.619076     4.6        2.847749
934                    Jirav          0.602885     4.7        2.833558
784                  onPhase          0.605027     4.5        2.722621
