In [None]:
# Mount Google Drive at /content/drive so the CSV files read later (under
# /content/drive/MyDrive/...) are reachable.
# NOTE(review): `google.colab` is presumably only available inside a Colab runtime;
# outside Colab this cell would need to be skipped — confirm.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Adding Necessary Imports

In [None]:
import re
import string

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tabulate import tabulate


Loading Data --> Renaming Columns  --> Merging content data with labels data

In [None]:
def load_data(data_dir="/content/drive/MyDrive/Home_depo"):
    """Load the content, labels and test CSVs and build the labeled training frame.

    Args:
        data_dir: Directory containing ``content_data_MASTER.csv``,
            ``labels_MASTER.csv`` and ``test_MASTER.csv``. Defaults to the
            Google Drive folder the notebook was written against.

    Returns:
        Tuple ``(content_df, labels_df, test_df)`` where ``labels_df`` has the
        columns ``searchTerm``, ``slug``, ``label`` plus the columns merged in
        from ``content_df`` (including ``title``), with rows lacking a title
        removed, exact duplicate rows dropped and a fresh 0..n-1 index.
    """
    content_df = pd.read_csv(f"{data_dir}/content_data_MASTER.csv")  # Contains 'title', 'slug'
    labels_df = pd.read_csv(f"{data_dir}/labels_MASTER.csv")         # Has no real header row
    test_df = pd.read_csv(f"{data_dir}/test_MASTER.csv")             # Contains 'searchTerm'

    # The labels file ships without a header, so pandas consumed its first
    # labeled example as the column names. Map those values to real names...
    header_to_name = {
        '#4 #14 connector': 'searchTerm',
        'types-of-pipe-fittings': 'slug',
        'RELEVANT': 'label',
    }
    # ...and put that swallowed example back as a data row so it is not lost.
    # Guarded so a file that already has a proper header is left untouched.
    if set(header_to_name).issubset(labels_df.columns):
        swallowed_row = pd.DataFrame([list(labels_df.columns)], columns=labels_df.columns)
        labels_df = pd.concat([swallowed_row, labels_df], ignore_index=True)
    labels_df = labels_df.rename(columns=header_to_name)

    # Attach titles via the slug, then keep only labeled rows that found a
    # title, without exact duplicates, on a clean index.
    labels_df = (
        labels_df
        .merge(content_df, on="slug", how="left")
        .dropna(subset=["title"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return content_df, labels_df, test_df


Preprocess text by lowercasing, removing punctuation, and extra spaces.

In [None]:
def preprocess_text(text):
    """Normalize text: lowercase, strip ASCII punctuation, collapse whitespace.

    Args:
        text: Any value; non-strings are converted with ``str()``. ``None``
            and float NaN (how pandas represents a missing cell) are treated
            as empty text instead of becoming the words "none"/"nan".

    Returns:
        The cleaned string (possibly empty).
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Escape the punctuation before placing it in a character class: unescaped,
    # the backslash in string.punctuation escapes the following ']' and is
    # itself never removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()                       # Remove extra spaces
    return text

Apply text preprocessing to search terms and titles.

In [None]:
def preprocess_data(labels_df):
    """Add cleaned copies of the search term, title and slug columns.

    Each source column is run through ``preprocess_text`` and stored under a
    ``processed_*`` name. The frame is modified in place and also returned.
    """
    column_pairs = (
        ("searchTerm", "processed_search"),
        ("title", "processed_title"),
        ("slug", "processed_slug"),
    )
    for raw_column, cleaned_column in column_pairs:
        labels_df[cleaned_column] = labels_df[raw_column].apply(preprocess_text)
    return labels_df

Fit a TF-IDF vectorizer on the processed titles and encode them with the Hugging Face Sentence Transformer all-MiniLM-L6-v2

In [None]:
def fit_tfidf_and_embeddings(labels_df, model_name="all-MiniLM-L6-v2"):
    """Fit a TF-IDF vectorizer and compute sentence embeddings for the titles.

    Both representations are built from ``labels_df["processed_title"]``, so
    row ``i`` of each result corresponds to row ``i`` of ``labels_df``.

    Args:
        labels_df: Frame with a ``processed_title`` column.
        model_name: Name passed to ``SentenceTransformer``; defaults to the
            model the notebook was developed with.

    Returns:
        Tuple ``(tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings)``:
        the fitted vectorizer, the TF-IDF matrix of the titles, the loaded
        sentence-transformer model and the title embeddings as a NumPy array.
    """
    titles = labels_df["processed_title"]

    # Requires `from sklearn.feature_extraction.text import TfidfVectorizer`
    # in the notebook's import cell.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)

    model = SentenceTransformer(model_name)
    sentence_embeddings = model.encode(titles.tolist(), convert_to_numpy=True)

    return tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings


Compute hybrid cosine-similarity scores from the TF-IDF and Sentence Transformer vectors

In [None]:
def compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings,
                       tfidf_weight=0.2, embedding_weight=0.7):
    """Score one search term against every title with a weighted hybrid similarity.

    Args:
        search_term: Raw query text; it is cleaned with ``preprocess_text``.
        tfidf_vectorizer: Fitted vectorizer used to transform the query.
        tfidf_matrix: TF-IDF matrix of the titles (one row per title).
        model: Sentence-embedding model exposing ``encode``.
        sentence_embeddings: Title embeddings (one row per title).
        tfidf_weight: Weight of the TF-IDF cosine similarity.
        embedding_weight: Weight of the embedding cosine similarity.

    Returns:
        1-D NumPy array with one hybrid score per title.

    Note:
        The default weights are kept as originally written and sum to 0.9,
        not 1.0, so any threshold tuned on these scores is tied to that
        scale. Pass explicit weights to rebalance.
    """
    search_term_processed = preprocess_text(search_term)

    # Lexical similarity: cosine between the query's TF-IDF vector and each title.
    search_tfidf = tfidf_vectorizer.transform([search_term_processed])
    tfidf_similarities = cosine_similarity(search_tfidf, tfidf_matrix).flatten()

    # Semantic similarity: cosine between the query embedding and each title embedding.
    search_embedding = model.encode([search_term_processed], convert_to_numpy=True)
    embedding_similarities = cosine_similarity(search_embedding, sentence_embeddings).flatten()

    # Hybrid score (weighted combination of the two similarities).
    hybrid_scores = tfidf_weight * tfidf_similarities + embedding_weight * embedding_similarities
    return hybrid_scores


Find the best threshold for classifying search terms as RELEVANT or NOT RELEVANT.

In [None]:
def find_best_threshold(labels_df, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings):
    """Pick the score threshold that maximizes macro F1 on a held-out split.

    A search term is predicted RELEVANT when its best hybrid similarity to any
    title is strictly greater than the threshold. Thresholds 0.00, 0.01, ...,
    1.00 are tried; on ties the lowest threshold wins.

    Args:
        labels_df: Frame with ``processed_search`` and ``label`` columns, where
            labels are "RELEVANT" or "NOT RELEVANT".
        tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings: Fitted
            artifacts forwarded to ``compute_similarity``.

    Returns:
        Tuple ``(best_threshold, best_f1)``.

    Raises:
        KeyError: If a label other than "RELEVANT"/"NOT RELEVANT" is present.

    NOTE(review): the threshold is tuned on the same 20% split (same
    ``test_size`` and ``random_state``) that ``evaluate_model`` reports on, so
    the reported metrics are optimistic.
    """
    thresholds = np.linspace(0, 1, 101)

    # Split labeled data for evaluation; only the held-out part is used here.
    _, X_test, _, y_test = train_test_split(
        labels_df["processed_search"], labels_df["label"], test_size=0.2, random_state=42
    )

    # Mapping for binary evaluation
    label_mapping = {"NOT RELEVANT": 0, "RELEVANT": 1}
    y_test_num = np.array([label_mapping[label] for label in y_test])

    # The similarity scores do not depend on the threshold, so compute each
    # term's best score once instead of re-encoding it for every threshold.
    max_scores = np.array([
        np.max(compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings))
        for search_term in X_test
    ])

    # Start below any achievable F1 so a threshold is always returned, even
    # when every candidate scores 0.
    best_threshold = None
    best_f1 = -1.0
    for t in thresholds:
        y_pred_num = (max_scores > t).astype(int)
        f1 = f1_score(y_test_num, y_pred_num, average="macro")
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t

    return best_threshold, best_f1

**Model Evaluation**

In [None]:
def evaluate_model(labels_df, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings, threshold):
    """Print a classification report for the held-out 20% of the labeled data.

    A search term is labeled RELEVANT when its highest hybrid similarity score
    exceeds ``threshold``; otherwise it is labeled NOT RELEVANT.
    """
    _, held_out_terms, _, held_out_labels = train_test_split(
        labels_df["processed_search"],
        labels_df["label"],
        test_size=0.2,
        random_state=42,
    )

    def classify(term):
        # One prediction per search term, based on its best-matching title.
        similarity = compute_similarity(term, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings)
        if max(similarity) > threshold:
            return "RELEVANT"
        return "NOT RELEVANT"

    predicted_labels = [classify(term) for term in held_out_terms]
    print(classification_report(held_out_labels, predicted_labels))

Generating Recommendations for the test set that we have been given

In [None]:
def recommend_articles_for_test(test_df, labels_df, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings, threshold, top_k=5):
    """Recommend up to ``top_k`` distinct articles for every test search term.

    Args:
        test_df: Frame with a ``searchTerm`` column.
        labels_df: Frame with ``slug`` and ``title`` columns whose rows line up
            with the rows of ``tfidf_matrix`` / ``sentence_embeddings``.
        tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings: Fitted
            artifacts forwarded to ``compute_similarity``.
        threshold: A term only gets recommendations when its best score is
            strictly greater than this value (the same rule the classifier
            uses for RELEVANT).
        top_k: Maximum number of distinct slugs to return per term.

    Returns:
        Dict mapping each search term to either the string
        "Result not found" or a list of ``[slug, title]`` pairs ordered by
        descending score, with each slug appearing once.
    """
    # Pull the lookup columns out once; they are the same for every query.
    slugs = labels_df["slug"].tolist()
    titles = labels_df["title"].tolist()

    recommendations = {}

    for search_term in test_df["searchTerm"]:
        scores = compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix, model, sentence_embeddings)

        # Nothing to rank, or the best match is not above the relevance cut-off.
        if len(scores) == 0 or np.max(scores) <= threshold:
            recommendations[search_term] = "Result not found"
            continue

        # Walk the full ranking rather than only the first top_k rows: the same
        # slug occurs on many labeled rows, so truncating before de-duplicating
        # would return fewer than top_k articles.
        seen_slugs = set()
        unique_candidates = []
        for idx in np.argsort(scores)[::-1]:
            if len(unique_candidates) >= top_k:
                break
            slug = slugs[idx]
            if slug in seen_slugs:
                continue
            seen_slugs.add(slug)
            unique_candidates.append([slug, titles[idx]])

        recommendations[search_term] = unique_candidates

    return recommendations


Formatting recommendations: the raw recommendation dictionary is hard to read, so it is rendered as one table per search term

In [None]:
def format_recommendations(recommendations):
    """Render the recommendations as printable text.

    Every search term gets a heading line followed by either the
    "Result not found" message or a grid table of slug/title pairs.
    """
    sections = []

    for term, hits in recommendations.items():
        sections.append(f"\nSearch Term: **{term}**\n")

        if hits == "Result not found":
            sections.append("Result not found")
            continue

        rows = [[slug, title] for slug, title in hits]
        sections.append(tabulate(rows, headers=["Slug", "Recommended Title"], tablefmt="grid"))

    return "\n".join(sections)


Define Main Function:

In [None]:
def main():
    """Run the full pipeline: load, preprocess, fit, tune, evaluate, recommend."""
    # Data loading and text clean-up (the raw content frame is not needed here).
    _, labeled, queries = load_data()
    labeled = preprocess_data(labeled)

    # Title representations shared by every downstream step.
    fitted = fit_tfidf_and_embeddings(labeled)

    # Threshold search on the labeled data.
    best_threshold, best_f1 = find_best_threshold(labeled, *fitted)
    print(f"Best threshold: {best_threshold} with macro F1: {best_f1}")

    # Classification report at the chosen threshold.
    evaluate_model(labeled, *fitted, best_threshold)

    # Recommendations for the provided test queries.
    test_recommendations = recommend_articles_for_test(queries, labeled, *fitted, best_threshold, top_k=5)
    print(format_recommendations(test_recommendations))


In [None]:
# Run the whole pipeline when this cell is executed (or the file is run as a script).
if __name__ == "__main__":
    main()

Best threshold: 0.33 with macro F1: 0.7502165883908622
              precision    recall  f1-score   support

NOT RELEVANT       0.60      0.67      0.64        43
    RELEVANT       0.88      0.85      0.86       123

    accuracy                           0.80       166
   macro avg       0.74      0.76      0.75       166
weighted avg       0.81      0.80      0.80       166


Search Term: **34 in. to 36 in. x 72 in. shower door**

+------------------------------------+------------------------------------+
| Slug                               | Recommended Title                  |
| tips-for-selecting-shower-doors    | Tips for Selecting Shower Doors    |
+------------------------------------+------------------------------------+
| how-to-install-a-pivot-shower-door | How To Install a Pivot Shower Door |
+------------------------------------+------------------------------------+

Search Term: **outdoor prelit christmas tree**

+------------------------------------------------+------