In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
/content/drive/MyDrive/Home_depo/content_data_MASTER.csv

NameError: name 'content' is not defined

Importing Imports that we require

In [6]:
import pandas as pd
import numpy as np
import re
import string
import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tabulate import tabulate

Loading Data --> Renaming Columns  --> Merging content data with labels data

In [7]:
def load_data():
    """Load and preprocess the datasets."""
    content_df = pd.read_csv("/content/drive/MyDrive/Home_depo/content_data_MASTER.csv")  # Contains 'title', 'slug'
    labels_df = pd.read_csv("/content/drive/MyDrive/Home_depo/labels_MASTER.csv")      # Has incorrect column names
    test_df = pd.read_csv("/content/drive/MyDrive/Home_depo/test_MASTER.csv")          # Contains 'searchTerm'

    # Rename columns in labels_df
    labels_df.rename(columns={'#4 #14 connector': 'searchTerm',
                              'types-of-pipe-fittings': 'slug',
                              'RELEVANT': 'label'}, inplace=True)

    # Merge labels with content to get titles (only labeled rows will have a title)
    labels_df = labels_df.merge(content_df, on="slug", how="left")

    # Remove rows with missing titles, drop duplicates, and reset index
    labels_df.dropna(subset=["title"], inplace=True)
    labels_df.drop_duplicates(inplace=True)
    labels_df.reset_index(drop=True, inplace=True)
    return content_df, labels_df, test_df


Preprocess text by lowercasing, removing punctuation, and extra spaces.

In [8]:
def preprocess_text(text):
    """Preprocess text by lowercasing, removing punctuation, and extra spaces."""
    text = str(text).lower()
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()             # Remove extra spaces
    return text

Apply text preprocessing to search terms and titles.

In [9]:
def preprocess_data(labels_df):
    """Apply text preprocessing to search terms and titles."""
    labels_df["processed_search"] = labels_df["searchTerm"].apply(preprocess_text)
    labels_df["processed_title"] = labels_df["title"].apply(preprocess_text)
    labels_df["processed_slug"] = labels_df["slug"].apply(preprocess_text)
    return labels_df

Vectorize processed text using TF-IDF

In [10]:
def fit_tfidf(labels_df):
    """Fit TF-IDF vectorizer."""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(labels_df["processed_title"])
    return tfidf_vectorizer, tfidf_matrix


Compute Cosine similarity scores using TF-IDF vectors

In [11]:
def compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix):
    """Compute similarity scores using TF-IDF."""
    search_term_processed = preprocess_text(search_term)

    # Compute similarity with TF-IDF
    search_tfidf = tfidf_vectorizer.transform([search_term_processed])
    tfidf_similarities = cosine_similarity(search_tfidf, tfidf_matrix).flatten()
    return tfidf_similarities


Find the best threshold for classifying search terms as RELEVANT or NOT RELEVANT.

In [12]:
def find_best_threshold(labels_df, tfidf_vectorizer, tfidf_matrix):
    """Find the best threshold for classifying search terms as RELEVANT or NOT RELEVANT."""
    thresholds = np.linspace(0, 1, 101)
    best_threshold = None
    best_f1 = 0

    # Split labeled data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(labels_df["processed_search"], labels_df["label"], test_size=0.2, random_state=42)

    # Mapping for binary evaluation
    label_mapping = {"NOT RELEVANT": 0, "RELEVANT": 1}
    y_test_num = [label_mapping[label] for label in y_test]

    for t in thresholds:
        preds = []
        for search_term in X_test:
            scores = compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix)
            pred = "RELEVANT" if max(scores) > t else "NOT RELEVANT"
            preds.append(pred)

        y_pred_num = [label_mapping[p] for p in preds]
        f1 = f1_score(y_test_num, y_pred_num, average="macro")
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t

    return best_threshold, best_f1

**Model Evaluation**

In [13]:
def evaluate_model(labels_df, tfidf_vectorizer, tfidf_matrix, threshold):
    """Evaluate model performance using the best threshold."""
    X_train, X_test, y_train, y_test = train_test_split(labels_df["processed_search"], labels_df["label"], test_size=0.2, random_state=42)
    predictions = []

    for search_term in X_test:
        scores = compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix)
        predicted_label = "RELEVANT" if max(scores) > threshold else "NOT RELEVANT"
        predictions.append(predicted_label)

    print(classification_report(y_test, predictions))

Generating Recommendations for the test set

In [14]:
def recommend_articles_for_test(test_df, labels_df, tfidf_vectorizer, tfidf_matrix, threshold, top_k=5):
    """Generate recommendations for the test set."""
    recommendations = {}

    for search_term in test_df["searchTerm"]:
        scores = compute_similarity(search_term, tfidf_vectorizer, tfidf_matrix)
        top_indices = np.argsort(scores)[::-1][:top_k]
        top_scores = scores[top_indices]

        if max(top_scores) < threshold:
            recommendations[search_term] = "Result not found"
        else:
            # Get candidate rows: each row is a list [slug, title, processed_title]
            candidate_rows = labels_df.iloc[top_indices][["slug", "title", "processed_title"]].values.tolist()

            # Remove duplicates by checking the normalized slug:
            filtered_candidates = []
            seen_slugs = set()
            for candidate in candidate_rows:
                slug, title, _ = candidate
                slug_norm = "".join(str(slug).lower().split())  # Normalizing the slug

                if slug_norm not in seen_slugs:
                    filtered_candidates.append([slug, title])
                    seen_slugs.add(slug_norm)

            recommendations[search_term] = filtered_candidates

    return recommendations


Formatting Recommendation : The recommendations are not visually appealing

In [17]:
def format_recommendations(recommendations):
    """Format recommendations for display."""
    formatted_output = []

    for search_term, articles in recommendations.items():
        formatted_output.append(f"\nSearch Term: **{search_term}**\n")
        if articles == "Result not found":
            formatted_output.append("Result not found")
        else:
            table_data = []
            for slug, title in articles:
                table_data.append([slug, title])

            formatted_output.append(tabulate(table_data, headers=["Slug", "Recommended Title"], tablefmt="grid"))

    return "\n".join(formatted_output)


Define Main

In [18]:
def main():
    # Load and preprocess data
    content_df, labels_df, test_df = load_data()
    labels_df = preprocess_data(labels_df)

    # Fit TF-IDF
    tfidf_vectorizer, tfidf_matrix = fit_tfidf(labels_df)

    # Find the best threshold
    best_threshold, best_f1 = find_best_threshold(labels_df, tfidf_vectorizer, tfidf_matrix)
    print(f"Best threshold: {best_threshold} with macro F1: {best_f1}")

    # Evaluate model performance
    evaluate_model(labels_df, tfidf_vectorizer, tfidf_matrix, best_threshold)

    # Generate recommendations for the test set
    recommendations = recommend_articles_for_test(test_df, labels_df, tfidf_vectorizer, tfidf_matrix, best_threshold, top_k=5)
    print(format_recommendations(recommendations))


In [19]:
if __name__ == "__main__":
    main()

Best threshold: 0.35000000000000003 with macro F1: 0.6078363583981561
              precision    recall  f1-score   support

NOT RELEVANT       0.55      0.28      0.37        43
    RELEVANT       0.78      0.92      0.85       123

    accuracy                           0.75       166
   macro avg       0.67      0.60      0.61       166
weighted avg       0.72      0.75      0.72       166


Search Term: **34 in. to 36 in. x 72 in. shower door**

+---------------------------------------------+---------------------------------------------+
| Slug                                        | Recommended Title                           |
| how-to-screen-in-a-porch                    | How to Screen in a Porch                    |
+---------------------------------------------+---------------------------------------------+
| how-to-care-for-roses-in-summer             | How to Care for Roses in Summer             |
+---------------------------------------------+-----------------------------