In [1]:
!pip install yake

Collecting yake
  Downloading yake-0.6.0-py3-none-any.whl.metadata (10 kB)
Collecting jellyfish (from yake)
  Downloading jellyfish-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading yake-0.6.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jellyfish-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (355 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.9/355.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, jellyfish, yake
Successfully installed jellyfish-1.2.0 segtok-1.5.11 yake-0.6.0


In [3]:
!pip install keybert



In [4]:
import kagglehub
import pandas as pd
import numpy as np
import os
import pickle
import re
import warnings
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

import yake

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Download the Kaggle dataset and report where kagglehub placed it.
path = kagglehub.dataset_download("hsankesara/medium-articles")
print("Dataset path: {}".format(path))

# Read articles.csv from the download directory into the raw frame.
csv_path = os.path.join(path, "articles.csv")
df_raw = pd.read_csv(csv_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/hsankesara/medium-articles?dataset_version_number=1...


100%|██████████| 1.34M/1.34M [00:00<00:00, 146MB/s]

Extracting files...
Dataset path: /root/.cache/kagglehub/datasets/hsankesara/medium-articles/versions/1





In [6]:
# Quick look at the raw frame: dimensions, column names, first rows.
print(df_raw.shape, df_raw.columns, df_raw.head(), sep="\n")

(337, 6)
Index(['author', 'claps', 'reading_time', 'link', 'title', 'text'], dtype='object')
             author claps  reading_time  \
0        Justin Lee  8.3K            11   
1       Conor Dewey  1.4K             7   
2  William Koehrsen  2.8K            11   
3      Gant Laborde  1.3K             7   
4  Emmanuel Ameisen   935            11   

                                                link  \
0  https://medium.com/swlh/chatbots-were-the-next...   
1  https://towardsdatascience.com/python-for-data...   
2  https://towardsdatascience.com/automated-featu...   
3  https://medium.freecodecamp.org/machine-learni...   
4  https://blog.insightdatascience.com/reinforcem...   

                                               title  \
0  Chatbots were the next big thing: what happene...   
1  Python for Data Science: 8 Concepts You May Ha...   
2  Automated Feature Engineering in Python – Towa...   
3  Machine Learning: how to go from Zero to Hero ...   
4  Reinforcement Learning from 

# Data Preprocessing Pipeline

In [8]:
def clean_article_text(text):
    """
    Normalise one raw article into lowercase plain text for the NLP steps.

    HTML markup, URLs and e-mail addresses are stripped; every character
    other than ASCII letters, whitespace, '.', '!' and '?' is deleted (digits
    included); the result is lowercased and runs of whitespace collapse to a
    single space.

    Args:
        text (str): Raw article text. Any non-string value (None, NaN,
            numbers, ...) is treated as missing.

    Returns:
        str: The cleaned text, or "" when the input is not a usable string.
    """
    # Only a real, non-NaN string is processed; everything else cleans to "".
    if not (isinstance(text, str) and not pd.isna(text)):
        return ""

    # Strip HTML tags first so the regexes below only see visible text.
    plain = BeautifulSoup(text, "html.parser").get_text()

    # Applied in order: (pattern, flags). Each match is deleted outright.
    removals = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'\S+@\S+', 0),                             # e-mail addresses
        (r'[^a-zA-Z\s\.\!\?]', 0),                   # anything but letters, whitespace, . ! ?
    )
    for pattern, flags in removals:
        plain = re.sub(pattern, '', plain, flags=flags)

    # Lowercase, then squeeze whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', plain.lower()).strip()

def preprocess_dataset(df, min_words=50, max_word_quantile=0.99):
    """
    Clean, measure and filter the raw articles, reporting each step on stdout.

    The input frame is never modified; every step produces a new frame.

    Args:
        df (pd.DataFrame): Raw dataset; must contain a 'text' column.
        min_words (int): Minimum cleaned word count an article must have. The
            character-count filter uses ``min_words * 3`` as its threshold.
        max_word_quantile (float): Articles whose word count lies above this
            quantile of the remaining articles are dropped as length outliers.
            Defaults to 0.99 (drop roughly the longest 1%).

    Returns:
        pd.DataFrame: Filtered copy of ``df`` with a fresh 0..n-1 index and the
        added columns 'cleaned_text', 'word_count', 'char_count' and
        'sentence_count'.
    """
    print("🔧 ENHANCED DATA PREPROCESSING PIPELINE")
    print("=" * 60)

    total_rows = len(df)

    # Step 1: Handle missing values
    print("Step 1: Handling missing values...")
    df_processed = df.dropna(subset=['text'])
    print(f"Removed {total_rows - len(df_processed)} rows with missing text")

    # Step 2: Remove duplicates (exact matches on the raw text)
    print("\nStep 2: Removing duplicates...")
    rows_before = len(df_processed)
    # .copy() so the column assignments below act on an independent frame.
    df_processed = df_processed.drop_duplicates(subset=['text'], keep='first').copy()
    print(f"Removed {rows_before - len(df_processed)} duplicate articles")

    # Step 3: Clean text
    print("\nStep 3: Cleaning article text...")
    df_processed['cleaned_text'] = df_processed['text'].apply(clean_article_text)

    # Step 4: Calculate text statistics
    print("\nStep 4: Calculating text statistics...")
    cleaned = df_processed['cleaned_text']
    df_processed['word_count'] = cleaned.apply(lambda x: len(x.split()))
    df_processed['char_count'] = cleaned.apply(len)
    # The cleaner keeps '.', '!' and '?', so all three end a sentence here.
    df_processed['sentence_count'] = cleaned.apply(
        lambda x: sum(1 for s in re.split(r'[.!?]+', x) if s.strip())
    )

    # Step 5: Advanced filtering
    print(f"\nStep 5: Advanced filtering (min {min_words} words)...")

    # Filter by word count
    rows_before = len(df_processed)
    df_processed = df_processed[df_processed['word_count'] >= min_words].copy()
    print(f"Word count filter: Removed {rows_before - len(df_processed)} articles")

    # Filter very long articles (potential outliers)
    rows_before = len(df_processed)
    max_words = df_processed['word_count'].quantile(max_word_quantile)
    df_processed = df_processed[df_processed['word_count'] <= max_words].copy()
    print(f"Length outlier filter: Removed {rows_before - len(df_processed)} very long articles")

    # Filter articles with very low information content
    rows_before = len(df_processed)
    df_processed = df_processed[df_processed['char_count'] >= min_words * 3].copy()  # ~3 chars per word minimum
    print(f"Information content filter: Removed {rows_before - len(df_processed)} low-content articles")

    df_processed = df_processed.reset_index(drop=True)

    # Guard the percentage: an empty input frame would otherwise divide by zero.
    retention_pct = len(df_processed) / total_rows * 100 if total_rows else 0.0

    print("\n PREPROCESSING COMPLETE!")
    print(f"Final dataset: {len(df_processed):,} articles ready for splitting")
    print(f"Retention rate: {retention_pct:.1f}%")

    return df_processed

# Run the preprocessing pipeline on the raw frame, keeping articles of 50+ words.
df_processed = preprocess_dataset(df=df_raw, min_words=50)

🔧 ENHANCED DATA PREPROCESSING PIPELINE
Step 1: Handling missing values...
Removed 0 rows with missing text

Step 2: Removing duplicates...
Removed 107 duplicate articles

Step 3: Cleaning article text...

Step 4: Calculating text statistics...

Step 5: Advanced filtering (min 50 words)...
Word count filter: Removed 1 articles
Length outlier filter: Removed 3 very long articles
Information content filter: Removed 0 low-content articles

 PREPROCESSING COMPLETE!
Final dataset: 226 articles ready for splitting
Retention rate: 67.1%


# Data Splitting

In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd

def create_stratified_split(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Split articles into train / validation / test sets, stratified by length.

    Articles are bucketed into word-count quartiles; the test set is carved
    off first and the remainder is then divided into train and validation,
    both times stratifying on the quartile label and using the same seed.

    Args:
        df (pd.DataFrame): Processed dataset with a 'word_count' column.
        train_size (float): Proportion of articles for the training set.
        val_size (float): Proportion of articles for the validation set.
        test_size (float): Proportion of articles for the test set.
        random_state (int): Seed passed to both splits for reproducibility.

    Returns:
        tuple: (train_df, val_df, test_df, split_info). The frames have a
        fresh 0..n-1 index and no stratification column; split_info holds
        the absolute sizes, percentages of ``df`` and the seed.

    Raises:
        AssertionError: If the three proportions do not sum to 1.0.
    """
    print(" CREATING STRATIFIED TRAIN/VALIDATION/TEST SPLITS")
    print("=" * 60)

    proportion_total = train_size + val_size + test_size
    assert abs(proportion_total - 1.0) < 1e-9, "Split proportions must sum to 1.0"

    # Temporary stratification label, removed again before returning.
    strata_col = 'word_quartile'
    working = df.copy()
    working[strata_col] = pd.qcut(
        working['word_count'], q=4, labels=['short', 'medium', 'long', 'very_long']
    )

    print("Stratification will be based on: Word Count Quartiles")
    print("Category distribution in the original dataset:")
    print(working[strata_col].value_counts(normalize=True).mul(100).round(2).astype(str) + '%')

    # First cut: hold out the test set.
    remainder, test_df = train_test_split(
        working,
        test_size=test_size,
        stratify=working[strata_col],
        random_state=random_state,
    )

    # Second cut: validation share re-expressed relative to what is left.
    train_df, val_df = train_test_split(
        remainder,
        test_size=val_size / (train_size + val_size),
        stratify=remainder[strata_col],
        random_state=random_state,
    )

    # Report per-split quartile shares while the label column still exists.
    print("\nVerifying stratification (distribution of word_quartiles):")
    labelled_parts = (
        ('Train Set %', train_df),
        ('Validation Set %', val_df),
        ('Test Set %', test_df),
    )
    verification_df = pd.DataFrame({
        label: part[strata_col].value_counts(normalize=True).mul(100).round(2)
        for label, part in labelled_parts
    })
    print(verification_df)

    # Strip the helper column and renumber rows for the returned frames.
    train_df, val_df, test_df = (
        part.drop(columns=[strata_col]).reset_index(drop=True)
        for part in (train_df, val_df, test_df)
    )

    n_total = len(df)
    part_sizes = {'train': len(train_df), 'val': len(val_df), 'test': len(test_df)}
    split_info = {'total_articles': n_total}
    split_info.update({f'{name}_size': count for name, count in part_sizes.items()})
    split_info.update({f'{name}_pct': count / n_total * 100 for name, count in part_sizes.items()})
    split_info['random_state'] = random_state

    print("\n DATA SPLIT COMPLETE:")
    print(f"Training set:   {split_info['train_size']:,} articles ({split_info['train_pct']:.1f}%)")
    print(f"Validation set: {split_info['val_size']:,} articles ({split_info['val_pct']:.1f}%)")
    print(f"Test set:       {split_info['test_size']:,} articles ({split_info['test_pct']:.1f}%)")

    return train_df, val_df, test_df, split_info


# --- USAGE ---
# Expects 'df_processed' to carry the 'cleaned_text' and 'word_count' columns.

# 70% train, 15% validation, 15% test
train_df, val_df, test_df, split_info = create_stratified_split(
    df_processed, train_size=0.7, val_size=0.15, test_size=0.15
)

# Plain lists of cleaned article text, one list per split, for the models.
train_texts, val_texts, test_texts = (
    split['cleaned_text'].tolist() for split in (train_df, val_df, test_df)
)

print(f"\nCreated {len(train_texts)} training, {len(val_texts)} validation, and {len(test_texts)} test texts.")

 CREATING STRATIFIED TRAIN/VALIDATION/TEST SPLITS
Stratification will be based on: Word Count Quartiles
Category distribution in the original dataset:
word_quartile
short        25.22%
very_long    25.22%
medium       24.78%
long         24.78%
Name: proportion, dtype: object

Verifying stratification (distribution of word_quartiles):
               Train Set %  Validation Set %  Test Set %
word_quartile                                           
short                24.68             26.47       26.47
medium               25.32             23.53       23.53
long                 24.68             26.47       23.53
very_long            25.32             23.53       26.47

 DATA SPLIT COMPLETE:
Training set:   158 articles (69.9%)
Validation set: 34 articles (15.0%)
Test set:       34 articles (15.0%)

Created 158 training, 34 validation, and 34 test texts.


In [22]:
# Queries used while tuning / selecting models; grouped by topic.
validation_queries = [
    # Technology & data science (more question-based)
    *("how is data science used in business?",
      "the future of artificial intelligence"),
    # Software development & design
    *("getting started with web development",
      "what makes a good user interface?"),
    # Business & productivity
    *("strategies for business growth",
      "tips for improving personal productivity"),
    # Finance & general interest
    *("introduction to cryptocurrency investing",),
]

# Queries reserved for the final evaluation; grouped by topic.
test_queries = [
    # Software & career
    *("best practices for writing clean code",
      "how to handle burnout at work"),
    # Marketing & startups
    *("content marketing strategies for social media",
      "how can startups find their first customers?"),
    # Conceptual & explanatory (designed to be hard for TF-IDF)
    *("explaining machine learning to a beginner",
      "the role of creativity in technology"),
]

# Baseline TF-IDF Implementation

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class TFIDFSearchEngine:
    """
    TF-IDF search engine with hyperparameter tuning and keyword extraction.

    Typical use: optionally call tune_hyperparameters(), then fit() on the
    corpus, then search() with free-text queries.
    """

    # Vectorizer settings applied to every model. Grid / tuned parameters are
    # merged on top, so a grid may override them without triggering a
    # "multiple values for keyword argument" TypeError.
    _BASE_VECTORIZER_PARAMS = {'stop_words': 'english', 'lowercase': True}

    def __init__(self):
        self.vectorizer = None      # fitted TfidfVectorizer, set by fit()
        self.tfidf_matrix = None    # TF-IDF matrix of self.documents, set by fit()
        self.documents = None       # documents passed to fit()
        self.best_params = None     # winning grid entry from tune_hyperparameters()
        self.param_history = []     # one dict per evaluated grid entry: params + 'score' + 'vocab_size'

    def _make_vectorizer(self, params):
        """Build an unfitted TfidfVectorizer from the base settings plus ``params``."""
        return TfidfVectorizer(**{**self._BASE_VECTORIZER_PARAMS, **params})

    def tune_hyperparameters(self, train_texts, val_texts, val_queries, param_grid, top_k=5):
        """
        Grid-search vectorizer parameters using a train/validation split.

        For every combination in ``param_grid`` a vectorizer is fitted on
        ``train_texts`` only; each validation query is then scored by the mean
        cosine similarity of its ``top_k`` most similar validation documents,
        and the combination's score is the mean over all queries.

        NOTE(review): this objective measures raw similarity, not relevance
        (there are no relevance labels), so it can favour settings that merely
        inflate similarity values. Treat the ranking as a heuristic.

        Args:
            train_texts (list): Documents used to fit each candidate vectorizer.
            val_texts (list): Documents the validation queries are matched against.
            val_queries (list): Query strings used for scoring.
            param_grid (dict): Parameter name -> list of candidate values.
            top_k (int): Number of best-matching documents averaged per query.

        Returns:
            dict or None: The best-scoring parameter combination (also stored
            in ``self.best_params``), or None if no combination could be
            scored. Every scored combination is appended to
            ``self.param_history``.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        import itertools

        if top_k < 1:
            raise ValueError("top_k must be at least 1")

        print("Tuning hyperparameters...")
        print("-" * 40)

        best_score = -1
        best_params_config = None

        param_names = list(param_grid)
        param_combinations = [
            dict(zip(param_names, values))
            for values in itertools.product(*param_grid.values())
        ]

        for params in param_combinations:
            vectorizer = self._make_vectorizer(params)

            try:
                # Fit on training data only, then project the validation set.
                vectorizer.fit(train_texts)
                tfidf_matrix_val = vectorizer.transform(val_texts)

                scores = []
                for query in val_queries:
                    query_vector = vectorizer.transform([query])
                    similarities = cosine_similarity(query_vector, tfidf_matrix_val).flatten()
                    scores.append(np.mean(np.sort(similarities)[-top_k:]))

                avg_score = np.mean(scores)

                self.param_history.append(
                    {**params, 'score': avg_score, 'vocab_size': len(vectorizer.vocabulary_)}
                )

                if avg_score > best_score:
                    best_score = avg_score
                    # Keep only what can be passed back to the vectorizer.
                    best_params_config = dict(params)

            except Exception as e:
                # Best effort: an invalid combination is reported and skipped.
                print(f"Error with params {params}: {e}")
                continue

        self.best_params = best_params_config
        return self.best_params

    def fit(self, documents, use_best_params=True):
        """
        Fit the TF-IDF model with tuned or default parameters.

        Args:
            documents (list): Documents to index.
            use_best_params (bool): Use the parameters found by
                tune_hyperparameters() when available; otherwise (or when
                False) fall back to the built-in defaults.
        """
        print("Building TF-IDF Model...")

        self.documents = documents

        if use_best_params and self.best_params:
            params = self.best_params
            print(f"Using tuned hyperparameters: {params}")
        else:
            params = {
                'max_features': 5000,
                'min_df': 2,
                'max_df': 0.9,
                'ngram_range': (1, 1)
            }
            print(f"Using default hyperparameters: {params}")

        self.vectorizer = self._make_vectorizer(params)
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

        print("TF-IDF model built")
        print(f"Matrix shape: {self.tfidf_matrix.shape}")
        print(f"Vocabulary size: {len(self.vectorizer.vocabulary_):,}")

    def _get_hot_keywords(self, query, doc_text, top_n):
        """
        Return the vocabulary terms shared by a query and a document.

        The query is tokenized with the vectorizer's own analyzer, so
        punctuation, stop words and n-grams are handled exactly as they were
        for the indexed documents. Only terms with a positive TF-IDF weight in
        the document count as shared.

        Args:
            query (str): The search query.
            doc_text (str): Text of the document to compare against.
            top_n (int): Maximum number of keywords to return.

        Returns:
            list: Shared terms, highest document TF-IDF weight first.
        """
        vocabulary = self.vectorizer.vocabulary_
        analyzer = self.vectorizer.build_analyzer()

        # TF-IDF weights of this single document, indexed by vocabulary column.
        doc_weights = np.asarray(self.vectorizer.transform([doc_text]).toarray()).ravel()

        shared_keywords = {}
        for term in analyzer(query):
            column = vocabulary.get(term)
            if column is None:
                continue  # term unknown to the fitted vectorizer
            weight = doc_weights[column]
            if weight > 0:
                shared_keywords[term] = weight

        ranked = sorted(shared_keywords.items(), key=lambda item: item[1], reverse=True)
        return [term for term, _ in ranked[:top_n]]

    def search(self, query, top_n=5, top_n_keywords=5, documents_to_search=None):
        """
        Rank documents by cosine similarity to a query.

        Searches the documents passed to fit() by default, or an external list
        (e.g. a held-out set) when ``documents_to_search`` is given; external
        documents are projected with the already fitted vectorizer.

        Args:
            query (str): The search query.
            top_n (int): Number of results to return; values below 1 yield
                empty results.
            top_n_keywords (int): Maximum keywords reported per result.
            documents_to_search (list, optional): External documents to rank
                instead of the fitted corpus.

        Returns:
            tuple: (top_indices, top_scores, top_keywords) — two NumPy arrays
            and a list of keyword lists, best match first. Indices refer to
            the document list that was searched.

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.vectorizer is None:
            raise ValueError("Model not fitted yet. Call fit() first.")

        query_vector = self.vectorizer.transform([query])

        if documents_to_search is not None:
            # Evaluation mode: rank the caller's documents.
            search_documents = documents_to_search
            search_matrix = self.vectorizer.transform(documents_to_search)
        else:
            # Normal mode: rank the fitted corpus.
            search_documents = self.documents
            search_matrix = self.tfidf_matrix

        similarities = cosine_similarity(query_vector, search_matrix).flatten()

        # Reverse first, then slice: a plain [-top_n:] would return *all*
        # documents for top_n == 0.
        top_indices = np.argsort(similarities)[::-1][:max(int(top_n), 0)]
        top_scores = similarities[top_indices]

        top_keywords = [
            self._get_hot_keywords(query, search_documents[doc_index], top_n=top_n_keywords)
            for doc_index in top_indices
        ]

        return top_indices, top_scores, top_keywords

In [13]:
print("\n TF-IDF Hyperparameter Tuning")
print("="*60)

# Initialize and tune TF-IDF engine
tfidf_engine = TFIDFSearchEngine()

# Coarse grid over the main vectorizer settings.
param_grid = {
    'max_features': [1000, 3000, 5000, 8000],
    'min_df': [1, 2, 3],
    'max_df': [0.8, 0.9, 0.95],
    'ngram_range': [(1, 1), (1, 2)],
}

# Tune hyperparameters
best_tfidf_params = tfidf_engine.tune_hyperparameters(
    train_texts=train_texts,
    val_texts=val_texts,
    val_queries=validation_queries,
    param_grid=param_grid
)

# Train final model with best parameters
tfidf_engine.fit(train_texts, use_best_params=True)

print("\n TF-IDF Hyperparameter Tuning Results:")
print(f"Tested {len(tfidf_engine.param_history)} parameter combinations")

# Show top 5 parameter combinations. max_df is printed too: without it,
# grid entries that differ only in max_df show up as identical rows.
sorted_params = sorted(tfidf_engine.param_history, key=lambda x: x['score'], reverse=True)
print("\nTop 5 parameter combinations:")
for i, params in enumerate(sorted_params[:5], 1):
    print(f"{i}. Score: {params['score']:.4f} | Features: {params['max_features']} | "
          f"Min DF: {params['min_df']} | Max DF: {params['max_df']} | "
          f"N-grams: {params['ngram_range']}")



 TF-IDF Hyperparameter Tuning
Tuning hyperparameters...
----------------------------------------
Building TF-IDF Model...
Using tuned hyperparameters: {'max_features': 1000, 'min_df': 2, 'max_df': 0.8, 'ngram_range': (1, 1)}
TF-IDF model built
Matrix shape: (158, 1000)
Vocabulary size: 1,000

 TF-IDF Hyperparameter Tuning Results:
Tested 72 parameter combinations

Top 5 parameter combinations:
1. Score: 0.1501 | Features: 1000 | Min DF: 2 | N-grams: (1, 1)
2. Score: 0.1498 | Features: 1000 | Min DF: 1 | N-grams: (1, 1)
3. Score: 0.1497 | Features: 1000 | Min DF: 3 | N-grams: (1, 1)
4. Score: 0.1493 | Features: 1000 | Min DF: 1 | N-grams: (1, 1)
5. Score: 0.1493 | Features: 1000 | Min DF: 1 | N-grams: (1, 1)


In [23]:
print("\n TF-IDF Hyperparameter Tuning")
print("="*60)

# Initialize and tune TF-IDF engine (replaces the engine from the first search)
tfidf_engine = TFIDFSearchEngine()

param_grid = {
    # Explore vocabulary sizes at and below the first search's best value (1000).
    'max_features': [5, 10, 50, 100, 200, 250, 500, 750, 1000, 1500],
    # NOTE(review): the first search's best run used min_df=2, which this grid
    # does not include — confirm that is intended.
    'min_df': [3, 4, 5],
    # Held fixed for this search.
    'max_df': [0.9],
    # NOTE(review): the first search's top results all used (1, 1) — confirm
    # why only (1, 2) is explored here.
    'ngram_range': [(1, 2)]
}

# Tune hyperparameters
best_tfidf_params = tfidf_engine.tune_hyperparameters(
    train_texts=train_texts,
    val_texts=val_texts,
    val_queries=validation_queries,
    param_grid=param_grid
)

# Train final model with best parameters
tfidf_engine.fit(train_texts, use_best_params=True)

print("\n TF-IDF Hyperparameter Tuning Results:")
print(f"Tested {len(tfidf_engine.param_history)} parameter combinations")

# Show the top 10 parameter combinations (the heading matches the slice).
sorted_params = sorted(tfidf_engine.param_history, key=lambda x: x['score'], reverse=True)
print("\nTop 10 parameter combinations:")
for i, params in enumerate(sorted_params[:10], 1):
    print(f"{i}. Score: {params['score']:.4f} | Features: {params['max_features']} | "
          f"Min DF: {params['min_df']} | Max DF: {params['max_df']} | "
          f"N-grams: {params['ngram_range']}")



 TF-IDF Hyperparameter Tuning
Tuning hyperparameters...
----------------------------------------
Building TF-IDF Model...
Using tuned hyperparameters: {'max_features': 50, 'min_df': 3, 'max_df': 0.9, 'ngram_range': (1, 2)}
TF-IDF model built
Matrix shape: (158, 50)
Vocabulary size: 50

 TF-IDF Hyperparameter Tuning Results:
Tested 30 parameter combinations

Top 5 parameter combinations:
1. Score: 0.1684 | Features: 50 | Min DF: 3 | N-grams: (1, 2)
2. Score: 0.1684 | Features: 50 | Min DF: 4 | N-grams: (1, 2)
3. Score: 0.1684 | Features: 50 | Min DF: 5 | N-grams: (1, 2)
4. Score: 0.1347 | Features: 5 | Min DF: 3 | N-grams: (1, 2)
5. Score: 0.1347 | Features: 5 | Min DF: 4 | N-grams: (1, 2)
6. Score: 0.1347 | Features: 5 | Min DF: 5 | N-grams: (1, 2)
7. Score: 0.1203 | Features: 250 | Min DF: 5 | N-grams: (1, 2)
8. Score: 0.1202 | Features: 250 | Min DF: 3 | N-grams: (1, 2)
9. Score: 0.1202 | Features: 250 | Min DF: 4 | N-grams: (1, 2)
10. Score: 0.1165 | Features: 200 | Min DF: 3 | N-g

# Deep Learning Semantic Search Implementation

In [24]:
import os
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT

class SemanticSearchEngine:
    """
    A semantic search engine that evaluates models, builds an index,
    and extracts thematic keywords for search results.

    Typical use: evaluate_and_select_model() on a validation set, then
    build_index() on the corpus, then search() with free-text queries.
    """
    def __init__(self):
        self.model = None             # SentenceTransformer used for queries and documents
        self.index_embeddings = None  # embeddings of self.documents, one row per document
        self.documents = None         # documents passed to build_index()
        self.model_name = None        # name of the model behind self.model
        self.kw_model = None          # KeyBERT instance used for keyword extraction

    def evaluate_and_select_model(self, val_documents, val_queries, models_to_test=None):
        """
        Compare sentence-transformer models on a validation set.

        Each model embeds ``val_documents``; every query is scored by the mean
        similarity of its 5 best-matching documents, and a model's score is the
        mean over all queries. A model that fails to load or encode is
        reported and skipped.

        Args:
            val_documents (list): Documents the queries are matched against.
            val_queries (list): Query strings used for scoring.
            models_to_test (list, optional): Model names to compare; defaults
                to three general-purpose models.

        Returns:
            tuple: (model_results, best_model_name) where model_results maps
            each evaluated model name to {'average_validation_score': float}.

        Raises:
            RuntimeError: If no candidate model could be evaluated.
        """
        if models_to_test is None:
            models_to_test = [
                'all-MiniLM-L6-v2',         # fast and resource-efficient
                'all-mpnet-base-v2',        # provides higher quality embeddings
                'msmarco-distilbert-base-v4'# optimized for asymmetric search tasks
            ]

        print("Starting model evaluation on the validation set.")
        print("-" * 50)

        model_results = {}

        for model_name in models_to_test:
            print(f"\nEvaluating model: {model_name}")
            try:
                model = SentenceTransformer(model_name)
                val_embeddings = model.encode(val_documents, show_progress_bar=False)
                query_scores = []
                for query in val_queries:
                    query_embedding = model.encode(query)
                    hits = util.semantic_search(query_embedding, val_embeddings, top_k=5)
                    if hits and len(hits[0]) > 0:
                        scores = [hit['score'] for hit in hits[0]]
                        query_scores.append(np.mean(scores))

                avg_score = np.mean(query_scores) if query_scores else 0
                model_results[model_name] = {'average_validation_score': avg_score}
                print(f"  Average validation score: {avg_score:.4f}")

            except Exception as e:
                # Best effort: one failing candidate must not abort the comparison.
                print(f"  An error occurred with model {model_name}: {e}")
                continue

        if not model_results:
            raise RuntimeError("Model evaluation failed for all candidate models.")

        best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['average_validation_score'])
        print("-" * 50)
        print(f"Evaluation complete. Best model selected: {best_model_name}")

        return model_results, best_model_name

    @staticmethod
    def _load_cached_embeddings(embedding_file, expected_rows):
        """
        Load cached embeddings if they exist and match the corpus size.

        Args:
            embedding_file (str): Path of the pickle written by build_index().
            expected_rows (int): Number of documents the cache must cover.

        Returns:
            The cached embeddings, or None when the file is missing,
            unreadable, or does not have ``expected_rows`` rows.
        """
        if not os.path.exists(embedding_file):
            return None

        try:
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # cache files that this notebook wrote itself.
            with open(embedding_file, 'rb') as f:
                cached = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError) as exc:
            print(f"Could not read cached embeddings from {embedding_file}: {exc}")
            return None

        # The cache file name only encodes the model, so a cache written for a
        # different corpus would otherwise be reused silently. A matching row
        # count is a necessary (not sufficient) sign that it is still valid.
        shape = getattr(cached, 'shape', None)
        if not shape or shape[0] != expected_rows:
            print(f"Cached embeddings in {embedding_file} do not match the "
                  f"{expected_rows} documents; recomputing.")
            return None

        return cached

    def build_index(self, train_documents, model_name, force_recompute=False):
        """
        Build the search index and initialize the keyword extraction model.

        Embeddings are cached on disk per model name and reused on later calls
        unless ``force_recompute`` is set or the cache does not match the
        number of documents.

        Args:
            train_documents (list): Documents to index.
            model_name (str): Sentence-transformer model to load.
            force_recompute (bool): Ignore any cached embeddings.
        """
        print(f"\nBuilding search index with model: {model_name}")

        self.documents = train_documents
        self.model_name = model_name
        embedding_file = f'train_embeddings_{model_name.replace("/", "_")}.pkl'
        self.model = SentenceTransformer(model_name)

        cached = None
        if not force_recompute:
            cached = self._load_cached_embeddings(embedding_file, len(train_documents))

        if cached is not None:
            print(f"Loading existing embeddings from {embedding_file}")
            self.index_embeddings = cached
        else:
            print("Computing embeddings for all training documents.")
            self.index_embeddings = self.model.encode(
                train_documents, show_progress_bar=True, batch_size=32
            )
            print(f"Saving embeddings to {embedding_file}")
            with open(embedding_file, 'wb') as f:
                pickle.dump(self.index_embeddings, f)

        # Reuse the encoder that is already loaded instead of letting KeyBERT
        # load a second, default model.
        print("Initializing keyword extraction model (KeyBERT)...")
        self.kw_model = KeyBERT(model=self.model)

        print("Search index built successfully.")
        print(f"Embedding shape: {self.index_embeddings.shape}")

    def _get_thematic_keywords(self, doc_text, top_n):
        """
        Extract up to ``top_n`` keywords / two-word keyphrases with KeyBERT.

        Returns an empty list when the keyword model has not been initialized.
        """
        if self.kw_model is None:
            return []

        keywords = self.kw_model.extract_keywords(doc_text,
                                                 keyphrase_ngram_range=(1, 2),
                                                 stop_words='english',
                                                 top_n=top_n)
        # Drop the scores; callers only need the words/phrases.
        return [word for word, score in keywords]

    def search(self, query, top_n=5, top_n_keywords=5, documents_to_search=None):
        """
        Rank documents by semantic similarity to a query.

        Searches the indexed documents by default, or an external list (e.g. a
        held-out set) when ``documents_to_search`` is given; external documents
        are encoded on the fly on every call.

        Args:
            query (str): The search query.
            top_n (int): Number of results to return.
            top_n_keywords (int): Maximum keywords reported per result.
            documents_to_search (list, optional): External documents to rank
                instead of the indexed corpus.

        Returns:
            tuple: (indices, scores, top_keywords) — three lists, best match
            first. Indices refer to the document list that was searched.

        Raises:
            RuntimeError: If build_index() has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("The search index has not been built yet. Call 'build_index()' first.")

        query_embedding = self.model.encode(query)

        if documents_to_search is not None:
            # Evaluation mode: rank the caller's documents.
            search_documents = documents_to_search
            search_embeddings = self.model.encode(documents_to_search)
        else:
            # Normal mode: rank the indexed corpus.
            search_documents = self.documents
            search_embeddings = self.index_embeddings

        hits = util.semantic_search(query_embedding, search_embeddings, top_k=top_n)[0]

        indices = [hit['corpus_id'] for hit in hits]
        scores = [hit['score'] for hit in hits]

        top_keywords = [
            self._get_thematic_keywords(search_documents[doc_index], top_n=top_n_keywords)
            for doc_index in indices
        ]

        return indices, scores, top_keywords

In [25]:
print("\n Semantic Search Engine model choice")
print("=" * 80)

semantic_engine = SemanticSearchEngine()

# Score the candidate models on the validation split and keep the winner's name.
print("Comparing semantic models on validation set...")
model_results, best_model_name = semantic_engine.evaluate_and_select_model(
    val_texts, validation_queries
)

# Index the training texts with the winning model, ignoring any cached embeddings.
semantic_engine.build_index(train_texts, best_model_name, force_recompute=True)


 Semantic Search Engine model choice
Comparing semantic models on validation set...
Starting model evaluation on the validation set.
--------------------------------------------------

Evaluating model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  Average validation score: 0.2936

Evaluating model: all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  Average validation score: 0.2925

Evaluating model: msmarco-distilbert-base-v4


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  Average validation score: 0.2506
--------------------------------------------------
Evaluation complete. Best model selected: all-MiniLM-L6-v2

Building search index with model: all-MiniLM-L6-v2
Computing embeddings for all training documents.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Saving embeddings to train_embeddings_all-MiniLM-L6-v2.pkl
Initializing keyword extraction model (KeyBERT)...
Search index built successfully.
Embedding shape: (158, 384)


In [29]:
import time

def _has_results(indices):
    """Return True when a search produced at least one hit (list or NumPy array)."""
    # len() is used instead of truthiness: `not array` raises ValueError for
    # multi-element NumPy arrays, and `array.any()` is False for a hit list
    # that contains only document index 0.
    return indices is not None and len(indices) > 0


def _search_and_report(engine, query, test_documents, top_n, header, latencies):
    """Time a single search, record its latency in ms, and print the ranked hits."""
    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    start_time = time.perf_counter()
    indices, scores, keywords = engine.search(
        query, documents_to_search=test_documents, top_n=top_n
    )
    latency = (time.perf_counter() - start_time) * 1000  # seconds -> milliseconds
    # Recorded before printing so a display error cannot drop the measurement.
    latencies.append(latency)

    print(f"{header} [Search Time: {latency:.2f} ms]")
    if not _has_results(indices):
        print("No relevant documents found.")
        return

    for i, doc_index in enumerate(indices):
        print(f"\nResult #{i+1} | Score: {scores[i]:.4f} | Keywords: {keywords[i]}")
        print(f"  Text: {test_documents[doc_index][:200]}...")


def run_final_evaluation(tfidf_engine, semantic_engine, test_documents, test_queries, top_n=3):
    """
    Run the final head-to-head comparison of both search engines on the test set
    and report per-query results plus average search latency.

    Args:
        tfidf_engine: Object exposing
            ``search(query, documents_to_search=..., top_n=...)`` that returns an
            ``(indices, scores, keywords)`` triple of equal-length sequences.
        semantic_engine: Object with the same ``search`` interface.
        test_documents: Indexable collection of document strings; the returned
            indices are used to look documents up in it.
        test_queries: Iterable (with ``len``) of query strings.
        top_n: Maximum number of hits requested from each engine per query.

    Returns:
        None. All results are printed.

    Notes:
        A failure inside one engine is reported and skipped so that the remaining
        engines/queries are still evaluated; failed searches are excluded from
        the latency averages.
    """
    print("=" * 100)
    print("      STARTING FINAL MODEL EVALUATION ON THE UNSEEN TEST SET")
    print("=" * 100)

    # Per-engine latency (ms) of every search that completed.
    tfidf_latencies = []
    semantic_latencies = []

    # (engine, results header, label used in error messages, latency sink)
    engines = (
        (tfidf_engine, "\n--- TF-IDF Model Results (Keyword-Based) ---",
         "TF-IDF", tfidf_latencies),
        (semantic_engine, "\n\n--- Semantic Search Model Results (Meaning-Based) ---",
         "Semantic", semantic_latencies),
    )

    for query in test_queries:
        print(f"\n\n{'='*40} QUERY: '{query}' {'='*40}")

        for engine, header, label, latencies in engines:
            try:
                _search_and_report(engine, query, test_documents, top_n, header, latencies)
            except Exception as e:
                # Deliberately best-effort: one failing search must not abort the run.
                print(f"An error occurred with the {label} search: {e}")

    # --- Final Performance Summary ---
    print(f"\n\n{'='*100}")
    print("      FINAL EVALUATION COMPLETE")
    print("=" * 100)

    avg_tfidf_latency = sum(tfidf_latencies) / len(tfidf_latencies) if tfidf_latencies else 0
    avg_semantic_latency = sum(semantic_latencies) / len(semantic_latencies) if semantic_latencies else 0

    print("\n--- Performance Summary ---")
    print(f"Number of test queries: {len(test_queries)}")
    print(f"Number of documents searched: {len(test_documents)}")
    print("\nAverage Search Latency:")
    print(f"  - TF-IDF Model:      {avg_tfidf_latency:.2f} ms per query")
    print(f"  - Semantic Model:    {avg_semantic_latency:.2f} ms per query")

    if avg_tfidf_latency > 0 and avg_semantic_latency > 0:
        # Name whichever model was actually faster instead of assuming TF-IDF.
        if avg_semantic_latency >= avg_tfidf_latency:
            speed_difference = avg_semantic_latency / avg_tfidf_latency
            print(f"\nConclusion: The TF-IDF model is approximately {speed_difference:.1f}x faster than the Semantic model.")
        else:
            speed_difference = avg_tfidf_latency / avg_semantic_latency
            print(f"\nConclusion: The Semantic model is approximately {speed_difference:.1f}x faster than the TF-IDF model.")

In [30]:

# Final comparison: both engines search the test documents for every test
# query, showing the top 3 hits per engine along with search latency.
run_final_evaluation(
    tfidf_engine,
    semantic_engine,
    test_documents=test_texts,
    test_queries=test_queries,
    top_n=3,
)

      STARTING FINAL MODEL EVALUATION ON THE UNSEEN TEST SET



--- TF-IDF Model Results (Keyword-Based) --- [Search Time: 46.06 ms]

Result #1 | Score: 0.3025 | Keywords: ['code']
  Text: this is part of a series about building a deep learning model to recognize traffic signs. its intended to be a learning experience for myself and for anyone else who likes to follow along. there are a...

Result #2 | Score: 0.1878 | Keywords: ['code']
  Text: ai and games is a crowdfunded youtube series on the research and applications of ai within video games. the following article is a more involved transcription of the topics discussed in the video link...

Result #3 | Score: 0.1830 | Keywords: ['code']
  Text: upd april scikit flow has been merged into tensorflow since version . and now called tensorflow learn or tf.learn. google released a machine learning framework called tensorflow and its taking the wor...


--- Semantic Search Model Results (Meaning-Based) --- [Search Time: 2295.76 ms]

Resu