In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy import sparse
import pandas as pd
import json
import os
import pickle
from pathlib import Path
from tabulate import tabulate
import re
import pymc as pm
import numpy as np
import pandas as pd
from string import ascii_lowercase
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pathlib import Path
from urllib.parse import urlparse 
from concurrent.futures import ThreadPoolExecutor 
import multiprocessing 
from queue import Queue, Empty 
import pickle
import requests

In [30]:
class BM25:
    def __init__(self, b=0.75, k1=1.6):
        """
        BM25 implementation using TfidfVectorizer for vectorization.
        """
        self.b = b
        self.k1 = k1
        self.vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True)

    def fit(self, X):
        """
        Fit the model with a list of documents X.
        """
        self.vectorizer.fit(X)
        self.y = self.vectorizer.transform(X)
        self.avdl = self.y.sum(axis=1).mean()  # Average document length

    def transform(self, q):
        """
        Calculate BM25 scores between query q and documents X.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # Apply CountVectorizer
        len_y = self.y.sum(axis=1).A1  # Length of each document
        q_vector = self.vectorizer.transform([q])  # Vectorize query
        assert sparse.isspmatrix_csr(q_vector)

        # Convert to csc for better column slicing
        y_csc = self.y.tocsc()[:, q_vector.indices]
        denom = y_csc + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer.idf_[None, q_vector.indices] - 1.0
        numer = y_csc.multiply(np.broadcast_to(idf, y_csc.shape)) * (k1 + 1)
        return (numer / denom).sum(axis=1).A1


In [31]:
class WebIndexer:
    def __init__(self, is_reset=False):
        """
        Initialize the WebIndexer and load preprocessed data if exists.
        """
        self.crawled_folder = Path(Path().absolute()).parent / "crawled/"
        self.stored_file = 'src/resource/manual_indexer.pkl'

        # Load the cached index if it exists and is_reset is False
        if not is_reset and os.path.isfile(self.stored_file):
            with open(self.stored_file, "rb") as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    @staticmethod
    def preprocess_text(text):
        """
        Preprocess text by lowercasing, removing punctuation, and cleaning spaces.
        """
        # Lowercase text
        text = text.lower()
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def run_indexer(self):
        """
        Load documents from crawled folder, preprocess, and index them.
        """
        documents = []

        # Load all .txt files from crawled folder
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                file_path = os.path.join(self.crawled_folder, file)

                # Read the file with UTF-8 encoding and handle errors
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    try:
                        j = json.load(f)
                        if 'title' in j and 'text' in j:
                            # Preprocess title and text
                            j['title'] = self.preprocess_text(j['title'])
                            j['text'] = self.preprocess_text(j['text'])
                            documents.append(j)
                        else:
                            print(f"Skipped file {file}: Missing 'title' or 'text'")
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON in {file}: {e}")
                    except Exception as e:
                        print(f"Error reading {file}: {e}")

        if not documents:
            raise ValueError("No valid documents found. Ensure crawled/ folder contains valid .txt files.")

        # Preprocess and create BM25 index
        self.documents = pd.DataFrame.from_dict(documents)
        corpus = self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1).tolist()

        if not corpus:
            raise ValueError("Corpus is empty. Ensure documents contain 'title' and 'text' fields.")

        # Initialize BM25 and fit the corpus
        self.bm25 = BM25()
        self.bm25.fit(corpus)

        # Ensure directory exists before saving
        os.makedirs(os.path.dirname(self.stored_file), exist_ok=True)

        # Save the final processed data
        with open(self.stored_file, "wb") as f:
            pickle.dump(self.__dict__, f)

    def search_query(self, query, top_n=5):
        """
        Perform a search query on the indexed data.
        """
        # Preprocess query
        query = self.preprocess_text(query)
        query_scores = self.bm25.transform(query)

        if not any(query_scores):
            raise ValueError("No documents matched the query. Check the query or indexed data.")

        self.documents["score"] = query_scores
        results = self.documents.nlargest(top_n, "score")[["url", "title", "text", "score"]]

        return results

In [32]:
if __name__ == "__main__":
    indexer = WebIndexer(is_reset=False)

    query = "school"
    try:
        results = indexer.search_query(query, top_n=5)
        print(tabulate(results, headers="keys", tablefmt="grid"))
    except ValueError as e:
        print(f"Error: {e}")

+-----+---------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [33]:
from flask import Flask, request
import pickle
import pandas as pd
import time

# Initialize Flask app
app = Flask(__name__)

# Load the manual BM25 index
with open('src/resource/manual_indexer.pkl', 'rb') as f:
    manual_index = pickle.load(f)

# Load BM25 and documents from the manual index
bm25 = manual_index['bm25']
documents = manual_index['documents']

# Define the search endpoint for manual_index
@app.route('/search_manual', methods=['GET'])
def search_manual():
    # Record start time for query execution
    start = time.time()
    response_object = {'status': 'success'}

    # Get the search query from request arguments
    query_term = request.args.get('query', '')

    if not query_term:
        return {"status": "error", "message": "Query term is missing"}, 400

    # Perform the search using BM25
    query_scores = bm25.transform(query_term)

    # Assign scores to documents
    documents["score"] = query_scores
    results = documents.nlargest(10, "score")[["url", "title", "text", "score"]]

    # Record end time for query execution
    end = time.time()

    # Prepare the response object
    response_object['total_hits'] = len(results)
    response_object['results'] = results.to_dict('records')
    response_object['elapse'] = end - start  # Query execution time

    return response_object

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [03/Feb/2025 17:46:20] "GET /search_manual?query=camt HTTP/1.1" 200 -
