In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy import sparse
import pandas as pd
import json
import os
import pickle
from pathlib import Path
from tabulate import tabulate
import re
import numpy as np
import pandas as pd
from string import ascii_lowercase
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pathlib import Path
from urllib.parse import urlparse 
from concurrent.futures import ThreadPoolExecutor 
import multiprocessing 
from queue import Queue, Empty 
import pickle
import requests

In [3]:
class BM25:
    def __init__(self, b=0.75, k1=1.6):
        """
        BM25 implementation using TfidfVectorizer for vectorization.
        """
        self.b = b
        self.k1 = k1
        self.vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True)

    def fit(self, X):
        """
        Fit the model with a list of documents X.
        """
        self.vectorizer.fit(X)
        self.y = self.vectorizer.transform(X)
        self.avdl = self.y.sum(axis=1).mean()  # Average document length

    def transform(self, q):
        """
        Calculate BM25 scores between query q and documents X.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # Apply CountVectorizer
        len_y = self.y.sum(axis=1).A1  # Length of each document
        q_vector = self.vectorizer.transform([q])  # Vectorize query
        assert sparse.isspmatrix_csr(q_vector)

        # Convert to csc for better column slicing
        y_csc = self.y.tocsc()[:, q_vector.indices]
        denom = y_csc + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer.idf_[None, q_vector.indices] - 1.0
        numer = y_csc.multiply(np.broadcast_to(idf, y_csc.shape)) * (k1 + 1)
        return (numer / denom).sum(axis=1).A1


In [4]:
class WebIndexer:
    def __init__(self, is_reset=False):
        """
        Initialize the WebIndexer and load preprocessed data if exists.
        """
        self.crawled_folder = Path(Path().absolute()).parent / "crawled/"
        self.stored_file = 'src/resource/manual_indexer.pkl'

        # Load the cached index if it exists and is_reset is False
        if not is_reset and os.path.isfile(self.stored_file):
            with open(self.stored_file, "rb") as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    @staticmethod
    def preprocess_text(text):
        """
        Preprocess text by lowercasing, removing punctuation, and cleaning spaces.
        """
        # Lowercase text
        text = text.lower()
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def run_indexer(self):
        """
        Load documents from crawled folder, preprocess, and index them.
        """
        documents = []

        # Load all .txt files from crawled folder
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                file_path = os.path.join(self.crawled_folder, file)

                # Read the file with UTF-8 encoding and handle errors
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    try:
                        j = json.load(f)
                        if 'title' in j and 'text' in j:
                            # Preprocess title and text
                            j['title'] = self.preprocess_text(j['title'])
                            j['text'] = self.preprocess_text(j['text'])
                            documents.append(j)
                        else:
                            print(f"Skipped file {file}: Missing 'title' or 'text'")
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON in {file}: {e}")
                    except Exception as e:
                        print(f"Error reading {file}: {e}")

        if not documents:
            raise ValueError("No valid documents found. Ensure crawled/ folder contains valid .txt files.")

        # Preprocess and create BM25 index
        self.documents = pd.DataFrame.from_dict(documents)
        corpus = self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1).tolist()

        if not corpus:
            raise ValueError("Corpus is empty. Ensure documents contain 'title' and 'text' fields.")

        # Initialize BM25 and fit the corpus
        self.bm25 = BM25()
        self.bm25.fit(corpus)

        # Ensure directory exists before saving
        os.makedirs(os.path.dirname(self.stored_file), exist_ok=True)

        # Save the final processed data
        with open(self.stored_file, "wb") as f:
            pickle.dump(self.__dict__, f)

    def search_query(self, query, top_n=5):
        """
        Perform a search query on the indexed data.
        """
        # Preprocess query
        query = self.preprocess_text(query)
        query_scores = self.bm25.transform(query)

        if not any(query_scores):
            raise ValueError("No documents matched the query. Check the query or indexed data.")

        self.documents["score"] = query_scores
        results = self.documents.nlargest(top_n, "score")[["url", "title", "text", "score"]]

        return results

In [5]:
if __name__ == "__main__":
    indexer = WebIndexer(is_reset=False)

    query = "school"
    try:
        results = indexer.search_query(query, top_n=5)
        print(tabulate(results, headers="keys", tablefmt="grid"))
    except ValueError as e:
        print(f"Error: {e}")

+-----+---------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Page 76

In [6]:
class Pr:

    def __init__(self, alpha):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.alpha = alpha

    def url_extractor(self):
        url_maps = {}
        all_urls = set([])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                with open(os.path.join(self.crawled_folder, file), "r", encoding="utf-8") as f:
                    j = json.load(f)

                all_urls.add(j['url'])
                for s in j['url_lists']:
                    all_urls.add(s)
                url_maps[j['url']] = list(set(j['url_lists']))

        all_urls = list(all_urls)
        return url_maps, all_urls

    def pr_calc(self):
        url_maps, all_urls = self.url_extractor()
        url_matrix = pd.DataFrame(columns=all_urls, index=all_urls)

        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix.loc[url] = (1 - self.alpha) * (1 / len(all_urls))
                url_matrix.loc[url, url_maps[url]] = url_matrix.loc[url, url_maps[url]] + (self.alpha *
                    (1 / len(url_maps[url])))

        url_matrix.loc[url_matrix.isnull().all(axis=1), :] = (1 / len(all_urls))

        x0 = np.matrix([1 / len(all_urls)] * len(all_urls))
        P = np.asmatrix(url_matrix.values)

        prev_Px = x0
        Px = x0 * P
        i = 0
        while (any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8)):
            i += 1
            prev_Px = Px
            Px = Px * P

        self.pr_result = pd.DataFrame(Px, columns=url_matrix.index, index=['score']).T.sort_values(by='score', ascending=False)
        return i, self.pr_result




In [7]:
from flask import Flask, request
import pickle
import pandas as pd
import time
import numpy as np

# Initialize Flask app
app = Flask(__name__)

# Load the manual BM25 index
with open('src/resource/manual_indexer.pkl', 'rb') as f:
    manual_index = pickle.load(f)

# Load BM25 model and documents
bm25 = manual_index['bm25']
documents = manual_index['documents'].copy()  # Make a copy to avoid Pandas warning

# Ensure the document DataFrame has the correct columns
if 'url' not in documents.columns:
    raise ValueError("Documents DataFrame is missing 'url' column.")

# Initialize PageRank
pr = Pr(alpha=0.85)  # Ensure Pr class is correctly defined
_, pr_result = pr.pr_calc()  # Compute PageRank

# Check if pr_result is structured correctly
if not isinstance(pr_result, pd.DataFrame) or 'score' not in pr_result.columns:
    raise ValueError("pr_result should be a DataFrame with a 'score' column.")

# Map PageRank scores to documents
documents['pagerank_score'] = documents['url'].map(pr_result['score'])

# Fill NaN values with a small default value (to avoid zero multiplication)
documents['pagerank_score'] = documents['pagerank_score'].fillna(documents['pagerank_score'].mean())

@app.route('/search_manual_pr', methods=['GET'])
def search_manual():
    start = time.time()
    response_object = {'status': 'success'}

    # Get search query
    query_term = request.args.get('query', '').strip()
    if not query_term:
        return {"status": "error", "message": "Query term is missing"}, 400

    # Perform BM25 search
    query_scores = np.array(bm25.transform(query_term)).flatten()

    # Ensure query_scores length matches documents length
    if len(query_scores) != len(documents):
        return {"status": "error", "message": "BM25 scores length mismatch with documents"}, 500

    # Assign BM25 scores
    documents['score'] = query_scores

    # Compute the combined score (BM25 * PageRank)
    documents['combined_score'] = documents['score'] * documents['pagerank_score']

    # Handle NaN values in combined_score
    documents['combined_score'] = documents['combined_score'].fillna(0)

    # Select top 10 results
    sorted_documents = documents.nlargest(10, "combined_score")[["url", "title", "text", "score", "pagerank_score", "combined_score"]]

    end = time.time()

    # Prepare response
    response_object['total_hits'] = len(sorted_documents)
    response_object['results'] = sorted_documents.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

if __name__ == '__main__':
    app.run(debug=False)



 * Serving Flask app '__main__'
 * Debug mode: off


  documents['pagerank_score'] = documents['pagerank_score'].fillna(documents['pagerank_score'].mean())
 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


### Page 77

In [None]:
from flask import Flask, request
import pickle
import pandas as pd
import time
import numpy as np

# Initialize Flask app
app = Flask(__name__)

# Load the manual BM25 index
with open('src/resource/manual_indexer.pkl', 'rb') as f:
    manual_index = pickle.load(f)

# Load BM25 model and documents
bm25 = manual_index['bm25']
documents = manual_index['documents'].copy()  # Work on a copy to avoid Pandas issues

# Ensure the document DataFrame has the correct columns
if 'url' not in documents.columns:
    raise ValueError("Documents DataFrame is missing 'url' column.")

# Initialize PageRank
pr = Pr(alpha=0.85)  # Ensure Pr class is correctly defined
_, pr_result = pr.pr_calc()  # Compute PageRank

# Ensure pr_result is structured correctly
if not isinstance(pr_result, pd.DataFrame) or 'score' not in pr_result.columns:
    raise ValueError("pr_result should be a DataFrame with a 'score' column.")

# Map PageRank scores to documents
documents['pagerank_score'] = documents['url'].map(pr_result['score'])

# Fill NaN values with a small default value (if a document is missing from PageRank)
documents['pagerank_score'] = documents['pagerank_score'].fillna(documents['pagerank_score'].mean())

@app.route('/search_manual_pr', methods=['GET'])
def search_manual():
    start = time.time()
    response_object = {'status': 'success'}

    # Get search query
    query_term = request.args.get('query', '').strip()
    if not query_term:
        return {"status": "error", "message": "Query term is missing"}, 400

    # Perform BM25 search
    query_scores = np.array(bm25.transform(query_term)).flatten()

    # Ensure query_scores length matches documents length
    if len(query_scores) != len(documents):
        return {"status": "error", "message": "BM25 scores length mismatch with documents"}, 500

    # Assign BM25 scores (only for ranking purposes)
    documents['bm25_score'] = query_scores

    # Retrieve top N documents using BM25 ranking (e.g., top 100)
    top_bm25_docs = documents.nlargest(100, "bm25_score")

    # Sort these top documents based **only on PageRank**
    sorted_documents = top_bm25_docs.sort_values(by="pagerank_score", ascending=False)

    # Select top 10 results after sorting by PageRank
    final_results = sorted_documents.head(10)[["url", "title", "text", "bm25_score", "pagerank_score"]]

    end = time.time()

    # Prepare response
    response_object['total_hits'] = len(final_results)
    response_object['results'] = final_results.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

if __name__ == '__main__':
    app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


  documents['pagerank_score'] = documents['pagerank_score'].fillna(documents['pagerank_score'].mean())
 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
