In [1]:
import pickle

import numpy as np
import os
import json
from pathlib import Path
import pandas as pd
from elasticsearch import Elasticsearch
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

### Page 66-67


In [2]:
# Initial probability distribution
x0 = np.matrix([1/7] * 7)

# Transition probability matrix
P = np.matrix([
    [1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7],
    [25/56, 3/140, 25/56, 3/140, 3/140, 3/140, 3/140],
    [3/140, 3/140, 3/140, 3/140, 61/70, 3/140, 3/140],
    [3/140, 3/140, 25/56, 3/140, 3/140, 3/140, 25/56],
    [25/56, 3/140, 3/140, 3/140, 3/140, 25/56, 3/140],
    [3/140, 3/140, 61/70, 3/140, 3/140, 3/140, 3/140],
    [3/140, 3/140, 25/56, 3/140, 3/140, 25/56, 3/140],
])

# Compute Markov process steps
print("x0 * P:\n", x0 * P)
print("x0 * P^2:\n", x0 * P * P)
print("x0 * P^3:\n", x0 * P * P * P)

x0 * P:
 [[0.16020408 0.03877551 0.34234694 0.03877551 0.16020408 0.16020408
  0.0994898 ]]
x0 * P^2:
 [[0.12544825 0.04088192 0.25229774 0.04088192 0.33187682 0.15125182
  0.05736152]]
x0 * P^3:
 [[0.19508404 0.03666157 0.2243539  0.03666157 0.25111465 0.20208787
  0.05403639]]


In [3]:
x0 = np.matrix([1/7] * 7)

P = np.matrix([
         [1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7],
         [25/56, 3/140, 25/56, 3/140, 3/140, 3/140, 3/140],
         [3/140, 3/140, 3/140, 3/140, 61/70, 3/140, 3/140],
         [3/140, 3/140, 25/56, 3/140, 3/140, 3/140, 25/56],
         [25/56, 3/140, 3/140, 3/140, 3/140, 25/56, 3/140],
         [3/140, 3/140, 61/70, 3/140, 3/140, 3/140, 3/140],
         [3/140, 3/140, 25/56, 3/140, 3/140, 25/56, 3/140],
     ])

prev_Px = x0
Px = x0*P
i=0
while(any(abs(np.asarray(prev_Px).flatten()-np.asarray(Px).flatten()) > 1e-8)):
    i+=1
    prev_Px = Px
    Px = Px * P
print('Converged in {0} iterations: {1}'.format(i, np.asarray(Px).flatten()))


Converged in 39 iterations: [0.16911688 0.04196419 0.25324048 0.04196419 0.2572186  0.17669667
 0.05979897]


### Page 68-70

In [4]:
class Pr:

    def __init__(self, alpha):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.alpha = alpha

    def url_extractor(self):
        url_maps = {}
        all_urls = set([])

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                with open(os.path.join(self.crawled_folder, file), "r", encoding="utf-8") as f:
                    j = json.load(f)

                all_urls.add(j['url'])
                for s in j['url_lists']:
                    all_urls.add(s)
                url_maps[j['url']] = list(set(j['url_lists']))

        all_urls = list(all_urls)
        return url_maps, all_urls

    def pr_calc(self):
        url_maps, all_urls = self.url_extractor()
        url_matrix = pd.DataFrame(columns=all_urls, index=all_urls)

        for url in url_maps:
            if len(url_maps[url]) > 0 and len(all_urls) > 0:
                url_matrix.loc[url] = (1 - self.alpha) * (1 / len(all_urls))
                url_matrix.loc[url, url_maps[url]] = url_matrix.loc[url, url_maps[url]] + (self.alpha *
                    (1 / len(url_maps[url])))

        url_matrix.loc[url_matrix.isnull().all(axis=1), :] = (1 / len(all_urls))

        x0 = np.matrix([1 / len(all_urls)] * len(all_urls))
        P = np.asmatrix(url_matrix.values)

        prev_Px = x0
        Px = x0 * P
        i = 0
        while (any(abs(np.asarray(prev_Px).flatten() - np.asarray(Px).flatten()) > 1e-8)):
            i += 1
            prev_Px = Px
            Px = Px * P

        self.pr_result = pd.DataFrame(Px, columns=url_matrix.index, index=['score']).T.sort_values(by='score', ascending=False)
        return i, self.pr_result




In [5]:
if __name__ == '__main__':
    s = Pr(alpha=0.85)
    iterations, pr_results = s.pr_calc()


In [6]:
print(f'Converged in {iterations} iterations')

Converged in 3 iterations


In [7]:
print('Final PageRank Scores:')
(pr_results.sort_values(by='score', ascending=False))

Final PageRank Scores:


Unnamed: 0,score
https://cmu.ac.th/CurriculumLecturerPublicList.aspx,0.00423
https://cmu.ac.th/Identity/login.aspx,0.00423
https://www.facebook.com/legal/terms/,0.002765
https://www.facebook.com/r.php,0.002765
https://www.facebook.com/privacy/policy/,0.002765
...,...
https://cmu.ac.th/th/content/7F64EE14-922A-41B0-B247-E5DD81957E01,0.002289
https://cmu.ac.th/th/article/1d77b40b-a500-4a51-b6d9-f6675c454677,0.002289
https://cmu.ac.th/en/faculty/aboutus,0.002276
https://www.cmu.ac.th,0.002276


In [8]:
total_score = s.pr_result['score'].sum()
total_score

1.0000000000000027

### Page 71-75

In [9]:
class IndexerWithPR:

    def __init__(self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled'
        with open(self.crawled_folder / 'url_list.pickle', 'rb') as f:
            import pickle
            self.file_mapper = pickle.load(f)
        self.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "Em7zHso+d=46KI5rk5*I"), ca_certs="../http_ca.crt")
        self.pr = Pr(alpha=0.85)

    def run_indexer(self):
        self.pr.pr_calc()
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index='simple')
        self.es_client.options(ignore_status=[400]).indices.create(index='simple')

        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                with open(os.path.join(self.crawled_folder, file), 'r', encoding='utf-8') as f:
                    j = json.load(f)
                j['id'] = j['url']
                j['pagerank'] = self.pr.pr_result.loc[j['id']].score
                print(j)
                self.es_client.index(index='simple', body=j)


In [10]:
s = IndexerWithPR()
s.run_indexer()

{'url': 'https://cmu.ac.th/th/article/a8a1ff5d-1dc0-4600-9e19-5a08f873e0c7', 'title': 'ข่าวสาร : CMUBS ขยายเวลารับสมัครอาจารย์ สังกัดภาควิชาการตลาด (พนักงานมหาวิทยาลัยประจำ แบบเพิ่มศักยภาพ) - มหาวิทยาลัยเชียงใหม่', 'text': 'CMU             หลักสูตร    แยกตามคณะ    ค้นหาหลักสูตร    ปริญญาตรี    ปริญญาโท    ปริญญาเอก    หลักสูตรอื่นๆ     |   การศึกษา    การรับสมัครระดับปริญญาตรี    การรับสมัครระดับบัณฑิตศึกษา    การรับสมัครหลักสูตรนานาชาติ    ทุนอธิการบดีมหาวิทยาลัยเชียงใหม่     |   คณะและหน่วยงาน    คณะ    หน่วยงาน    ส่วนงานอื่นๆ     |   TH    EN    CN                                 TH  EN  CN     หลักสูตร    แยกตามคณะ    ค้นหาหลักสูตร    ปริญญาตรี    ปริญญาโท    ปริญญาเอก    หลักสูตรอื่นๆ      การศึกษา    การรับสมัครระดับปริญญาตรี    การรับสมัครระดับบัณฑิตศึกษา    การรับสมัครหลักสูตรนานาชาติ    ทุนอธิการบดีมหาวิทยาลัยเชียงใหม่      คณะและหน่วยงาน    คณะ    หน่วยงาน    ส่วนงานอื่นๆ       ข่าวสาร    ข่าวงานวิจัยและนวัตกรรม    ข่าวเด่น    ข่าวบุคคลเด่น    รางวัลและความภาคภูมิใจ    ประชุ

In [11]:
from flask import Flask, request
from  elasticsearch import Elasticsearch
import pandas as pd
import time

app = Flask(__name__)
app.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "Em7zHso+d=46KI5rk5*I"), ca_certs="../http_ca.crt")

@app.route('/search_es_pr', methods=['GET'])
def search_es_pr():
    start = time.time()
    response_object = {'status': 'success'}
    argList = request.args.to_dict(flat=False)
    query_term = argList['query'][0]
    results = app.es_client.search(index='simple', source_excludes=['url_lists'], size=100, query={"script_score": {"query": {"match": {"text": query_term}}, "script": {"source": "_score*doc['pagerank'].value"}}})
    end = time.time()
    total_hit = results['hits']['total']['value']
    results_df = pd.DataFrame([[hit["_source"]['title'], hit["_source"]['url'], hit["_source"]['text'][:100], hit["_score"]] for hit in results['hits']['hits']], columns=['title', 'url', 'text', 'score'])

    response_object['total_hit'] = total_hit
    response_object['results'] = results_df.to_dict('records')
    response_object['elapse'] = end - start

    return response_object

### Page 76

In [12]:
class BM25:
    def __init__(self, b=0.75, k1=1.6):
        """
        BM25 implementation using TfidfVectorizer for vectorization.
        """
        self.b = b
        self.k1 = k1
        self.vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True)

    def fit(self, X):
        """
        Fit the model with a list of documents X.
        """
        self.vectorizer.fit(X)
        self.y = self.vectorizer.transform(X)
        self.avdl = self.y.sum(axis=1).mean()  # Average document length

    def transform(self, q):
        """
        Calculate BM25 scores between query q and documents X.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # Apply CountVectorizer
        len_y = self.y.sum(axis=1).A1  # Length of each document
        q_vector = self.vectorizer.transform([q])  # Vectorize query
        assert sparse.isspmatrix_csr(q_vector)

        # Convert to csc for better column slicing
        y_csc = self.y.tocsc()[:, q_vector.indices]
        denom = y_csc + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer.idf_[None, q_vector.indices] - 1.0
        numer = y_csc.multiply(np.broadcast_to(idf, y_csc.shape)) * (k1 + 1)
        return (numer / denom).sum(axis=1).A1


In [None]:
from flask import Flask, request
import pickle
import pandas as pd
import time

# Load the manual BM25 index
with open('src/resource/manual_indexer.pkl', 'rb') as f:
    manual_index = pickle.load(f)

# Load BM25 and documents from the manual index
bm25 = manual_index['bm25']
documents = manual_index['documents']

# Load PageRank results (assuming you have a separate PageRank calculation object `pr_result`)
pr = Pr(alpha=0.85)  # Assuming you have a Pr class for PageRank calculation
_, pr_result = pr.pr_calc()  # This will give you the PageRank results

# Assuming the 'documents' dataframe has a 'url' field that matches the URLs in pr_result
# Add the PageRank score to your documents dataframe
documents['pagerank_score'] = documents['url'].map(pr_result['score'])

# Define the search endpoint for manual_index with PageRank integration
@app.route('/search_manual', methods=['GET'])
def search_manual():
    # Record start time for query execution
    start = time.time()
    response_object = {'status': 'success'}

    # Get the search query from request arguments
    query_term = request.args.get('query', '')

    if not query_term:
        return {"status": "error", "message": "Query term is missing"}, 400

    # Perform the search using BM25
    query_scores = bm25.transform(query_term)

    # Assign scores to documents
    documents["score"] = query_scores

    # Add combined score by multiplying BM25 score and PageRank score
    documents['combined_score'] = documents['score'] * documents['pagerank_score']

    # Sort documents based on the combined score (BM25 score * PageRank score)
    sorted_documents = documents.nlargest(10, "combined_score")[["url", "title", "text", "score", "pagerank_score", "combined_score"]]

    # Record end time for query execution
    end = time.time()

    # Prepare the response object
    response_object['total_hits'] = len(sorted_documents)
    response_object['results'] = sorted_documents.to_dict('records')
    response_object['elapse'] = end - start  # Query execution time

    return response_object

# Run the Flask app
if __name__ == '__main__':
    app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [None]:
if __name__ == '__main__':
    app.run(debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
