### Import

In [18]:
from urllib.parse import urlparse, urljoin
from pathlib import Path
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
import pickle
import requests
from bs4 import BeautifulSoup, Comment
import json

### Page 29-33

In [19]:
class MultiThreadCrawler:
    def __init__(self, base_url, depth):
        """
        Initialize the MultiThreadCrawler with base URL and depth
        """
        self.base_url = base_url #Entry point
        extracted_url = urlparse(base_url)
        parent = extracted_url.path[:extracted_url.path.find("/") + 1]
        self.root_url = "{}://{}{}".format(extracted_url.scheme, extracted_url.netloc, parent)
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count() - 1)
        self.to_crawl = Queue()  # Queue to store URLs to be crawled / Create a queue
        self.to_crawl.put({self.base_url: depth})  # Add the entry url to the queue
        self.stored_folder = Path(Path().absolute()).parent / "crawled/"

        if not Path(self.stored_folder).exists():
            Path.mkdir(self.stored_folder)

        # Load crawled pages if already saved
        if Path(self.stored_folder / "url_list.pickle").exists(): # Continue from the saved work or start a new one
            with open(self.stored_folder / "url_list.pickle", "rb") as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set()

    def extract_page(self, obj):
        """
        Extract content and links from a downloaded page
        """
        if obj.result():
            result, url, depth = obj.result()
            if result and result.status_code == 200:
                url_lists = self.parse_links(result.text, depth)
                self.parse_contents(url, result.text, url_lists)

    def get_page(self, url, depth):
        """
        Download a webpage
        """
        try:
            res = requests.get(url, timeout=(3, 30))
            return res, url, depth
        except requests.RequestException: #Download a webpage
            return

    def parse_links(self, html, depth):
        """
        Extract all links from the HTML content of a page
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        url_lists = []
        for link in links: # Find all links by extracting all the <a href> tags
            url = link['href']
            url = urljoin(self.root_url, url)
            if depth >= 0 and ".." not in url and url not in self.crawled_pages:
                print("Adding {}".format(url))
                # Enqueue the URL if it has not been visited and has not reached the depth threshold.
                self.to_crawl.put({url: depth})
                url_lists.append(url)
        return url_lists

    def parse_contents(self, url, html, url_lists):
        """
        Extract visible text and save it along with the URL and links
        """
        def tag_visible(element):
            if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
                return False
            if isinstance(element, Comment):
                return False
            return True

        try:
            soup = BeautifulSoup(html, 'html.parser')
            texts = soup.findAll(string=True)
            visible_texts = filter(tag_visible, texts) # Get all the texts from visible tags
            title = soup.find('title').string.strip()
            text = u" ".join(t.strip() for t in visible_texts).strip()
            # Store the extracted links in a JSON format and write the JSON data to a text file.
            with open(self.stored_folder / (str(hash(url)) + '.txt'), 'w', encoding='utf-8') as f:
                json.dump({'url': url, 'title': title, 'text': text, 'url_lists': url_lists}, f, ensure_ascii=False)
        except:
            pass

    def run_scraper(self):
        """
        Continuously process the queue of URLs to scrape content
        """
        while True:
            try:
                # Get the first URL from the queue
                target = self.to_crawl.get(timeout=10) # Dequeue the first entry
                url, depth = [(k, target[k]) for k in target][0]

                # Skip if the URL has already been crawled
                if url not in self.crawled_pages: # If unprocessed
                    self.crawled_pages.add(url)

                    # Submit the task to the thread pool
                    job = self.pool.submit(self.get_page, url, depth - 1)
                    job.add_done_callback(self.extract_page)

            except Empty:
                # Save crawled pages to a file when the queue is empty
                with open(self.stored_folder / "url_list.pickle", "wb") as f:
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL)
                # The task is done when there is no remaining URL in the queue.
                with open(self.stored_folder / "url_list.pickle", "rb") as f:
                    print(pickle.load(f))
                break

            except Exception as e:
                print(e)
                continue


### Page 33

In [20]:
if __name__ == '__main__':
    s = MultiThreadCrawler("https://cmu.ac.th/en/faculty/aboutus", 2)
    s.run_scraper()

Adding https://cmu.ac.th/Controls/ShareContent/
Adding https://cmu.ac.th/course
Adding https://mis.cmu.ac.th/TQF/TQF2/CurriculumPublicList.aspx
Adding https://cmu.ac.th/level/bachelor
Adding https://cmu.ac.th/level/master_bachelor
Adding https://cmu.ac.th/level/phd
Adding https://cmu.ac.th/Level/other
Adding https://cmu.ac.th/Controls/ShareContent/
Adding https://cmu.ac.th/content/F0917C78-2125-4FFD-AF68-BE9E4F5E0D36
Adding https://cmu.ac.th/content/E13BF8C0-7C77-46BA-BEA1-56392A967AEF
Adding https://cmu.ac.th/content/425815A0-90AB-4F39-9DA9-8D5DEA9CDBED
Adding https://cmu.ac.th/content/D4F81742-3F7A-4197-8286-4B568129DCF1
Adding https://cmu.ac.th/Controls/ShareContent/
Adding https://cmu.ac.th/aboutus
Adding https://cmu.ac.th/organization
Adding https://cmu.ac.th/content/7A7616BC-C917-407E-8ED9-9F544D3416BA
Adding https://cmu.ac.th/th/faculty/aboutus
Adding https://cmu.ac.th/aboutus
Adding https://cmu.ac.th/cn/faculty/aboutus
Adding javascript:void(0)
Adding javascript:__doPostBack('c

### Page 35

### PreProcess

In [23]:
from nltk import word_tokenize
import re


def preProcess(s):
    s = s.lower()
    s = re.sub(r'[^A-Za-z]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = word_tokenize(s)
    return ' '.join(s)

In [25]:
import os
import pickle
import json
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from rank_bm25 import BM25

class Indexer:
    def __init__(self, is_reset=False):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/'
        self.stored_file = 'src/resource/manual_indexer.pkl'

        if not is_reset and os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
                self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    def run_indexer(self):
        documents = []
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                with open(os.path.join(self.crawled_folder, file)) as f:
                    j = json.load(f)
                    documents.append(j)

        self.documents = pd.DataFrame(documents)

        tfidf_vectorizer = TfidfVectorizer(preprocessor=preProcess, stop_words=stopwords.words('english'))
        self.bm25 = BM25(tfidf_vectorizer)
        self.bm25.fit(self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1))

        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f)


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy import sparse
import pandas as pd
import json
import os
import pickle
from pathlib import Path
from tabulate import tabulate
import re

In [31]:
class BM25:
    def __init__(self, b=0.75, k1=1.6):
        """
        BM25 implementation using TfidfVectorizer for vectorization.
        """
        self.b = b
        self.k1 = k1
        self.vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True)

    def fit(self, X):
        """
        Fit the model with a list of documents X.
        """
        self.vectorizer.fit(X)
        self.y = self.vectorizer.transform(X)
        self.avdl = self.y.sum(axis=1).mean()  # Average document length

    def transform(self, q):
        """
        Calculate BM25 scores between query q and documents X.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # Apply CountVectorizer
        len_y = self.y.sum(axis=1).A1  # Length of each document
        q_vector = self.vectorizer.transform([q])  # Vectorize query
        assert sparse.isspmatrix_csr(q_vector)

        # Convert to csc for better column slicing
        y_csc = self.y.tocsc()[:, q_vector.indices]
        denom = y_csc + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer.idf_[None, q_vector.indices] - 1.0
        numer = y_csc.multiply(np.broadcast_to(idf, y_csc.shape)) * (k1 + 1)
        return (numer / denom).sum(axis=1).A1


In [32]:
class WebIndexer:
    def __init__(self, is_reset=False):
        """
        Initialize the WebIndexer and load preprocessed data if exists.
        """
        self.crawled_folder = Path(Path().absolute()).parent / "crawled/"
        self.stored_file = 'src/resource/manual_indexer.pkl'

        # Load the cached index if it exists and is_reset is False
        if not is_reset and os.path.isfile(self.stored_file):
            with open(self.stored_file, "rb") as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()

    @staticmethod
    def preprocess_text(text):
        """
        Preprocess text by lowercasing, removing punctuation, and cleaning spaces.
        """
        # Lowercase text
        text = text.lower()
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def run_indexer(self):
        """
        Load documents from crawled folder, preprocess, and index them.
        """
        documents = []

        # Load all .txt files from crawled folder
        for file in os.listdir(self.crawled_folder):
            if file.endswith(".txt"):
                file_path = os.path.join(self.crawled_folder, file)

                # Read the file with UTF-8 encoding and handle errors
                with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                    try:
                        j = json.load(f)
                        if 'title' in j and 'text' in j:
                            # Preprocess title and text
                            j['title'] = self.preprocess_text(j['title'])
                            j['text'] = self.preprocess_text(j['text'])
                            documents.append(j)
                        else:
                            print(f"Skipped file {file}: Missing 'title' or 'text'")
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON in {file}: {e}")
                    except Exception as e:
                        print(f"Error reading {file}: {e}")

        if not documents:
            raise ValueError("No valid documents found. Ensure crawled/ folder contains valid .txt files.")

        # Preprocess and create BM25 index
        self.documents = pd.DataFrame.from_dict(documents)
        corpus = self.documents.apply(lambda s: ' '.join(s[['title', 'text']]), axis=1).tolist()

        if not corpus:
            raise ValueError("Corpus is empty. Ensure documents contain 'title' and 'text' fields.")

        # Initialize BM25 and fit the corpus
        self.bm25 = BM25()
        self.bm25.fit(corpus)

        # Ensure directory exists before saving
        os.makedirs(os.path.dirname(self.stored_file), exist_ok=True)

        # Save the final processed data
        with open(self.stored_file, "wb") as f:
            pickle.dump(self.__dict__, f)

    def search_query(self, query, top_n=5):
        """
        Perform a search query on the indexed data.
        """
        # Preprocess query
        query = self.preprocess_text(query)
        query_scores = self.bm25.transform(query)

        if not any(query_scores):
            raise ValueError("No documents matched the query. Check the query or indexed data.")

        self.documents["score"] = query_scores
        results = self.documents.nlargest(top_n, "score")[["url", "title", "text", "score"]]

        return results


In [33]:
if __name__ == "__main__":
    indexer = WebIndexer(is_reset=False)

    query = "school"
    try:
        results = indexer.search_query(query, top_n=5)
        print(tabulate(results, headers="keys", tablefmt="grid"))
    except ValueError as e:
        print(f"Error: {e}")

+-----+---------------------------------------------+----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [35]:
indexer.documents

Unnamed: 0,url,title,text,url_lists,score
0,https://portal.cmu.ac.th/,itsc services,itsc searchtype services search all students s...,[],0.000000
1,https://mis.cmu.ac.th/publication,home page,สารสนเทศผลงานทางวชาการ มหาวทยาลยเชยงใหม ปเผยแพ...,[],0.000000
2,https://shop.cmu.ac.th/,มลนธพฒนามหาวทยาลยเชยงใหม จำหนายของทระลก 60 ป มช,skip to content มลนธพฒนามหาวทยาลยเชยงใหม จำหนา...,[],0.000000
3,https://mis.cmu.ac.th/TQF/TQF2/CurriculumPubli...,,หลกสตร รายการหลกสตร อาจารยผรบผดชอบหลกสตร year ...,[https://cmu.ac.th/CurriculumLecturerPublicLis...,0.285078
4,https://cmu.ac.th/th/content/D7ABB770-4C2B-41E...,มหาวทยาลยเชยงใหม chiang mai university thailand,cmu หลกสตร แยกตามคณะ คนหาหลกสตร ปรญญาตร ปรญญาโ...,[],0.000000
...,...,...,...,...,...
216,https://www.facebook.com/cmuofficial/,มหาวทยาลยเชยงใหม chiang mai university chiang mai,,[],0.000000
217,https://cmu.ac.th/th/faculty/level/master_bach...,มหาวทยาลยเชยงใหม chiang mai university thailand,cmu หลกสตร แยกตามคณะ คนหาหลกสตร ปรญญาตร ปรญญาโ...,[],0.148912
218,https://cmu.ac.th/th/60years/article/7d7325f9-...,ขาวสาร เชญชมการแสดงขบรองประสานเสยง โดย วง cmu ...,cmu th en cn home about us สารจากอธการ ประวต ม...,[],0.000000
219,https://cmu.ac.th/th/60years/article/84a6fc60-...,ขาวสาร รวมฉลอง 60 ป มช ประกวด ภาพแหงความทรงจำ ...,cmu th en cn home about us สารจากอธการ ประวต ม...,[],0.000000


In [36]:
results = indexer.search_query("itsc")

In [37]:
results

Unnamed: 0,url,title,text,score
0,https://portal.cmu.ac.th/,itsc services,itsc searchtype services search all students s...,1.958642
134,https://mis.cmu.ac.th/CMUBuildings/,cmu building system,cmu building system ระบบสารสนเทศ และ จดการขอมล...,1.810092
126,https://itsc.cmu.ac.th/,สำนกบรการเทคโนโลยสารสนเทศ มหาวทยาลยเชยงใหม,239 huay kaew roadmuang district chiang mai th...,1.516517
204,https://e-council.cmu.ac.th/,cmu account sign in with cmu account,sign in to continue to cmu ecouncil email addr...,1.44118
206,https://oauth.cmu.ac.th/v1/Authorize.aspx?resp...,cmu account sign in with cmu account,sign in to continue to สบคนมตทประชมสภามหาวทยาล...,1.421396


### Elasticsearch & python

In [5]:
from elasticsearch import Elasticsearch

es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "9L43t2V91jSE8k1Yh5+Q"),ca_certs="../http_ca.crt")
es.info().body

{'name': 'af716ff81605',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'D_w8te2pR-6RHxuRTFRu9A',
 'version': {'number': '8.12.2',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '48a287ab9497e852de30327444b0809e55d46466',
  'build_date': '2024-02-19T10:04:32.774273190Z',
  'build_snapshot': False,
  'lucene_version': '9.9.2',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}