In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install PyPDF2 sentence-transformers faiss-cpu 



Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   -------------------------------------- 232.6/232.6 kB 710.3 kB/s eta 0:00:00
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
   -------------------------------------- 470.2/470.2 kB 866.2 kB/s eta 0:00:00
Downloading faiss_cpu-1.11.0-cp311-cp311-win_amd64.whl (15.0 MB)
   ---------------------------------------- 15.0/15.0 MB 642.6 kB/s eta 0:00:00
Installing collected packages: PyPDF2, faiss-cpu, sentence-transformers
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.11.0 sentence-transformers-5.0.0



[notice] A new release of pip is available: 23.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 18.7/18.7 MB 897.8 kB/s eta 0:00:00
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3



[notice] A new release of pip is available: 23.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load a Board Game Rulebook and Extract Text

In [1]:
import fitz  # PyMuPDF

file_path = "ideabase_chessrules.pdf"  

doc = fitz.open(file_path)

full_text = ""
for page_num,page in enumerate(doc):
    page_text = page.get_text()
    full_text += f"\n--- Page {page_num + 1} ---\n" + page_text

print(full_text[:5000])  # Print the first 1000 characters to check the content
    


--- Page 1 ---
3
THE OFFICIAL RULES OF CHESS • CARDOZA PUBLISHING
Rules of
Chess
ERIC SCHILLER

--- Page 2 ---
THE OFFICIAL RULES OF CHESS • CARDOZA PUBLISHING
THE OFFICIAL RULES OF CHESS
The following are the standard rules of chess as applied in World
Championship competition. In later chapter we present some of the
variations of the rules used in amateur, scholastic, and online com-
petitions. These rules conform in most part to those of the world
chess federation (FIDE), but differ significantly from those found in
American tournaments conducted under the auspices of the United
States Chess Federation. Since most American tournaments are
amateur events, those rules are discussed in the section on ama-
teur rules.
This set of rules was composed by International Arbiter Eric
Schiller, with the cooperation and valuable assistance of Interna-
tional Arbiters Andrzej Filipowicz (Poland) and Yuri Averbakh (Rus-
sia). They were used verbatim in the 2000 Braingames.net World
Chess Champio

### Clean and Preprocess the Rulebook Text

In [3]:
import re

# ✅ Retain double newlines (paragraph breaks), collapse 3+ to exactly 2
cleaned_text = re.sub(r'\n{3,}', '\n\n', full_text)

# Step 2: Remove bullet characters or other non-standard symbols
cleaned_text = re.sub(r'[•·►\uf0b7\xa0]', '', cleaned_text)

# Step 3: Remove common footer/header artifacts (if you notice any patterns like "Page x" or "FIDE Laws of Chess")
cleaned_text = re.sub(r'Page\s+\d+', '', cleaned_text, flags=re.IGNORECASE)

# Step 4: Strip extra whitespace from beginning and end of each line
cleaned_text = '\n'.join([line.strip() for line in cleaned_text.splitlines()])

# Step 5: (Optional) Convert to lowercase for normalization
cleaned_text = cleaned_text.lower()  # Only if you're okay making everything lowercase

# Preview the cleaned result
print(cleaned_text[:1500])



---  ---
3
the official rules of chess  cardoza publishing
rules of
chess
eric schiller

---  ---
the official rules of chess  cardoza publishing
the official rules of chess
the following are the standard rules of chess as applied in world
championship competition. in later chapter we present some of the
variations of the rules used in amateur, scholastic, and online com-
petitions. these rules conform in most part to those of the world
chess federation (fide), but differ significantly from those found in
american tournaments conducted under the auspices of the united
states chess federation. since most american tournaments are
amateur events, those rules are discussed in the section on ama-
teur rules.
this set of rules was composed by international arbiter eric
schiller, with the cooperation and valuable assistance of interna-
tional arbiters andrzej filipowicz (poland) and yuri averbakh (rus-
sia). they were used verbatim in the 2000 braingames.net world
chess championship and were

In [4]:
print(cleaned_text[:5000])  # Print the first 5000 characters to check the cleaned content


---  ---
3
the official rules of chess  cardoza publishing
rules of
chess
eric schiller

---  ---
the official rules of chess  cardoza publishing
the official rules of chess
the following are the standard rules of chess as applied in world
championship competition. in later chapter we present some of the
variations of the rules used in amateur, scholastic, and online com-
petitions. these rules conform in most part to those of the world
chess federation (fide), but differ significantly from those found in
american tournaments conducted under the auspices of the united
states chess federation. since most american tournaments are
amateur events, those rules are discussed in the section on ama-
teur rules.
this set of rules was composed by international arbiter eric
schiller, with the cooperation and valuable assistance of interna-
tional arbiters andrzej filipowicz (poland) and yuri averbakh (rus-
sia). they were used verbatim in the 2000 braingames.net world
chess championship and were

### Chunking Text

In [None]:
# Split based on paragraph breaks
raw_chunks = cleaned_text.split('\n\n')

# Keep only meaningful chunks (e.g., more than 30 words)
chunks = []
for chunk in raw_chunks:
    chunk = chunk.strip()
    if len(chunk.split()) > 15: # Adjust the threshold as needed
        chunks.append(chunk)

# Show a few to confirm
for i, chunk in enumerate(chunks[:3]):
    print(f"\n=== Chunk {i+1} ===\n{chunk}")



=== Chunk 1 ===
---  ---
the official rules of chess  cardoza publishing
the official rules of chess
the following are the standard rules of chess as applied in world
championship competition. in later chapter we present some of the
variations of the rules used in amateur, scholastic, and online com-
petitions. these rules conform in most part to those of the world
chess federation (fide), but differ significantly from those found in
american tournaments conducted under the auspices of the united
states chess federation. since most american tournaments are
amateur events, those rules are discussed in the section on ama-
teur rules.
this set of rules was composed by international arbiter eric
schiller, with the cooperation and valuable assistance of interna-
tional arbiters andrzej filipowicz (poland) and yuri averbakh (rus-
sia). they were used verbatim in the 2000 braingames.net world
chess championship and were personally approved by world cham-
pion garry kasparov and his challenge

In [9]:
print(chunks)  # Check how many chunks we have



### Generate Embeddings

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


In [7]:
print(f"Total chunks embedded: {len(embeddings)}")
print(f"First vector shape: {embeddings[0].shape}")  # should be (384,)


Total chunks embedded: 16
First vector shape: (384,)
