In [1]:
# Install spacy
# !pip3 install spacy

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Token

# Create a blank English pipeline; setup_token() below appends its labelling
# components to this shared module-level object.
nlp = spacy.blank("en")



In [3]:
def setup_token(label, words):
  """Register an ``is_<label>`` token flag and a pipeline component that sets it.

  Each entry of ``words`` becomes one Matcher pattern. An entry is split on
  whitespace and matched as a sequence of consecutive tokens, so a phrase such
  as "en passant" only matches when both tokens appear next to each other in
  that order (a lone "en" is not tagged).

  Args:
    label: Short category name (e.g. "piece"). Used to derive the extension
      attribute ``is_<label>``, the component name ``label_<label>`` and the
      matcher key.
    words: Iterable of words or whitespace-separated phrases. Matching uses
      the exact token text (ORTH), so it is case-sensitive. Empty or
      whitespace-only entries are ignored.

  Side effects:
    Registers the Token extension (overwriting an existing one), registers the
    component under ``label_<label>`` and appends it to the module-level
    ``nlp`` pipeline, replacing a component of the same name if present so the
    calling cell can be re-run.
  """
  token_label = f"is_{label}"
  pipe_name = f"label_{label}"
  matcher_name = label.upper()

  # Register the custom extension attribute; force=True keeps re-runs from failing
  Token.set_extension(token_label, default=False, force=True)

  # One pattern per entry, one token spec per whitespace-separated part
  matcher = Matcher(nlp.vocab)
  patterns = []
  for word in words:
    parts = word.split()
    if parts:  # an empty pattern is rejected by the Matcher, so skip blanks
      patterns.append([{"ORTH": part} for part in parts])

  if patterns:
    matcher.add(f"{matcher_name}_PATTERNS", patterns)

  # Custom component: flag every token covered by any match
  # NOTE(review): when this is re-run for an already registered label, confirm
  # that spaCy's component registry uses this new function (and its matcher)
  # rather than the previously registered one.
  @spacy.Language.component(pipe_name)
  def label_token(doc):
    for _match_id, start, end in matcher(doc):
      for token in doc[start:end]:
        token._.set(token_label, True)
    return doc

  # Add the component to the end of the pipeline; drop a stale copy first so
  # re-running the cell does not raise "already exists in pipeline"
  if pipe_name in nlp.pipe_names:
    nlp.remove_pipe(pipe_name)
  nlp.add_pipe(pipe_name, last=True)
  

In [4]:
# Vocabulary per category. Squares are generated rank by rank:
# a1..h1, a2..h2, ..., a8..h8
TOKEN_VOCABULARY = {
  "place": [f"{file}{rank}" for rank in range(1, 9) for file in "abcdefgh"],
  "piece": ["king", "queen", "bishop", "knight", "rook", "pawn"],
  "action": ["to", "move", "moves", "capture", "captures", "en passant", "promote", "promotes"],
  "color": ["white", "black"],
  "flag": ["check", "checkmate"],
}

# One extension attribute and one pipeline component per category, in dict order
for category, vocabulary in TOKEN_VOCABULARY.items():
  setup_token(label=category, words=vocabulary)

# Castling is currently disabled, so no `is_castle` attribute is registered and
# phrases such as "long side castle" are never tagged.
# setup_token(label="castle", words=["long side castle", "king side castle", "castle", "short side castle", "queen side castle"])



In [7]:

# Process a text and check the custom attribute
# text = "White moves pawn to e7"

def log_token(text, labels=("color", "action", "place", "piece", "flag", "castle")):
  """Run ``text`` through the pipeline and print every label set on each token.

  The text is lower-cased before processing because the matcher patterns are
  compared against the exact token text.

  Args:
    text: Sentence to analyse.
    labels: Category names to report, in the order they are printed for each
      token. A category whose ``is_<label>`` extension has not been registered
      is skipped instead of raising AttributeError.
  """
  doc = nlp(text.lower())
  print(f"Printing token {doc} : ")

  # Resolve once which of the requested labels actually exist as extensions
  active_labels = [
    (label.capitalize(), f"is_{label}")
    for label in labels
    if Token.has_extension(f"is_{label}")
  ]

  for token in doc:
    for heading, attribute in active_labels:
      if token._.get(attribute):
        print(f"{heading}: {token.text}")

  print("\n\n")

# Demo sentences: a regular move and a castling phrase
for sample_text in (
  "White moves pawn to e7 with check",
  "Long side castle with check",
):
  log_token(sample_text)

Printing token white moves pawn to e7 with check : 
Color: white
Action: moves
Piece: pawn
Action: to
Place: e7
Flag: check



Printing token long side castle with check : 
Flag: check



