## Part 1: Loading and Processing

In [1]:
import pandas as pd
import numpy as np
import spacy
import warnings
from transformers import pipeline
from wordsegment import load, segment
from deepmultilingualpunctuation import PunctuationModel
from gliner import GLiNER
from gliner.multitask import GLiNERClassifier

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Load the cleaned crossword dataset
df = pd.read_csv('deep_learning_nytcrosswords2021.csv')
df = df.sample(n=100)  # Use a small subset for faster processing
df['Word'] = df['Word'].str.lower()  # Convert answers to lowercase

# Define category labels for GLiNER classification
LABELS = [
    "Person", "Place", "Thing", "Animal", "Food", "Science", "Art", "Sport",
    "History", "Literature", "Music", "Brand", "Abbreviation", "Acronym",
    "Foreign", "Wordplay (Pun/Anagram/Homophone)", "Mythology", "Religion", "Vehicle", "Clothing",
    "Instrument", "Plant", "Event", "Concept", "Miscellaneous",
    "Slang", "Geography", "Object", "Technology", "Expression"
]

# Load necessary NLP models
print("Loading models...")
nlp = spacy.load("en_core_web_trf", disable=["parser"])  # Transformer-based model optimized for text processing
punctuation_model = PunctuationModel()  # Restores capitalization & punctuation
load()  # Load word segmentation model

# Load GLiNER multitask model for classification
model_id = "knowledgator/gliner-multitask-v1.0"
gliner_model = GLiNER.from_pretrained(model_id)
classifier = GLiNERClassifier(model=gliner_model)
print("Models loaded successfully!")

# Define helper functions
def restore_spacing(word):
    """Uses word segmentation to add spaces back to improperly formatted words."""
    return " ".join(segment(word.lower())).title()

def detect_multi_word(word):
    """Detects if an answer consists of multiple words using word segmentation."""
    return "MULTI-WORD" if len(segment(word.lower())) > 1 and not word.islower() else "SINGLE-WORD"

def classify_pos(word):
    """Classifies the part-of-speech (POS) for single and multi-word terms using spaCy."""
    doc = nlp(word)
    return " ".join([token.pos_ for token in doc if token.pos_ in ["VERB", "NOUN", "ADJ", "ADV"]]) if len(doc) > 1 else doc[0].pos_ if len(doc) > 0 else "UNKNOWN"

def classify_with_gliner(answer, clue, top_n=3):
    """Classifies a clue-answer pair using GLiNER and returns the top N predicted labels."""
    formatted_text = f"Clue: {clue}. Answer: {answer}"
    predictions = classifier(formatted_text, classes=LABELS, multi_label=True)
    predictions = predictions[0] if isinstance(predictions, list) and len(predictions) > 0 and isinstance(predictions[0], list) else predictions
    sorted_labels = sorted(predictions, key=lambda x: x["score"], reverse=True)[:top_n]
    return [f"{label['label']} ({label['score']:.2f})" for label in sorted_labels] if sorted_labels else ["Other"]

def lemmatize_word(word):
    """Lemmatizes words to their root form using spaCy."""
    return nlp(word)[0].lemma_ if nlp(word) else word

def process_dataframe(df):
    """
    Processes crossword words with the following steps:
    - Restores capitalization & punctuation
    - Adds spaces to improperly formatted multi-word entities
    - Performs named entity recognition (NER) with GLiNER
    - Classifies multi-word terms
    - Performs POS tagging
    - Lemmatizes words to their root form
    """
    df["Fixed Word"] = df["Word"].apply(lambda x: punctuation_model.restore_punctuation(x))
    df["Spaced Word"] = df["Fixed Word"].apply(restore_spacing)
    df["GLiNER Labels"] = df.apply(lambda row: classify_with_gliner(row["Spaced Word"], row["Clue"]), axis=1)
    df["Multi Word"] = df["Spaced Word"].apply(detect_multi_word)
    df["POS Tag"] = df["Spaced Word"].apply(classify_pos)
    df["Lemmatized Word"] = df["Spaced Word"].apply(lemmatize_word)
    return df

# Process the crossword dataset
df = process_dataframe(df)
print("Processing complete!")

# Display the first 20 rows of the processed dataset
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df.head(20))

Loading models...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Models loaded successfully!


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing complete!


Unnamed: 0,Date,Word,Clue,Character Count,Fixed Word,Spaced Word,GLiNER Labels,Multi Word,POS Tag,Lemmatized Word
5340,2021-03-07,icee,Drink similar to a slushie,4,icee.,Icee,[other (1.00)],SINGLE-WORD,PROPN,Icee
7899,2021-07-13,adroit,Proficient,6,adroit.,Adroit,"[Music (0.69), History (0.63), Science (0.58)]",SINGLE-WORD,PROPN,Adroit
14400,2021-10-31,pen,It might click for a writer,3,pen.,Pen,"[Literature (0.91), History (0.63)]",SINGLE-WORD,NOUN,pen
4533,2021-06-27,onbail,Temporarily out,6,onbail.,On Bail,"[Music (0.75), History (0.65), Literature (0.61)]",MULTI-WORD,NOUN,on
23293,2021-10-25,toad,Warty fly-catcher,4,toad.,Toad,"[Animal (0.89), Science (0.84), Thing (0.61)]",SINGLE-WORD,NOUN,toad
8041,2021-09-11,chad,"Lake ___, where the Chari River empties",4,chad.,Chad,[History (0.71)],SINGLE-WORD,PROPN,Chad
2582,2021-09-22,see,Observe,3,see:,See,"[History (0.73), Music (0.65), Literature (0.64)]",SINGLE-WORD,VERB,see
12816,2021-03-25,usa,Destination of Finnish mail that's addressed t...,3,usa.,Usa,[History (0.56)],SINGLE-WORD,PROPN,Usa
2575,2021-10-04,car,Word after bumper or cable,3,car.,Car,[Music (0.60)],SINGLE-WORD,NOUN,car
12946,2021-08-17,rent,Flat fee?,4,rent.,Rent,"[Music (0.63), History (0.55), Literature (0.55)]",SINGLE-WORD,NOUN,rent


In [2]:
df.to_csv('sampledatapostprocessing.csv', index = False)