In [2]:
#!/usr/bin/env python3
"""
Determine most similar words in terms of their word embeddings.
"""

from __future__ import annotations
import argparse
import logging
from pathlib import Path
from integerize import Integerizer   # look at integerize.py for more info

# Needed for Python's optional type annotations.
# We've included type annotations and recommend that you do the same, 
# so that mypy (or a similar package) can catch type errors in your code.
from typing import List, Optional

try:
    # PyTorch is your friend. Not using it will make your program so slow.
    # And it's also required for this assignment. ;-)
    # So if you comment this block out instead of dealing with it, you're
    # making your own life worse.
    import torch    
    import torch.nn as nn
except ImportError:
    print("\nERROR! You need to install Miniconda, then create and activate the nlp-class environment.  See the INSTRUCTIONS file.\n")
    raise


# log = logging.getLogger(Path(__file__).stem)  # The only okay global variable.

# Logging is in general a good practice to monitor the behavior of your code
# while it's running. Compared to calling `print`, it provides two benefits.
# 
# - It prints to standard error (stderr), not standard output (stdout) by
#   default.  So these messages will normally go to your screen, even if
#   you have redirected stdout to a file.  And they will not be seen by
#   the autograder, so the autograder won't be confused by them.
# 
# - You can configure how much logging information is provided, by
#   controlling the logging 'level'. You have a few options, like
#   'debug', 'info', 'warning', and 'error'. By setting a global flag,
#   you can ensure that the information you want - and only that info -
#   is printed. As an example:
#        >>> try:
#        ...     rare_word = "prestidigitation"
#        ...     vocab.get_counts(rare_word)
#        ... except KeyError:
#        ...     log.error(f"Word that broke the program: {rare_word}")
#        ...     log.error(f"Current contents of vocab: {vocab.data}")
#        ...     raise  # Crash the program; can't recover.
#        >>> log.info(f"Size of vocabulary is {len(vocab)}")
#        >>> if len(vocab) == 0:
#        ...     log.warning(f"Empty vocab. This may cause problems.")
#        >>> log.debug(f"The values are {vocab}")
#   If we set the log level to be 'INFO', only the log.info, log.warning,
#   and log.error statements will be printed. You can calibrate exactly how 
#   much info you need, and when. None of these pollute stdout with things 
#   that aren't the real 'output' of your program.
# 
# In `parse_args`, we provided two command-line options to control the logging level.
# The default level is 'INFO'. You can lower it to 'DEBUG' if you pass '--verbose'
# and you can raise it to 'WARNING' if you pass '--quiet'.
#
# More info: https://docs.python.org/3/howto/logging.html#logging-basic-tutorial
# 
# In all the starter code for the NLP course, we've elected to create a separate
# logger for each source code file, stored in a variable named log that
# is globally visible throughout the file.  That way, calls like log.info(...)
# will use the logger for the current source code file and thus their output will 
# helpfully show the filename.  You could configure the current file's logger using
# log.basicConfig(...), whereas logging.basicConfig(...) affects all of the loggers.
# The command-line options affect all of the loggers.


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("embeddings", type=Path, help="path to word embeddings file")
    parser.add_argument("word", type=str, help="word to look up")
    parser.add_argument("--minus", type=str, default=None)
    parser.add_argument("--plus", type=str, default=None)

    # for verbosity of logging
    parser.set_defaults(logging_level=logging.INFO)
    verbosity = parser.add_mutually_exclusive_group()
    verbosity.add_argument(
        "-v", "--verbose", dest="logging_level", action="store_const", const=logging.DEBUG
    )
    verbosity.add_argument(
        "-q", "--quiet",   dest="logging_level", action="store_const", const=logging.WARNING
    )

    args = parser.parse_args()
    if not args.embeddings.is_file():
        parser.error(f"Embeddings file {args.embeddings} not found")
    if (args.minus is None) != (args.plus is None):  # != is the XOR operation!
        parser.error("Must include both `--plus` and `--minus` or neither")

    return args


In [None]:
import numpy as np
from typing import Dict

class Lexicon:
    """
    Class that manages a lexicon and can compute similarity.

    >>> my_lexicon = Lexicon.from_file(my_file)
    >>> my_lexicon.find_similar_words("bagpipe")
    """

    def __init__(self, token_to_ix: Dict[str, int], ix_to_token: Dict[int, str],
                 token_embeddings_tensor: torch.Tensor, embedding_norms: torch.Tensor) -> None:
        """Load information into coupled word-index mapping and embedding matrix."""
        # FINISH THIS FUNCTION
        # Store your stuff! Both the word-index mapping and the embedding matrix.
        #
        # Do something with this size info?
        # PyTorch's torch.Tensor objects rely on fixed-size arrays in memory.
        # One of the worst things you can do for efficiency is
        # append row-by-row, like you would with a Python list.
        #
        # Probably make the entire list all at once, then convert to a torch.Tensor.
        # Otherwise, make the torch.Tensor and overwrite its contents row-by-row.
        self.token_to_ix = token_to_ix
        self.ix_to_token = ix_to_token
        self.token_embeddings_tensor = token_embeddings_tensor
        self.embedding_norms = embedding_norms

    @classmethod
    def from_file(cls, file: Path) -> Lexicon:
        return Lexicon.WordEmbeddingBuilder(file).build()

    class WordEmbeddingBuilder:
        def __init__(self, file_path: Path, normalize: bool = True):
            self.file_path = file_path
            self._normalized = normalize
            
        def _normalize(self, embedding: List[float]) -> List[float]:
            return embedding / np.linalg.norm(embedding)
        
        def build(self):
            token_list = []
            embedding_list = []

            with open(self.file_path) as f:
                first_line = next(f)  # Peel off the special first line.
                for line in f:  # All of the other lines are regular.
                    splits = line.strip().split()
                    if len(splits) == 0:
                        continue
                    token = splits[0] 
                    embedding = [float(split) for split in splits[1:]]
                    
                    token_list.append(token)
                    embedding_list.append(embedding)
                
                token_to_ix = dict(enumerate(token_list))
                index_to_token = {token: index for index, token in enumerate(token_list)}
                
                token_embeddings_tensor = torch.tensor(embedding_list, dtype=torch.float32)

                if self._normalized:
                    # for numerical stability
                    normalized = token_embeddings_tensor.norm(dim=1, keepdim=True)
                    token_embeddings_tensor = token_embeddings_tensor / (normalized + 1e-16)

                return Lexicon(token_to_ix, index_to_token, token_embeddings_tensor, token_embeddings_tensor.norm(dim=1))  
        
    def find_similar_words(
        self, word: str, *, plus: Optional[str] = None, minus: Optional[str] = None
    ) -> List[str]:
        """Find most similar words, in terms of embeddings, to a query."""
        # FINISH THIS FUNCTION

        # The star above forces you to use `plus` and `minus` only
        # as named arguments. This helps avoid mixups or readability
        # problems where you forget which comes first.
        #
        # We've also given `plus` and `minus` the type annotation
        # Optional[str]. This means that the argument may be None, or
        # it may be a string. If you don't provide these, it'll automatically
        # use the default value we provided: None.
        if (minus is None) != (plus is None):  # != is the XOR operation!
            raise TypeError("Must include both of `plus` and `minus` or neither.")
        # Keep going!
        # Be sure that you use fast, batched computations
        # instead of looping over the rows. If you use a loop or a comprehension
        # in this function, you've probably made a mistake.
        if word not in self.token_to_ix:
            raise ValueError(f"Word {word} not found in lexicon")
        
        # Handle word analogy: word + plus - minus
        if plus is not None and minus is not None:
            if plus not in self.token_to_ix:
                raise ValueError(f"Word {plus} not found in lexicon")
            if minus not in self.token_to_ix:
                raise ValueError(f"Word {minus} not found in lexicon")
            
            # Compute analogy: word + plus - minus
            query_embedding = (self.token_embeddings_tensor[self.token_to_ix[word]] + 
                             self.token_embeddings_tensor[self.token_to_ix[plus]] - 
                             self.token_embeddings_tensor[self.token_to_ix[minus]])
            # Normalize the query embedding
            query_embedding = query_embedding / (torch.norm(query_embedding) + 1e-16)
        else:
            # Regular similarity search
            query_ix = self.token_to_ix[word]
            query_embedding = self.token_embeddings_tensor[query_ix]
        
        # Compute cosine similarities using matrix multiplication
        # Since embeddings are already normalized, this gives us cosine similarity
        similarities = torch.matmul(self.token_embeddings_tensor, query_embedding)
        
        # Exclude the query word itself from results
        if plus is None and minus is None:  # Only exclude for regular similarity, not analogy
            query_ix = self.token_to_ix[word]
            similarities[query_ix] = -float('inf')
        
        # Get top 10 most similar words
        topk_ix = torch.topk(similarities, k=10).indices
        
        return [self.ix_to_token[ix.item()] for ix in topk_ix]
        

In [7]:
", ".join(['apple', 'banana', 'orange'])

'apple, banana, orange'

In [4]:
def main():
    args = parse_args()
    logging.basicConfig(level=args.logging_level)
    lexicon = Lexicon.from_file(args.embeddings)
    similar_words = lexicon.find_similar_words(
        args.word, plus=args.plus, minus=args.minus
    )
    print(" ".join(similar_words))  # print all words on one line, separated by spaces


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--minus MINUS] [--plus PLUS] [-v | -q]
                             embeddings word
ipykernel_launcher.py: error: the following arguments are required: embeddings, word


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
token_list = ["apple", "banana", "orange"]

In [None]:
token_to_ix = dict(enumerate(token_list))
index_to_token = {token: index for token, index in token_to_ix.items()}

In [13]:
dict(enumerate(token_list))

{0: 'apple', 1: 'banana', 2: 'orange'}

In [11]:
index_to_token

{0: 'apple', 1: 'banana', 2: 'orange'}

In [12]:
token_to_ix

{'apple': 0, 'banana': 1, 'orange': 2}

In [1]:
from findsim import Lexicon
def findsim(file_path, word, plus=None, minus=None):
    lexicon = Lexicon.from_file(file_path)
    similar_words = lexicon.find_similar_words(
        word, plus=plus, minus=minus
    )
    print(" ".join(similar_words))

In [5]:
words = ["seattle", "dog", "communist", "jpg", "the", "google"]
file_path = ["lexicons/words-10.txt", "lexicons/words-20.txt", "lexicons/words-50.txt", 
             "lexicons/words-100.txt", "lexicons/words-200.txt"]

for file in file_path:
    print(f"d = {file.split('.')[0].split('-')[-1]}")
    for word in words:
        print(f"Word: {word}")
        findsim(file, word)
    print("\n")

d = 10
Word: seattle
seattle indianapolis atlanta lakers dallas expos boston detroit cleveland houston
Word: dog
dog turnip coronets ass pig embroidered eyed cow unicorns haired
Word: communist
communist socialist rightist udt comintern communists fascist instigated bolshevik leftist
Word: jpg
jpg hout maui hino ledger lsch storey bahnhof longship monte
Word: the
the marked reintroduced successive split gradually contention sway intervening changed
Word: google
google info geocaching archiving downloadable digitized web com printing bitnet


d = 20
Word: seattle
seattle atlanta dallas miami canucks indianapolis cincinnati lauderdale florida knicks
Word: dog
dog cat dogs ass badger hound sighthound canine azawakh bikini
Word: communist
communist socialist communists comintern fascist trotskyist cnt agitating bolshevik nationalist
Word: jpg
jpg png colspan ffffff bratislava verbena gallery hoek alster hopewell
Word: the
the of in within its between entire over part uninterrupted
Word: go

KeyboardInterrupt: 

In [2]:
words = [("bank", "river", "money"), ("apple", "fruit", "company"),
         ("python", "animal", "programming"), ("nurse", "woman", "man"),
         ("beautiful", "more", "most")]
file_path = ["lexicons/words-10.txt", "lexicons/words-20.txt", "lexicons/words-50.txt", 
             "lexicons/words-100.txt", "lexicons/words-200.txt"]

for file in file_path:
    print(f"d = {file.split('.')[0].split('-')[-1]}")
    for word in words:
        print(f"{word[0]} - {word[1]} + {word[2]}")
        findsim(file, word[0], minus=word[1], plus=word[2])
    print("\n")

d = 10
bank - river + money
funding gst bankruptcy credit reporting shareholder firm profit deferred seignorage
apple - fruit + company
ibm commodore pricewaterhousecoopers mca discontinued revamped isu honeywell tramiel amd
python - animal + programming
minix multiplan ms netbsd openstep odp ported fgu basica slate
nurse - woman + man
huck boyfriend phoney girlfriend housewife girlfriends bookish dugan belson obsessed
beautiful - more + most
supergroup goldsmiths beltaine dawn artworks minerva ecd coffins whirling tuileries


d = 20
bank - river + money
credit creditors contracts taxpayer premiums compensation payment repayment profits repay
apple - fruit + company
tramiel microcomputer ibm buyout corp inprise commodore bushnell compaq mattel
python - animal + programming
terseness kernighan applescript irssi perl awk niklaus scripting rexx tads
nurse - woman + man
frink parris onetime lister bunter indefatigable doctor dentist spotswood groote
beautiful - more + most
quaint famous bi

In [39]:
bank - river + money
apple - fruit + company
python - animal + programming
nurse - woman + man
beautiful - more + most

NameError: name 'bank' is not defined