In [32]:
%run -i "../util/file_utils.ipynb"
%run -i "../util/lang_utils.ipynb"

In [33]:
import time

# Part of speech tagging using spaCy

In [34]:
def pos_tag_spacy(text, model):
    """Tag every token of ``text`` with its part of speech.

    Args:
        text: The string to analyse.
        model: A callable that turns ``text`` into an iterable of tokens
            exposing ``text`` and ``pos_`` attributes (for example a loaded
            spaCy pipeline).

    Returns:
        A list of ``(token_text, pos_tag)`` tuples, one per token, in the
        order the tokens are produced by ``model``.
    """
    # Every token yielded by the model is kept, so whatever the model treats
    # as a token (including whitespace tokens, if any) appears in the result.
    return [(token.text, token.pos_) for token in model(text)]

In [35]:
# Load the sample text and tag it with spaCy, then print the (word, tag) pairs.
# NOTE(review): read_text_file and small_model are not defined in this
# notebook; presumably they come from the utility notebooks loaded with %run
# at the top -- confirm.
text = read_text_file("../data/sherlock_holmes_1.txt")
words_with_pos = pos_tag_spacy(text, small_model)
print(words_with_pos)

[('To', 'ADP'), ('Sherlock', 'PROPN'), ('Holmes', 'PROPN'), ('she', 'PRON'), ('is', 'AUX'), ('always', 'ADV'), ('_', 'PUNCT'), ('the', 'DET'), ('_', 'PROPN'), ('woman', 'NOUN'), ('.', 'PUNCT'), ('I', 'PRON'), ('have', 'AUX'), ('seldom', 'ADV'), ('heard', 'VERB'), ('him', 'PRON'), ('\n', 'SPACE'), ('mention', 'VERB'), ('her', 'PRON'), ('under', 'ADP'), ('any', 'DET'), ('other', 'ADJ'), ('name', 'NOUN'), ('.', 'PUNCT'), ('In', 'ADP'), ('his', 'PRON'), ('eyes', 'NOUN'), ('she', 'PRON'), ('eclipses', 'VERB'), ('and', 'CCONJ'), ('\n', 'SPACE'), ('predominates', 'VERB'), ('the', 'DET'), ('whole', 'NOUN'), ('of', 'ADP'), ('her', 'PRON'), ('sex', 'NOUN'), ('.', 'PUNCT'), ('It', 'PRON'), ('was', 'AUX'), ('not', 'PART'), ('that', 'SCONJ'), ('he', 'PRON'), ('felt', 'VERB'), ('any', 'DET'), ('emotion', 'NOUN'), ('\n', 'SPACE'), ('akin', 'ADJ'), ('to', 'PART'), ('love', 'VERB'), ('for', 'ADP'), ('Irene', 'PROPN'), ('Adler', 'PROPN'), ('.', 'PUNCT'), ('All', 'DET'), ('emotions', 'NOUN'), (',', 'PUNC

# Part of speech tagging using NLTK

In [36]:
#nltk.download('averaged_perceptron_tagger') # Run the first time you run the notebook
def pos_tag_nltk(text):
    """Tokenize ``text`` and tag the resulting tokens with NLTK.

    Args:
        text: The string to analyse.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the tokens produced by
        ``word_tokenize_nltk(text)``.
    """
    return nltk.pos_tag(word_tokenize_nltk(text))

In [37]:
# Tag the same text with NLTK and print the result.
# This rebinds words_with_pos, replacing the spaCy result from the earlier cell.
words_with_pos = pos_tag_nltk(text)
print(words_with_pos)

[('To', 'TO'), ('Sherlock', 'NNP'), ('Holmes', 'NNP'), ('she', 'PRP'), ('is', 'VBZ'), ('always', 'RB'), ('_the_', 'JJ'), ('woman', 'NN'), ('.', '.'), ('I', 'PRP'), ('have', 'VBP'), ('seldom', 'VBN'), ('heard', 'RB'), ('him', 'PRP'), ('mention', 'VB'), ('her', 'PRP'), ('under', 'IN'), ('any', 'DT'), ('other', 'JJ'), ('name', 'NN'), ('.', '.'), ('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('not', 'RB'), ('that', 'IN'), ('he', 'PRP'), ('felt', 'VBD'), ('any', 'DT'), ('emotion', 'NN'), ('akin', 'NN'), ('to', 'TO'), ('love', 'VB'), ('for', 'IN'), ('Irene', 'NNP'), ('Adler', 'NNP'), ('.', '.'), ('All', 'DT'), ('emotions', 'NNS'), (',', ','), ('and', 'CC'), ('that', 'IN'), ('one', 'CD'), ('particularly', 'RB'), (',', ','), ('were', 'VBD'), ('abhorrent', 'JJ'), ('to', 'TO'), ('his', 'PRP$'), ('co

# Compare running times for NLTK and spaCy

In [38]:
import time

# Time one tagging pass of each library over the same text.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas the
# wall clock can jump if the system time is adjusted.
start = time.perf_counter()
pos_tag_nltk(text)
print(f"NLTK: {time.perf_counter() - start} s")

start = time.perf_counter()
pos_tag_spacy(text, small_model)
print(f"spaCy: {time.perf_counter() - start} s")

NLTK: 0.006206035614013672 s
spaCy: 0.02251291275024414 s


# Get part of speech tags using GPT-3.5

In [39]:
from openai import OpenAI
# Create the API client that the GPT cells below pass requests through.
# NOTE(review): OPEN_AI_KEY is not defined in this notebook; presumably it is
# provided by one of the utility notebooks loaded with %run -- confirm.
client = OpenAI(api_key=OPEN_AI_KEY)

In [54]:
# One-off request: ask the chat model to tag a single hard-coded sentence and
# print the whole response object.
# NOTE(review): the output saved under this cell is the message text rather
# than the response object's repr, so the saved outputs of this cell and the
# next one look swapped -- re-run both cells to refresh them.
prompt="""Decide what the part of speech tags are for a sentence. 
Preserve original capitalization. 
Return the list in the format of a python tuple: (word, part of speech). 
Sentence: In his eyes she eclipses and predominates the whole of her sex."""
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=256,
    top_p=1.0,
    frequency_penalty=0,
    presence_penalty=0,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ], 
)
print(response)

Here are the part of speech tags for the sentence "In his eyes she eclipses and predominates the whole of her sex" in the format of a Python tuple:

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')]


In [58]:
# Print only the text of the first choice's message from the response.
# NOTE(review): the output saved under this cell is the full response object
# repr, not the message text -- re-run to refresh it.
print(response.choices[0].message.content)

ChatCompletion(id='chatcmpl-9hCq34UAzMiNiqNGopt2U8ZmZM5po', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are the part of speech tags for the sentence "In his eyes she eclipses and predominates the whole of her sex" in the format of a Python tuple:\n\n[(\'In\', \'IN\'), (\'his\', \'PRP$\'), (\'eyes\', \'NNS\'), (\'she\', \'PRP\'), (\'eclipses\', \'VBZ\'), (\'and\', \'CC\'), (\'predominates\', \'VBZ\'), (\'the\', \'DT\'), (\'whole\', \'JJ\'), (\'of\', \'IN\'), (\'her\', \'PRP$\'), (\'sex\', \'NN\')]', role='assistant', function_call=None, tool_calls=None))], created=1720084483, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=74, total_tokens=194))


In [52]:
from ast import literal_eval

def pos_tag_gpt(text, client, model="gpt-3.5-turbo", max_tokens=256):
    """Ask a chat model for the part-of-speech tags of a sentence.

    Args:
        text: The sentence to tag. It is inserted into the prompt as-is.
        client: An object exposing ``chat.completions.create(...)`` whose
            return value provides ``choices[0].message.content`` as a string.
        model: Name of the chat model to request. Defaults to the model the
            notebook originally hard-coded.
        max_tokens: Upper bound on the length of the reply. A reply that is
            cut off by this limit will not parse, so raise it for long input.

    Returns:
        ``list(...)`` of the Python literal found in the reply; for a
        well-formed reply this is a list of ``(word, tag)`` tuples.

    Raises:
        SyntaxError, ValueError: If the reply is not a valid Python literal
            and no bracketed list that parses can be found inside it.
    """
    # The prompt text is kept verbatim so the request itself is unchanged.
    prompt = f"""Decide what the part of speech tags are for a sentence. 
    Preserve original capitalization. 
    Return the list in the format of a python tuple: (word, part of speech).
    Do not include any other explanations.
    Sentence: {text}."""

    response = client.chat.completions.create(
        model=model,
        temperature=0,
        max_tokens=max_tokens,
        top_p=1.0,
        frequency_penalty=0,
        presence_penalty=0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    reply = response.choices[0].message.content.replace("\n", "")
    try:
        # Fast path: the reply is exactly a Python literal, as requested.
        parsed = literal_eval(reply)
    except (SyntaxError, ValueError):
        # The model sometimes wraps the list in an introductory sentence or a
        # code fence despite the instruction; fall back to the outermost
        # [...] span. If there is none, surface the original parse error.
        first, last = reply.find("["), reply.rfind("]")
        if first == -1 or last < first:
            raise
        parsed = literal_eval(reply[first:last + 1])
    return list(parsed)

In [53]:
# Time only the GPT tagging call: the elapsed time is captured right after the
# call returns so that printing the result is not part of the measurement.
# time.perf_counter() is a monotonic clock meant for measuring durations.
first_sentence = "In his eyes she eclipses and predominates the whole of her sex."
start = time.perf_counter()
words_with_pos = pos_tag_gpt(first_sentence, client)
gpt_elapsed = time.perf_counter() - start
print(words_with_pos)
print(f"GPT: {gpt_elapsed} s")

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')]
GPT: 2.4942469596862793 s


In [50]:
# Tag the same single sentence with NLTK so it can be compared with the GPT result.
words_with_pos_nltk = pos_tag_nltk(first_sentence)
print(words_with_pos_nltk)

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.')]


In [51]:
# Strict list equality between the GPT tags and the NLTK tags.
# In the saved outputs the two lists differ: NLTK ends with a ('.', '.') token
# that the GPT list lacks, and 'whole' is tagged NN by NLTK but JJ by GPT.
print(words_with_pos == words_with_pos_nltk)

False


In [59]:
# Ratio of the saved GPT timing to the saved NLTK timing; both literals are
# copied by hand from the outputs of the timing cells above.
# NOTE(review): the NLTK figure was measured on the text loaded from the file,
# while the GPT figure was measured on a single sentence, so this ratio is not
# a like-for-like comparison.
print(2.4942469596862793/0.006206035614013672)

401.9066461774875
