In [1]:
%run -i "../util/file_utils.ipynb"
%run -i "../util/lang_utils.ipynb"

# Part of speech tagging using NLTK

In [2]:
#nltk.download('averaged_perceptron_tagger') # Run the first time you run the notebook
def pos_tag_nltk(text):
    """Tokenize ``text`` and run NLTK's part-of-speech tagger over the tokens.

    Args:
        text: Raw text to tag.

    Returns:
        The result of ``nltk.pos_tag`` applied to the tokens produced by
        ``word_tokenize_nltk(text)``.
    """
    # word_tokenize_nltk and nltk are not defined in this notebook; presumably
    # they come from the %run'd util notebooks -- confirm there.
    tokens = word_tokenize_nltk(text)
    return nltk.pos_tag(tokens)

In [25]:
# Load the sample text once; later cells reuse `text` as their shared input.
# read_text_file is not defined in this notebook (presumably provided by the
# %run'd file_utils notebook -- confirm there).
text = read_text_file("data/sherlock_holmes_1.txt")
words_with_pos = pos_tag_nltk(text)
# Bare last expression: the notebook renders the tagged tokens as the cell output.
words_with_pos

[('To', 'TO'),
 ('Sherlock', 'NNP'),
 ('Holmes', 'NNP'),
 ('she', 'PRP'),
 ('is', 'VBZ'),
 ('always', 'RB'),
 ('_the_', 'JJ'),
 ('woman', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('seldom', 'VBN'),
 ('heard', 'RB'),
 ('him', 'PRP'),
 ('mention', 'VB'),
 ('her', 'PRP'),
 ('under', 'IN'),
 ('any', 'DT'),
 ('other', 'JJ'),
 ('name', 'NN'),
 ('.', '.'),
 ('In', 'IN'),
 ('his', 'PRP$'),
 ('eyes', 'NNS'),
 ('she', 'PRP'),
 ('eclipses', 'VBZ'),
 ('and', 'CC'),
 ('predominates', 'VBZ'),
 ('the', 'DT'),
 ('whole', 'NN'),
 ('of', 'IN'),
 ('her', 'PRP$'),
 ('sex', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('that', 'IN'),
 ('he', 'PRP'),
 ('felt', 'VBD'),
 ('any', 'DT'),
 ('emotion', 'NN'),
 ('akin', 'NN'),
 ('to', 'TO'),
 ('love', 'VB'),
 ('for', 'IN'),
 ('Irene', 'NNP'),
 ('Adler', 'NNP'),
 ('.', '.'),
 ('All', 'DT'),
 ('emotions', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('that', 'IN'),
 ('one', 'CD'),
 ('particularly', 'RB'),
 (',', ','),
 ('were', 'VBD'),

# Part of speech tagging using spaCy

In [4]:
def pos_tag_spacy(text, model):
    """Run ``model`` over ``text`` and pair every token with its ``pos_`` tag.

    Args:
        text: Raw text to analyse.
        model: Callable that turns ``text`` into an iterable of tokens, each
            exposing ``.text`` and ``.pos_``.

    Returns:
        list of ``(token.text, token.pos_)`` tuples, one per token, in
        document order.
    """
    return [(token.text, token.pos_) for token in model(text)]

In [5]:
#!python -m spacy download en_core_web_sm #Run if necessary
# Load the spaCy pipeline once; `nlp` is reused by the timing cell further down.
nlp = spacy.load("en_core_web_sm")
# NOTE: this rebinds words_with_pos, replacing the NLTK result from the earlier cell.
words_with_pos = pos_tag_spacy(text, nlp)
# Bare last expression: the notebook renders the tagged tokens as the cell output.
words_with_pos

[('To', 'ADP'),
 ('Sherlock', 'PROPN'),
 ('Holmes', 'PROPN'),
 ('she', 'PRON'),
 ('is', 'AUX'),
 ('always', 'ADV'),
 ('_', 'PUNCT'),
 ('the', 'DET'),
 ('_', 'PROPN'),
 ('woman', 'NOUN'),
 ('.', 'PUNCT'),
 ('I', 'PRON'),
 ('have', 'AUX'),
 ('seldom', 'ADV'),
 ('heard', 'VERB'),
 ('him', 'PRON'),
 ('\n', 'SPACE'),
 ('mention', 'VERB'),
 ('her', 'PRON'),
 ('under', 'ADP'),
 ('any', 'DET'),
 ('other', 'ADJ'),
 ('name', 'NOUN'),
 ('.', 'PUNCT'),
 ('In', 'ADP'),
 ('his', 'PRON'),
 ('eyes', 'NOUN'),
 ('she', 'PRON'),
 ('eclipses', 'VERB'),
 ('and', 'CCONJ'),
 ('\n', 'SPACE'),
 ('predominates', 'VERB'),
 ('the', 'DET'),
 ('whole', 'NOUN'),
 ('of', 'ADP'),
 ('her', 'PRON'),
 ('sex', 'NOUN'),
 ('.', 'PUNCT'),
 ('It', 'PRON'),
 ('was', 'AUX'),
 ('not', 'PART'),
 ('that', 'SCONJ'),
 ('he', 'PRON'),
 ('felt', 'VERB'),
 ('any', 'DET'),
 ('emotion', 'NOUN'),
 ('\n', 'SPACE'),
 ('akin', 'ADJ'),
 ('to', 'PART'),
 ('love', 'VERB'),
 ('for', 'ADP'),
 ('Irene', 'PROPN'),
 ('Adler', 'PROPN'),
 ('.', 'PUNCT

# Compare running times for NLTK and spaCy

In [26]:
import time

# Use perf_counter() rather than time(): it is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, whereas time() is wall-clock
# time and can jump if the system clock is adjusted mid-measurement.
# These are single-run timings; use %timeit for statistically stable numbers.
start = time.perf_counter()
pos_tag_nltk(text)
print(f"NLTK: {time.perf_counter() - start} s")

start = time.perf_counter()
pos_tag_spacy(text, nlp)
print(f"spaCy: {time.perf_counter() - start} s")

NLTK: 0.007420539855957031 s
spaCy: 0.08337211608886719 s


# Get part of speech tags using GPT-3

In [34]:
import os

import openai

# SECURITY: never store the API key in the notebook -- it ends up in version
# control and in every shared copy. Read it from the environment instead.
# The key that was previously hardcoded here must be treated as leaked and
# revoked in the OpenAI dashboard.
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")
if not OPEN_AI_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
openai.api_key = OPEN_AI_KEY

In [24]:
# One-off request that prints the raw API response, so its structure
# (choices[0]["text"]) can be inspected.
# NOTE(review): openai.Completion and "text-davinci-003" look like the legacy
# (pre-1.0 SDK) completions interface -- confirm both are still available in
# the environment this notebook targets.
prompt="""Decide what the part of speech tags are for a sentence. 
Preserve original capitalization. 
Return the list in the format of a python tuple: (word, part of speech). 
Sentence: In his eyes she eclipses and predominates the whole of her sex."""
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=256,
    top_p=1.0,
    frequency_penalty=0,
    presence_penalty=0
)
# print() shows the full response object, including usage/token counts.
print(response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\n('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')"
    }
  ],
  "created": 1684873552,
  "id": "cmpl-7JSrosjDLNYJCnSkgXADCi3Dx0TH9",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 87,
    "prompt_tokens": 55,
    "total_tokens": 142
  }
}


In [35]:
from ast import literal_eval

def pos_tag_gpt(text, open_ai_key):
    """Ask an OpenAI completion model for the part-of-speech tags of ``text``.

    Args:
        text: Sentence to tag. It is interpolated into the prompt and always
            followed by a literal ".", even if ``text`` already ends with one.
        open_ai_key: API key; assigned to the module-level ``openai.api_key``
            (a global side effect on the ``openai`` module).

    Returns:
        list of ``(word, tag)`` tuples parsed from the model's reply. A reply
        consisting of a single pair is returned as a one-element list.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when the
            reply is not a valid Python literal (for instance if the reply was
            cut off by the ``max_tokens`` limit).
    """
    openai.api_key = open_ai_key
    prompt = f"""Decide what the part of speech tags are for a sentence. 
    Preserve original capitalization. 
    Return the list in the format of a python tuple: (word, part of speech).
    Sentence: {text}."""
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        max_tokens=256,
        top_p=1.0,
        frequency_penalty=0,
        presence_penalty=0
    )
    # The reply text starts with blank lines; drop every newline so the
    # comma-separated tuples form one literal expression.
    raw_reply = response["choices"][0]["text"].replace("\n", "")
    # literal_eval only accepts Python literals, so model output is parsed,
    # never executed.
    parsed = literal_eval(raw_reply)
    # A reply with exactly one pair, e.g. "('Hello', 'UH')", parses to a flat
    # 2-tuple of strings; list() would flatten it to ['Hello', 'UH'], so wrap
    # it to keep the list-of-pairs shape.
    if (
        isinstance(parsed, (tuple, list))
        and len(parsed) == 2
        and all(isinstance(part, str) for part in parsed)
    ):
        return [tuple(parsed)]
    return list(parsed)

In [39]:
first_sentence = "In his eyes she eclipses and predominates the whole of her sex."
# Start the clock immediately before the API call so only the request and
# parsing are measured. perf_counter() is the monotonic, high-resolution clock
# intended for elapsed-time measurement (time() is wall-clock and can jump).
start = time.perf_counter()
words_with_pos = pos_tag_gpt(first_sentence, OPEN_AI_KEY)
print(words_with_pos)
print(f"GPT: {time.perf_counter() - start} s")

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.')]
GPT: 9.507455110549927 s


In [40]:
# Tag the same sentence with NLTK so it can be compared with the GPT result.
words_with_pos_nltk = pos_tag_nltk(first_sentence)
# Print the NLTK result computed above; the original printed words_with_pos,
# which at this point still holds the GPT output.
print(words_with_pos_nltk)

[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.')]


In [41]:
# words_with_pos holds the GPT tags for first_sentence, words_with_pos_nltk the
# NLTK tags; list equality checks that every (word, tag) pair matches in order.
print(words_with_pos == words_with_pos_nltk)

True
