In [None]:
import os
import re

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Ensure the tokenizer and POS-tagger models are available locally.
# The target is the current user's ~/nltk_data directory instead of a path
# hard-coded to one machine, so the cell runs unchanged for any user.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the ids below - confirm against
# the installed NLTK version.
NLTK_DATA_DIR = os.path.expanduser('~/nltk_data')
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource, download_dir=NLTK_DATA_DIR)

# Step 1: Tokenization
def tokenize_sentence(sentence):
    """Split *sentence* into word-level tokens with NLTK's ``word_tokenize``."""
    word_tokens = word_tokenize(sentence)
    return word_tokens

# Step 2: POS Tagging
def pos_tagging(tokens):
    """Return the result of NLTK's ``pos_tag`` applied to *tokens*."""
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens

# Step 3: Custom Chunking Function
# A non-negative integer or decimal number, without the percent sign.
_PERCENT_NUMBER = re.compile(r"\d+(?:\.\d+)?")

# Optional filler between "delta" and the underlying, compared lower-cased.
_EURO_CALL_PHRASE = ("of", "a", "european", "call", "on")


def _read_percentage(tokens, pos):
    """Return ``(fraction, tokens_consumed)`` for a percentage at *pos*, else None.

    Accepts both a fused token (``"50%"``) and a number followed by a
    separate ``"%"`` token (``"50", "%"``).
    """
    if pos >= len(tokens):
        return None
    token = tokens[pos]
    if token.endswith("%") and _PERCENT_NUMBER.fullmatch(token[:-1]):
        return float(token[:-1]) / 100, 1
    if (
        _PERCENT_NUMBER.fullmatch(token)
        and pos + 1 < len(tokens)
        and tokens[pos + 1] == "%"
    ):
        return float(token) / 100, 2
    return None


def _read_hedge(tokens, pos):
    """Return ``(chunk, tokens_consumed)`` for a hedge phrase at *pos*, else None.

    Grammar: ``hedge <pct> [with] delta [of a european call on] <underlying>``.
    """
    parsed = _read_percentage(tokens, pos + 1)
    if parsed is None:
        return None
    fraction, width = parsed
    cursor = pos + 1 + width

    if cursor < len(tokens) and tokens[cursor].lower() == "with":
        cursor += 1
    if cursor >= len(tokens) or tokens[cursor].lower() != "delta":
        return None
    cursor += 1

    # Skip the filler phrase only when an underlying still follows it.
    phrase_end = cursor + len(_EURO_CALL_PHRASE)
    if phrase_end < len(tokens):
        window = tuple(t.lower() for t in tokens[cursor:phrase_end])
        if window == _EURO_CALL_PHRASE:
            cursor = phrase_end

    if cursor >= len(tokens):
        return None  # "delta" with nothing after it: no underlying to wrap
    chunk = f"hedge {fraction}*Delta(euro_call({tokens[cursor]}))"
    return chunk, cursor + 1 - pos


def chunk_tokens(tokens):
    """Merge percentage and hedge phrases in *tokens* into single chunks.

    * ``<pct> <asset>`` becomes ``"<fraction>*<asset>"``, e.g.
      ``["50%", "AAPL"]`` -> ``["0.5*AAPL"]``.
    * ``hedge <pct> [with] delta [of a european call on] <underlying>``
      becomes ``"hedge <fraction>*Delta(euro_call(<underlying>))"``.

    A percentage may be one token (``"50%"``) or a number followed by a
    separate ``"%"`` token, which is how NLTK's ``word_tokenize`` is expected
    to split it. Tokens that do not complete either pattern (a trailing
    percentage, a "hedge" without a percentage/delta/underlying) are passed
    through unchanged instead of being dropped or raising.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        A new list of chunk strings; *tokens* is not modified.
    """
    chunked = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        chunk = token
        consumed = 1

        if token.lower() == "hedge":
            hedge = _read_hedge(tokens, i)
            if hedge is not None:
                chunk, consumed = hedge
        else:
            parsed = _read_percentage(tokens, i)
            if parsed is not None:
                fraction, width = parsed
                # Only chunk when there is an asset token to weight.
                if i + width < len(tokens):
                    chunk = f"{fraction}*{tokens[i + width]}"
                    consumed = width + 1

        chunked.append(chunk)
        i += consumed
    return chunked

# Step 4: Mapping to Mathematical Expression
def map_to_expression(chunked_tokens):
    """Join chunks into a single ``" + "``-separated expression string.

    A chunk of the form ``"hedge <expr>"`` contributes ``"-<expr>"``; every
    other chunk is used verbatim. Only the exact ``"hedge "`` prefix marks a
    hedge, so a bare ``"hedge"`` token or a word that merely contains
    "hedge" is passed through instead of raising ``IndexError``.

    Args:
        chunked_tokens: Iterable of chunk strings.

    Returns:
        The joined expression; an empty string for empty input.
    """
    hedge_prefix = "hedge "
    portfolio = [
        f"-{token[len(hedge_prefix):]}" if token.startswith(hedge_prefix) else token
        for token in chunked_tokens
    ]
    return " + ".join(portfolio)

# Demo input: a natural-language portfolio instruction.
sentence = "Build a portfolio with 50% AAPL and hedge 30% with Delta of a European call on TSLA"

# Run the pipeline step by step. Chunking consumes the raw tokens, not the
# POS tags; pos_tags is computed only so it can be shown below.
tokens = tokenize_sentence(sentence)
pos_tags = pos_tagging(tokens)
chunked_tokens = chunk_tokens(tokens)
mapped_expression = map_to_expression(chunked_tokens)

# Bare tuple expression as the last statement of the notebook cell, so the
# intermediate and final results are rendered as the cell's output.
tokens, pos_tags, chunked_tokens, mapped_expression


In [None]:
import nltk
# Print the directories NLTK searches for downloaded resources.
print(nltk.data.path)


['/Users/xiaoyunhan/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
