In [12]:
import spacy
from transformers import AutoTokenizer
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import pandas as pd

# Text Preprocessing

In [19]:
def preprocess_text(text):
  

   
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # models
    nlp = spacy.load("en_core_web_sm")
    hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))


    doc = nlp(text)
    sentences = [sent.text for sent in doc]
    words = [token.text for token in doc]

   
    lowercased = [word.lower() for word in words]

    
    no_punct = [word for word in lowercased if word not in string.punctuation]

   
    no_stopwords = [word for word in no_punct if word not in stop_words]


    stemmed = [stemmer.stem(word) for word in no_stopwords]

   
    lemmatized = [token.lemma_ for token in nlp(" ".join(no_stopwords))]

  
    subwords = hf_tokenizer.tokenize(text)

    return {
        "original_text": text,
        "sentences": sentences,
        "word_tokens": words,
        "lowercased": lowercased,
        "no_punctuation": no_punct,
        "no_stopwords": no_stopwords,
        "stemmed": stemmed,
        "lemmatized": lemmatized,
        "subwords": subwords
    }


In [21]:
# Load the question/answer test set.
# NOTE(review): absolute, machine-specific path - this cell only runs where the file exists.
df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/test_data.csv')

# Run the preprocessing pipeline over both text columns; each result dict is
# stored in a sibling "<column>_processed" column.
for text_column in ("question", "answer"):
    df[text_column + "_processed"] = df[text_column].apply(preprocess_text)

# Example: View preprocessed tokens for the first question
print(df["question_processed"].iloc[0])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyboardInterrupt: 

# Vectorization

In [8]:
# Show the DataFrame as the cell's output (rendered as a truncated preview).
# NOTE(review): the captured output below has question_tokens / answer_tokens
# columns, not the *_processed columns created in the preprocessing cell -
# presumably from an earlier run; confirm after Restart & Run All.
df

Unnamed: 0,question,answer,question_tokens,answer_tokens
0,Company revenue increased however stock price ...,The company released its 2nd Quarter Revenue o...,{'sentences': ['Company revenue increased howe...,{'sentences': ['The company released its 2nd Q...
1,"Should I invest in my house, when its in my wi...",If you are concerned about it being inequitabl...,"{'sentences': ['Should I invest in my house, w...",{'sentences': ['If you are concerned about it ...
2,Proscons for buying gold vs. saving money in a...,Just because gold performed that well in the p...,{'sentences': ['Proscons for buying gold vs. s...,{'sentences': ['Just because gold performed th...
3,still have mortgage on old house to be torn do...,"I could be wrong, but I doubt youre going to b...",{'sentences': ['still have mortgage on old hou...,"{'sentences': ['I could be wrong, but I doubt ..."
4,Deal with stock PSEC,It looks like it has to deal with an expiratio...,"{'sentences': ['Deal with stock PSEC'], 'words...",{'sentences': ['It looks like it has to deal w...
...,...,...,...,...
2577,How to find a reputable company to help sell a...,You own something with very little market valu...,{'sentences': ['How to find a reputable compan...,{'sentences': ['You own something with very li...
2578,Convertible debtnotebonddebentures which of th...,They all basically mean the same thing a type...,{'sentences': ['Convertible debtnotebonddebent...,{'sentences': ['They all basically mean the sa...
2579,U.S. Mutual Fund Supermarkets Where are some g...,"I personally like Schwab. Great service, low f...",{'sentences': ['U.S. Mutual Fund Supermarkets ...,"{'sentences': ['I personally like Schwab.', 'G..."
2580,How can I deal with a spouse who compulsively ...,"Perhaps it seems harsh, but I would get separa...",{'sentences': ['How can I deal with a spouse w...,"{'sentences': ['Perhaps it seems harsh, but I ..."
