In [9]:
import pandas as pd

In [10]:
df = pd.read_csv('train_data.csv')
df

Unnamed: 0,question,answer
0,Would I ever need credit card if my debit card...,Skimmers are most likely at gas station pumps....
1,Cheapest way to wire or withdraw money from US...,There is a number of cheaper online options th...
2,How do I go about finding an honest ethical f...,Large and wellknown companies are typically a ...
3,Why invest in becoming a landlord?,why does it make sense financially to buy prop...
4,What could be the cause of a extreme highlow p...,Often these types of trades fall into two diff...
...,...,...
12042,What percent of my salary should I save?,I disagree with the selected answer. Theres no...
12043,Why do people invest in mutual fund rather tha...,How on earth can you possibly know what is goi...
12044,What would happen if the Euro currency went bust?,Each country would have to go back to its own ...
12045,Are credit cards not viewed as credit until yo...,Theres a difference between missing a payment ...


# Preprocessing
clean the text by removing extra spaces, special characters, and unwanted symbols.


In [11]:
import re

Here I defined a function called data cleaning.
The function will do the following:
1. remove punctuations but keep things like currency, numbers, and percentages. Please keep in mind that the finBOT may make use of these hence we're keeping them.

2. The function will convert all text to lower case and get rid of whitespaces from the start and the end of the string for consistency.


Example: " Tokenization sucks " => "tokenization sucks"

In [12]:
def data_cleaning(text):
    text = re.sub(r'[^\w\s.$%€£0-9]', '', text)
    text =  text.lower().strip()
    return text

In [13]:
df['question'] = df['question'].apply(data_cleaning)
df['answer'] = df['answer'].apply(data_cleaning)

# Tokenization

1. Word level tokenization
2. Lemmatization
3. Subword Tokenization
4. Sentence Tokenization

# 1. Word level Tokenization

Here we are just splitting text into individual words.

Example: "interest rate increases" => ["interest", "rate", "increases"]

In [14]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download("punkt_tab")

df["word_token_question"] = df["question"].apply(word_tokenize)
df["word_token_answer"] = df["answer"].apply(word_tokenize)

print(df[["question", "word_token_question"]].head())
print(df[["answer", "word_token_answer"]].head())


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tshmacm1172/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                            question  \
0  would i ever need credit card if my debit card...   
1  cheapest way to wire or withdraw money from us...   
2  how do i go about finding an honest  ethical f...   
3                  why invest in becoming a landlord   
4  what could be the cause of a extreme highlow p...   

                                 word_token_question  
0  [would, i, ever, need, credit, card, if, my, d...  
1  [cheapest, way, to, wire, or, withdraw, money,...  
2  [how, do, i, go, about, finding, an, honest, e...  
3           [why, invest, in, becoming, a, landlord]  
4  [what, could, be, the, cause, of, a, extreme, ...  
                                              answer  \
0  skimmers are most likely at gas station pumps....   
1  there is a number of cheaper online options th...   
2  large and wellknown companies are typically a ...   
3  why does it make sense financially to buy prop...   
4  often these types of trades fall into two diff... 

# 2. Lemmatization
takes words to their root word

Example: "Motsekuwa" => "Mo"

Note: run this line of code first: pip install spacy

In [17]:
import spacy.cli
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

df["lemmatized_token_question"] = df["question"].apply(lemmatize_text)
df["lemmatized_token_answer"] = df["answer"].apply(lemmatize_text)
print(df[["question", "lemmatized_token_question"]].head())
print(df[["answer", "lemmatized_token_answer"]].head())


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m945.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                            question  \
0  would i ever need credit card if my debit card...   
1  cheapest way to wire or withdraw money from us...   
2  how do i go about finding an honest  ethical f...   
3  

# 3. Subword Tokenization
it makes sure that complicated words and non "dictionarized" words are processed efficiently

Example: finbotization => "finbot", "ization"

Note: run this line of code first: pip install transformers. 

### "Ġ" appears before words that originally had a space before them to ensure correct spacing when reconstructing the sentence.

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

df["subword_token_question"] = df["question"].apply(lambda x: tokenizer.tokenize(x))
df["subword_token_answer"] = df["answer"].apply(lambda x: tokenizer.tokenize(x))
print(df[["question", "subword_token_question"]].head())
print(df[["answer", "subword_token_answer"]].head())

RuntimeError: Failed to import transformers.models.auto.tokenization_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
module 'numpy._core' has no attribute 'multiarray'

# 4. Sentence Tokenization

this will ensure we still keep the sentence
Example: Interest rates will rise. Investors are adjusting portfolios. => ["Interest rates will rise.", "Investors are adjusting portfolios."]


In [None]:
def sentence_tokenize(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]  

df["sentence_tokenized_questions"] = df["question"].apply(sentence_tokenize)
df["sentence_tokenized_answers"] = df["answer"].apply(sentence_tokenize)


print(df[["question", "sentence_tokenized_questions"]].head())
print(df[["answer", "sentence_tokenized_answers"]].head())


                                            question  \
0  would i ever need credit card if my debit card...   
1  cheapest way to wire or withdraw money from us...   
2  how do i go about finding an honest  ethical f...   
3                  why invest in becoming a landlord   
4  what could be the cause of a extreme highlow p...   

                        sentence_tokenized_questions  
0  [would i ever need credit card if my debit car...  
1  [cheapest way to wire or withdraw money from u...  
2  [how do i go about finding an honest  ethical ...  
3                [why invest in becoming a landlord]  
4  [what could be the cause of a extreme highlow ...  
                                              answer  \
0  skimmers are most likely at gas station pumps....   
1  there is a number of cheaper online options th...   
2  large and wellknown companies are typically a ...   
3  why does it make sense financially to buy prop...   
4  often these types of trades fall into two diff... 

In [None]:
df

Unnamed: 0,question,answer,word_token_question,word_token_answer,lemmatized_token_question,lemmatized_token_answer,subword_token_question,subword_token_answer,sentence_tokenized_questions,sentence_tokenized_answers
0,would i ever need credit card if my debit card...,skimmers are most likely at gas station pumps....,"[would, i, ever, need, credit, card, if, my, d...","[skimmers, are, most, likely, at, gas, station...","[would, I, ever, need, credit, card, if, my, d...","[skimmer, be, most, likely, at, gas, station, ...","[would, Ġi, Ġever, Ġneed, Ġcredit, Ġcard, Ġif,...","[sk, immers, Ġare, Ġmost, Ġlikely, Ġat, Ġgas, ...",[would i ever need credit card if my debit car...,[skimmers are most likely at gas station pumps...
1,cheapest way to wire or withdraw money from us...,there is a number of cheaper online options th...,"[cheapest, way, to, wire, or, withdraw, money,...","[there, is, a, number, of, cheaper, online, op...","[cheap, way, to, wire, or, withdraw, money, fr...","[there, be, a, number, of, cheap, online, opti...","[che, apest, Ġway, Ġto, Ġwire, Ġor, Ġwithdraw,...","[there, Ġis, Ġa, Ġnumber, Ġof, Ġcheaper, Ġonli...",[cheapest way to wire or withdraw money from u...,[there is a number of cheaper online options t...
2,how do i go about finding an honest ethical f...,large and wellknown companies are typically a ...,"[how, do, i, go, about, finding, an, honest, e...","[large, and, wellknown, companies, are, typica...","[how, do, I, go, about, find, an, honest, , e...","[large, and, wellknown, company, be, typically...","[how, Ġdo, Ġi, Ġgo, Ġabout, Ġfinding, Ġan, Ġho...","[large, Ġand, Ġwell, known, Ġcompanies, Ġare, ...",[how do i go about finding an honest ethical ...,[large and wellknown companies are typically a...
3,why invest in becoming a landlord,why does it make sense financially to buy prop...,"[why, invest, in, becoming, a, landlord]","[why, does, it, make, sense, financially, to, ...","[why, invest, in, become, a, landlord]","[why, do, it, make, sense, financially, to, bu...","[why, Ġinvest, Ġin, Ġbecoming, Ġa, Ġlandlord]","[why, Ġdoes, Ġit, Ġmake, Ġsense, Ġfinancially,...",[why invest in becoming a landlord],[why does it make sense financially to buy pro...
4,what could be the cause of a extreme highlow p...,often these types of trades fall into two diff...,"[what, could, be, the, cause, of, a, extreme, ...","[often, these, types, of, trades, fall, into, ...","[what, could, be, the, cause, of, a, extreme, ...","[often, these, type, of, trade, fall, into, tw...","[what, Ġcould, Ġbe, Ġthe, Ġcause, Ġof, Ġa, Ġex...","[often, Ġthese, Ġtypes, Ġof, Ġtrades, Ġfall, Ġ...",[what could be the cause of a extreme highlow ...,[often these types of trades fall into two dif...
...,...,...,...,...,...,...,...,...,...,...
12042,what percent of my salary should i save,i disagree with the selected answer. theres no...,"[what, percent, of, my, salary, should, i, save]","[i, disagree, with, the, selected, answer, ., ...","[what, percent, of, my, salary, should, I, save]","[I, disagree, with, the, select, answer, ., th...","[what, Ġpercent, Ġof, Ġmy, Ġsalary, Ġshould, Ġ...","[i, Ġdisagree, Ġwith, Ġthe, Ġselected, Ġanswer...",[what percent of my salary should i save],"[i disagree with the selected answer., theres ..."
12043,why do people invest in mutual fund rather tha...,how on earth can you possibly know what is goi...,"[why, do, people, invest, in, mutual, fund, ra...","[how, on, earth, can, you, possibly, know, wha...","[why, do, people, invest, in, mutual, fund, ra...","[how, on, earth, can, you, possibly, know, wha...","[why, Ġdo, Ġpeople, Ġinvest, Ġin, Ġmutual, Ġfu...","[how, Ġon, Ġearth, Ġcan, Ġyou, Ġpossibly, Ġkno...",[why do people invest in mutual fund rather th...,[how on earth can you possibly know what is go...
12044,what would happen if the euro currency went bust,each country would have to go back to its own ...,"[what, would, happen, if, the, euro, currency,...","[each, country, would, have, to, go, back, to,...","[what, would, happen, if, the, euro, currency,...","[each, country, would, have, to, go, back, to,...","[what, Ġwould, Ġhappen, Ġif, Ġthe, Ġeuro, Ġcur...","[each, Ġcountry, Ġwould, Ġhave, Ġto, Ġgo, Ġbac...",[what would happen if the euro currency went b...,[each country would have to go back to its own...
12045,are credit cards not viewed as credit until yo...,theres a difference between missing a payment ...,"[are, credit, cards, not, viewed, as, credit, ...","[theres, a, difference, between, missing, a, p...","[be, credit, card, not, view, as, credit, unti...","[there, s, a, difference, between, miss, a, pa...","[are, Ġcredit, Ġcards, Ġnot, Ġviewed, Ġas, Ġcr...","[the, res, Ġa, Ġdifference, Ġbetween, Ġmissing...",[are credit cards not viewed as credit until y...,[theres a difference between missing a payment...


# Vectorizing a text to numeric

In [None]:
df.columns

Index(['question', 'answer', 'word_token_question', 'word_token_answer'], dtype='object')