In [275]:
import pandas as pd

In [276]:
df = pd.read_csv('train_data.csv')
df

Unnamed: 0,question,answer
0,Would I ever need credit card if my debit card...,Skimmers are most likely at gas station pumps....
1,Cheapest way to wire or withdraw money from US...,There is a number of cheaper online options th...
2,How do I go about finding an honest ethical f...,Large and wellknown companies are typically a ...
3,Why invest in becoming a landlord?,why does it make sense financially to buy prop...
4,What could be the cause of a extreme highlow p...,Often these types of trades fall into two diff...
...,...,...
12042,What percent of my salary should I save?,I disagree with the selected answer. Theres no...
12043,Why do people invest in mutual fund rather tha...,How on earth can you possibly know what is goi...
12044,What would happen if the Euro currency went bust?,Each country would have to go back to its own ...
12045,Are credit cards not viewed as credit until yo...,Theres a difference between missing a payment ...


# Preprocessing
clean the text by removing extra spaces, special characters, and unwanted symbols.


In [277]:
import re

Here I defined a function called data cleaning.
The function will do the following:
1. remove punctuations but keep things like currency, numbers, and percentages. Please keep in mind that the finBOT may make use of these hence we're keeping them.

2. The function will convert all text to lower case and get rid of whitespaces from the start and the end of the string for consistency.


Example: " Tokenization sucks " => "tokenization sucks"

In [278]:
def data_cleaning(text):
    text = re.sub(r'[^\w\s.$%€£0-9]', '', text)
    text =  text.lower().strip()
    return text

In [279]:
df['question'] = df['question'].apply(data_cleaning)
df['answer'] = df['answer'].apply(data_cleaning)

In [280]:
df

Unnamed: 0,question,answer
0,would i ever need credit card if my debit card...,skimmers are most likely at gas station pumps....
1,cheapest way to wire or withdraw money from us...,there is a number of cheaper online options th...
2,how do i go about finding an honest ethical f...,large and wellknown companies are typically a ...
3,why invest in becoming a landlord,why does it make sense financially to buy prop...
4,what could be the cause of a extreme highlow p...,often these types of trades fall into two diff...
...,...,...
12042,what percent of my salary should i save,i disagree with the selected answer. theres no...
12043,why do people invest in mutual fund rather tha...,how on earth can you possibly know what is goi...
12044,what would happen if the euro currency went bust,each country would have to go back to its own ...
12045,are credit cards not viewed as credit until yo...,theres a difference between missing a payment ...


# Tokenization

1. Word level tokenization
2. Lemmatization
3. Subword Tokenization
4. Sentence Piece Tokenization

# 1. Word level Tokenization

Here we are just splitting text into individual words.

Example: "interest rate increases" => ["interest", "rate", "increases"]

In [281]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download("punkt")

df["word_token_question"] = df["question"].apply(word_tokenize)
df["word_token_answer"] = df["answer"].apply(word_tokenize)

print(df[["question", "word_token_question"]].head())
print(df[["answer", "word_token_answer"]].head())


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tshmacm1171/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/tshmacm1171/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/tmp/nltk_data'
    - '/tmp/nltk_data'
    - '/tmp/nltk_data'
    - '/tmp/nltk_data'
    - '/tmp/nltk_data'
    - '/tmp/nltk_data'
    - '/Users/tshmacm1171/nltk_data'
**********************************************************************


# 2. Lemmatization
takes words to their root word

Example: "Motsekuwa" => "Mo"

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

df["lemmatized_token_question"] = df["question"].apply(lemmatize_text)
df["lemmatized_token_answer"] = df["answer"].apply(lemmatize_text)
print(df[["question", "lemmatized_token_question"]].head())
print(df[["answer", "answer"]].head())


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

# 3. Subword Tokenization
it makes sure that complicated words and non "dictionarized" words are processed efficiently

Example: finbotization => "finbot", "ization"

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

df["subword_token_question"] = df["question"].apply(lambda x: tokenizer.tokenize(x))
df["subword_token_answer"] = df["answer"].apply(lambda x: tokenizer.tokenize(x))
print(df[["question", "subword_token_question"]].head())
print(df[["answer", "subword_token_answer"]].head())

# 4. SentencePiece Tokenization

