# Text preprocessing

# 0. Create Data sets

In [3]:
import pandas as pd

In [4]:
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

In [5]:
pd.set_option('display.max_colwidth', None)


In [6]:
data

['When life gives you lemons, make lemonade! 🙂',
 'She bought 2 lemons for $1 at Maven Market.',
 'A dozen lemons will make a gallon of lemonade. [AllRecipes]',
 'lemon, lemon, lemons, lemon, lemon, lemons',
 "He's running to the market to get a lemon — there's a great sale today.",
 'Does Maven Market carry Eureka lemons or Meyer lemons?',
 'An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]',
 'iced tea is my favorite']

In [7]:
data_df=pd.DataFrame(data, columns=['sentence'])

In [8]:
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [9]:
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications."
]

test_series=pd.Series(test)
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [10]:
test_series_df=pd.DataFrame(test_series, columns=['sentence'])
test_series_df

Unnamed: 0,sentence
0,We're going to start this course with traditional NLP applications.
1,Then we'll move on to modern NLP theory.
2,"Finally, we'll wrap things up with modern NLP applications."


# Text preprocessing with pandas

In [11]:
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [12]:
df=data_df.copy()
df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [13]:
# lowercase

df['sentence_clean']=df['sentence'].str.lower()

In [14]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea. [wikipedia]"
7,iced tea is my favorite,iced tea is my favorite


# remove square brackets []

In [15]:
# Remove bracketed citations such as "[allrecipes]".
# In the pattern, \[ and \] match literal square brackets (the backslash escapes them)
# and .*? lazily matches whatever text sits between them.
df['sentence_clean']=df['sentence_clean'].str.replace(r'\[.*?\]','',regex=True)
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade.
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea."
7,iced tea is my favorite,iced tea is my favorite


In [16]:
# remove punctuation
df['sentence_clean']=df['sentence_clean'].str.replace(r'[^\w\s]','',regex=True) # ^ - not, \w - word character, \s - whitespace, spaces, tabs,newlines
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [17]:
# Rebuild the cleaned column from the raw sentences in one pass:
# lowercase -> drop [bracketed] citations -> drop punctuation.
lowered = df['sentence'].str.lower()
without_brackets = lowered.str.replace(r'\[.*?\]', '', regex=True)
df['sentence_clean'] = without_brackets.str.replace(r'[^\w\s]', '', regex=True)

In [18]:
def lower_replace(series):
    """Lowercase a Series of text and strip bracketed citations and punctuation.

    Parameters
    ----------
    series : pandas.Series
        A column of strings, e.g. ``df['sentence']`` or ``pd.Series(test)``.

    Returns
    -------
    pandas.Series
        A new Series with the same index, where each string is lowercased,
        has ``[...]`` spans removed, has every character that is neither a
        word character nor whitespace removed, and has whitespace runs
        collapsed to a single space with no leading/trailing whitespace.
    """
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True)
    output = output.str.replace(r'[^\w\s]', '', regex=True)
    # Deleting brackets and punctuation leaves doubled and trailing spaces
    # ("lemon  theres", "lemonade "); collapse them so downstream tokenizers
    # do not see stray whitespace tokens.
    output = output.str.replace(r'\s+', ' ', regex=True).str.strip()
    return output

In [19]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [20]:
lower_replace(test_series)

0    were going to start this course with traditional nlp applications
1                               then well move on to modern nlp theory
2             finally well wrap things up with modern nlp applications
dtype: object

In [21]:
lower_replace(df.sentence) #we have to specify series or column data

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

# Lesson 38 onward - tokens, lemmas, stop words

In [22]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [23]:
df.sentence_clean[0]

'when life gives you lemons make lemonade '

In [24]:
import spacy
import en_core_web_sm  # model package; import retained, the pipeline itself is loaded once below

# Load the small English pipeline a single time. Calling both
# spacy.load('en_core_web_sm') and en_core_web_sm.load() only repeats the work,
# because the second assignment to `nlp` replaces the first.
nlp = spacy.load('en_core_web_sm')

phrase = df.sentence_clean[0]
doc = nlp(phrase)  # spaCy Doc object
# Show each token together with its coarse part-of-speech tag.
print([(token.text, token.pos_) for token in doc])

[('when', 'SCONJ'), ('life', 'NOUN'), ('gives', 'VERB'), ('you', 'PRON'), ('lemons', 'NOUN'), ('make', 'VERB'), ('lemonade', 'NOUN')]


In [25]:
# 1. Tokenize
[token.text for token in doc] # .text returns the token as a string

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

In [26]:
# 2. Lemmatization - returns the base (dictionary) form of each token
[token.lemma_ for token in doc ]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

In [27]:
list(nlp.Defaults.stop_words)[:10]

['mostly',
 'something',
 'both',
 'again',
 'afterwards',
 'regarding',
 'whenever',
 'ours',
 'namely',
 'nowhere']

In [28]:
# 3. Stop words - lemmatize while skipping stop words
norm=[token.lemma_ for token in doc if not token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [29]:
' '.join(norm) # join the lemmas into a single space-separated string

'life give lemon lemonade'

# 39

In [30]:
def token_lemma_nonstop(text):
    """Return the lemmas of the non-stop-word tokens in ``text`` as one string.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        A single string (use ``Series.apply`` to process a whole column).

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text)
    # Skip whitespace tokens as well as stop words: a run of extra spaces in
    # the input becomes its own token and would otherwise show up as a gap
    # of several spaces in the joined output.
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    ]
    return ' '.join(lemmas)

In [31]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [32]:
test_series.apply(token_lemma_nonstop) # apply calls the function on every row

0    go start course traditional NLP application .
1                              modern NLP theory .
2    finally , wrap thing modern NLP application .
dtype: object

In [33]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [34]:
lower_replace(df.sentence) #pandas

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [35]:
lower_replace(df.sentence).apply(token_lemma_nonstop) #pandas - str.lower + spacy (token, lemma,stop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

In [36]:
lower_replace(df.sentence).apply(token_lemma_nonstop)[0] #-||-

'life give lemon lemonade'

In [37]:
# 4. Part-of-speech (POS) tagging

In [38]:
phrase2=lower_replace(df.sentence).apply(token_lemma_nonstop)[0]
phrase2

'life give lemon lemonade'

In [39]:
doc2=nlp(phrase2)
doc2

life give lemon lemonade

In [40]:
[(token.text,token.pos_) for token in doc2]

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [41]:
[(token.text,token.pos_) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]

[('life', 'NOUN'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [42]:
nouns=[(token.text) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]

In [43]:
' '.join(nouns)

'life lemon lemonade'

In [44]:
# Function that keeps only the tokens with the requested POS tags

def filter_pos(text, pos_list=('NOUN', 'PROPN')):
    """Keep only the tokens of ``text`` whose coarse POS tag is in ``pos_list``.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        A single string to parse.
    pos_list : str or collection of str, optional
        POS tag(s) to keep, compared against ``token.pos_``. Defaults to
        nouns and proper nouns. A single tag may be given as a plain string.

    Returns
    -------
    str
        The original text of the kept tokens joined by single spaces.
    """
    # A bare string such as 'VERB' would turn the membership test below into
    # a substring check (e.g. 'X' in 'AUX' is True), so wrap it in a tuple.
    if isinstance(pos_list, str):
        pos_list = (pos_list,)
    doc = nlp(text)
    kept = [token.text for token in doc if token.pos_ in pos_list]
    return ' '.join(kept)


In [45]:
# Pass the raw text (filter_pos parses it itself) and the tags as a list, so the
# POS check is list membership rather than a substring test on the string 'VERB'.
filter_pos(phrase2, ['VERB'])


'give'

In [46]:
filter_pos(test_series[0])

'course NLP applications'

In [47]:
test_series.apply(filter_pos) # apply runs filter_pos on each row

0    course NLP applications
1                 NLP theory
2    things NLP applications
dtype: object

In [48]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [49]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [50]:
lower_replace(df.sentence).apply(filter_pos).apply(token_lemma_nonstop)

0              life lemon lemonade
1               lemon maven market
2      dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon
4          market lemon sale today
5        market eureka lemon lemon
6              palmer lemonade tea
7                              tea
Name: sentence, dtype: object

In [51]:
lower_replace(df.sentence).apply(lambda x: filter_pos(x, ['VERB']))

0       gives make
1           bought
2             make
3            lemon
4    running get s
5            carry
6             iced
7             iced
Name: sentence, dtype: object

# Lesson 41: Create an NLP Pipeline

In [52]:
def lower_replace(series):
    """Lowercase a Series of text and strip bracketed citations and punctuation.

    Parameters
    ----------
    series : pandas.Series
        A column of strings, e.g. ``df['sentence']`` or ``pd.Series(test)``.

    Returns
    -------
    pandas.Series
        A new Series with the same index, where each string is lowercased,
        has ``[...]`` spans removed, has every character that is neither a
        word character nor whitespace removed, and has whitespace runs
        collapsed to a single space with no leading/trailing whitespace.
    """
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True)
    output = output.str.replace(r'[^\w\s]', '', regex=True)
    # Deleting brackets and punctuation leaves doubled and trailing spaces
    # ("lemon  theres", "lemonade "); collapse them so downstream tokenizers
    # do not see stray whitespace tokens.
    output = output.str.replace(r'\s+', ' ', regex=True).str.strip()
    return output

def token_lemma_nonstop(text):
    """Return the lemmas of the non-stop-word tokens in ``text`` as one string.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        A single string (use ``Series.apply`` to process a whole column).

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text)
    # Skip whitespace tokens as well as stop words: a run of extra spaces in
    # the input becomes its own token and would otherwise show up as a gap
    # of several spaces in the joined output.
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    ]
    return ' '.join(lemmas)

def filter_pos(text, pos_list=('NOUN', 'PROPN')):
    """Keep only the tokens of ``text`` whose coarse POS tag is in ``pos_list``.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        A single string to parse.
    pos_list : str or collection of str, optional
        POS tag(s) to keep, compared against ``token.pos_``. Defaults to
        nouns and proper nouns. A single tag may be given as a plain string.

    Returns
    -------
    str
        The original text of the kept tokens joined by single spaces.
    """
    # A bare string such as 'VERB' would turn the membership test below into
    # a substring check (e.g. 'X' in 'AUX' is True), so wrap it in a tuple.
    if isinstance(pos_list, str):
        pos_list = (pos_list,)
    doc = nlp(text)
    kept = [token.text for token in doc if token.pos_ in pos_list]
    return ' '.join(kept)


In [53]:
def nlp_pipeline(series):
    """Run the full text-cleaning pipeline on a Series of raw sentences.

    Steps, in order:
      1. ``lower_replace`` on the whole Series (vectorised pandas cleaning).
      2. ``token_lemma_nonstop`` on each row (it takes a single string).
      3. ``filter_pos`` on each row with its default POS tags.

    Returns a Series of cleaned strings.
    """
    cleaned = lower_replace(series)
    return cleaned.apply(token_lemma_nonstop).apply(filter_pos)

In [54]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [56]:
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [57]:
nlp_pipeline(test_series)

0          nlp application
1               nlp theory
2    thing nlp application
dtype: object

In [59]:
text_clean=nlp_pipeline(df.sentence)
text_clean

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [60]:
pd.to_pickle(text_clean,'text_clean.pkl')

In [62]:
pd.read_pickle('text_clean.pkl')

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object