<h1>Working With Text</h1>

In [1]:
# Sample sentence used in the cells below; note that it ends with a trailing space.
text1 = "Ethics are built right into the ideals and objectives of the United Nations "

In [2]:
len(text1)  # Number of characters in text1 (spaces included)

76

In [3]:
# Split text1 on single spaces and keep the resulting list of words in text2.
# Because text1 ends with a space, the list ends with an empty string,
# which is included in the length below.
text2 = text1.split(' ')

len(text2)

14

In [4]:
text2  # Have a look: the final '' comes from the trailing space in text1

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations',
 '']

List comprehension allows us to find specific words:

In [6]:
[w for w in text2 if len(w) > 3]  # Keep only words longer than 3 characters

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [7]:
# Title-cased words in text2: an uppercase first letter followed by lowercase letters.
[w for w in text2 if w.istitle()]

['Ethics', 'United', 'Nations']

In [8]:
[w for w in text2 if w.endswith('s')]  # Words in text2 whose last character is 's'

['Ethics', 'ideals', 'objectives', 'Nations']

We can find unique words using set().



In [9]:
# Short phrase with a repeated word ('be') and a word that differs only in case ('To'/'to').
text3 = 'To be or not to be'
text4 = text3.split(' ')

# Total number of words, duplicates included.
len(text4)

6

In [10]:
len(set(text4))  # Number of distinct words: set() drops the duplicate 'be'

5

In [11]:
set(text4)  # 'To' and 'to' count as two different words because the comparison is case-sensitive

{'To', 'be', 'not', 'or', 'to'}

In [12]:
# Lowercase every word before building the set so that 'To' and 'to' collapse into one entry.
len(set(w.lower() for w in text4))

4

In [13]:
# The distinct words of text4, ignoring case.
set(w.lower() for w in text4)

{'be', 'not', 'or', 'to'}

From words to characters

In [19]:
# A single lowercase word used for the character-level examples below.
text5 = 'ouagadougou'

In [20]:
# split() accepts any separator string, here the two-character substring 'ou'.
text6 = text5.split('ou')

In [21]:
# The empty strings at both ends appear because text5 starts and ends with 'ou'.
text6

['', 'agad', 'g', '']

In [23]:
# join() reverses the split: gluing the pieces back together with 'ou' rebuilds text5.
'ou'.join(text6)

'ouagadougou'

In [27]:
list(text5)  # A string is iterable, so list() yields one element per character

['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']

In [30]:
# Alternatively, a list comprehension over the string gives the same list of characters.
[c for c in text5]

['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']

In [31]:
# Sentence padded with three leading spaces and one trailing space.
text8 = '   A quick brown fox jumped over the lazy dog. '

In [32]:
# Splitting on a single space turns every leading/trailing space into an empty string.
text8.split(' ')

['',
 '',
 '',
 'A',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog.',
 '']

In [35]:
# strip() removes only the leading and trailing whitespace; the spaces between words stay,
# so the split below no longer produces empty strings.
text9 = text8.strip()
text9.split(' ')

['A', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']

In [36]:
text9.find('o')  # Lowest index at which 'o' occurs (find returns -1 when there is no match)

10

In [37]:
text9.rfind('o')  # Highest index at which 'o' occurs, i.e. the position of the last 'o'

40

In [39]:
# Returns a new string with every 'o' replaced by 'O'; text9 itself is not modified.
text9.replace('o', 'O')

'A quick brOwn fOx jumped Over the lazy dOg.'

Processing free-text

In [41]:
# Tweet-like sample: a quoted sentence followed by a hashtag, a lone '@' and a short link.
# The two adjacent literals are joined into one string, so the '#UNSG' part is
# clearly string content rather than something that looks like a comment.
text10 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text11 = text10.split(' ')

text11

['"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations"',
 '#UNSG',
 '@',
 'NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr']

Finding Hashtags

In [43]:
[w for w in text11 if w.startswith('#')]  # Tokens whose first character is '#'

['#UNSG']

Finding callouts:

In [44]:
# startswith('@') also accepts a lone '@', which is not an actual callout.
[w for w in text11 if w.startswith('@')]

['@']

In [45]:
# Same tweet as before, now prefixed with two real callouts (@UN and @UN_Women).
# The adjacent literals are concatenated into a single string.
text12 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text13 = text12.split(' ')

In [46]:
# Inspect the tokens: two real callouts at the front plus the lone '@' further on.
text13

['@UN',
 '@UN_Women',
 '"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations"',
 '#UNSG',
 '@',
 'NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr']

In [51]:
import re

# A proper callout is '@' followed by at least one letter, digit or underscore.
# The letter ranges must be spelled A-Za-z: the range 'A-z' also covers the ASCII
# punctuation between 'Z' and 'a' ([, \, ], ^, _ and `), so tokens such as '@['
# or '@^' would be accepted by mistake. The pattern is written as a raw string,
# which is the usual convention for regular expressions.
[w for w in text13 if re.search(r'@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

In [53]:
# Shorter alternative: \w matches a word character (letters, digits, underscore).
# The pattern is a raw string so the backslash reaches the regex engine untouched;
# '\w' inside a normal string literal is an invalid escape sequence that recent
# Python versions warn about.
[w for w in text13 if re.search(r'@\w+', w)]  # Gets the same result here

['@UN', '@UN_Women']

Finding specific characters

In [54]:
# Reminder of the word used for the character-class examples that follow.
text5

'ouagadougou'

In [60]:
# [aeiou] matches any single lowercase vowel; findall returns every match in order.
re.findall(r'[aeiou]', text5)

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

In [61]:
# A leading ^ negates the class: match every character that is NOT a lowercase vowel.
# For this all-letter word that means the consonants.
re.findall(r'[^aeiou]', text5)

['g', 'd', 'g']

Regular Expression for Dates

In [78]:
# One sample date per line, each written in a different format.
dateStr = (
    '23-10-2017\n'
    '23/10/2017\n'
    '23/10/17\n'
    '10/23/2017\n'
    '23 Oct 2017\n'
    '23 October 2017\n'
    'Oct 23, 2017\n'
    'October 23, 2017\n'
)

In [79]:
# Numeric dates: two digits, a '/' or '-', two digits, a '/' or '-', then a four-digit year.
# The pattern does not tell day-first from month-first, so '10/23/2017' matches too.
re.findall(r'\d{2}[/-]\d{2}[/-]\d{4}', dateStr)

['23-10-2017', '23/10/2017', '10/23/2017']

In [80]:
# Allow the year to have 2 to 4 digits so that the short form '23/10/17' is found as well.
re.findall(r'\d{2}[/-]\d{2}[/-]\d{2,4}', dateStr)

['23-10-2017', '23/10/2017', '23/10/17', '10/23/2017']

In [81]:
# Also allow one-digit days and months (\d{1,2}); the sample has none, so the result is unchanged.
re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', dateStr)

['23-10-2017', '23/10/2017', '23/10/17', '10/23/2017']

In [82]:
# Day, three-letter month abbreviation, four-digit year.
# (?:...) groups the alternatives without capturing, so findall returns the whole match.
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr)

['23 Oct 2017']

In [83]:
# [a-z]* after the abbreviation accepts any trailing lowercase letters,
# so full month names such as 'October' match as well.
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}', dateStr)

['23 Oct 2017', '23 October 2017']

<h2>Working with Text Data in pandas</h2>

In [88]:
import pandas as pd

In [89]:
# One sentence per weekday; the times are written inconsistently on purpose
# (with or without a space before am/pm, with or without a leading zero).
time_sentences = ["Monday: The doctor's appointment is at 2:45pm.", 
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

In [90]:
# Build a one-column DataFrame: the dict key names the column, the list supplies the rows.
df = pd.DataFrame({'text': time_sentences})

In [91]:
# Display the frame: a single 'text' column with one sentence per row.
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [92]:
# find the number of characters for each string in df['text']
# (.str applies the string method element-wise to the column)
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [93]:
# split each string in df['text'] into a list of whitespace-separated tokens
df['text'].str.split()

0    [Monday:, The, doctor's, appointment, is, at, ...
1    [Tuesday:, The, dentist's, appointment, is, at...
2    [Wednesday:, At, 7:00pm,, there, is, a, basket...
3    [Thursday:, Be, back, home, by, 11:15, pm, at,...
4    [Friday:, Take, the, train, at, 08:10, am,, ar...
Name: text, dtype: object

In [95]:
# find the number of tokens/words for each string: split first, then take each list's length
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [96]:
# find which entries contain 'appointment'
# (this is a substring test; the pattern is interpreted as a regular expression by default)
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [97]:
# find which entries contain 'basketball' (substring test, same as for 'appointment')
df['text'].str.contains('basketball')

0    False
1    False
2     True
3    False
4    False
Name: text, dtype: bool

In [98]:
# find how many digit characters occur in each string (\d matches one digit)
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [99]:
# find all occurrences of digits; each digit is returned as a separate one-character string
df['text'].str.findall(r'\d')

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [100]:
# group and find the hours and minutes
# \d?\d accepts one- or two-digit hours; with two capture groups, every match is
# returned as an (hour, minute) tuple of strings
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [101]:
# replace weekdays with '???'
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every sentence unchanged.
# \w+day\b matches a run of word characters ending in 'day' at a word boundary.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [102]:
# replace weekdays with 3 letter abbreviations
# The callable receives each match object and returns the replacement text,
# here the first three characters of the captured weekday.
# A callable replacement only works in regex mode, so regex=True must be passed
# explicitly (the default is literal matching since pandas 2.0).
df['text'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [103]:
# create new columns from the first match of the extracted groups:
# one column per capture group; later matches in the same string are ignored
df['text'].str.extract(r'(\d?\d):(\d\d)')

  


Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [104]:
# extract the entire time, the hours, the minutes, and the period
# The outer group captures the whole time, ' ?' allows an optional space before am/pm.
# extractall returns one row per match, indexed by original row and match number.
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [105]:
# extract the entire time, the hours, the minutes, and the period with group names
# Named groups (?P<name>...) become the column names of the result.
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am
