# Working With Text

In [2]:
# Sample sentence; note the trailing space, which len() counts as a character.
text1 = "Ethics are built right into the ideals and objectives of the United Nations "

# Number of characters (not words) in the string
len(text1)

76

In [3]:
# split() with no argument splits on runs of whitespace, so the trailing
# space in text1 does not produce an empty string at the end of the list.
text2 = text1.split()
text2

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations']

In [4]:
# Number of words (length of the token list, not of the original string)
len(text2)

13

### List comprehensions let us find specific words<br>
or build a new list of words

In [5]:
# Words in text2 that are more than 3 letters long,
# using a simple loop that prints each match on its own line
for w in text2:
    if len(w) > 3:
        print(w)

Ethics
built
right
into
ideals
objectives
United
Nations


In [8]:
# List comprehension: same "longer than 3 letters" filter as the loop, but the
# matches are collected into a new list instead of being printed one by one.
new_list = [w for w in text2 if len(w) > 3]
new_list

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [9]:
 # Title-cased words in text2: str.istitle() is True for a word that starts with
 # an uppercase letter followed by lowercase letters (an all-caps word such as
 # 'NY' would not be included).
[w for w in text2 if w.istitle()]

['Ethics', 'United', 'Nations']

In [10]:
# Words in text2 that end in 's', found by comparing the last character.
# (w[-1] would raise IndexError on an empty string; split() never yields one.)
[w for w in text2 if w[-1]=='s']

['Ethics', 'ideals', 'objectives', 'Nations']

In [12]:
# Words in text2 that end in 's', this time using str.endswith(),
# which also works on empty strings
[w for w in text2 if w.endswith('s')]

['Ethics', 'ideals', 'objectives', 'Nations']

### Finding unique words using set()

In [13]:
# Short phrase with a repeated word ('be') and a word that appears in two
# different cases ('To' / 'to')
text3 = 'To be or not to be'
text4 = text3.split()

text4

['To', 'be', 'or', 'not', 'to', 'be']

In [14]:
# Total number of words, duplicates included
len(text4)

6

In [18]:
# Number of unique words; the comparison is case-sensitive,
# so 'To' and 'to' are counted separately
len(set(text4))

5

In [15]:
# The unique words themselves (a set has no defined order)
set(text4)

{'To', 'be', 'not', 'or', 'to'}

In [21]:
# str.lower() returns a lowercase copy of each word,
# so 'To' and 'to' become identical.
lower_text = [w.lower() for w in text4]
lower_text

['to', 'be', 'or', 'not', 'to', 'be']

In [22]:
# Unique words after lowercasing: 'To' and 'to' now collapse into one entry
len(set(lower_text))

4

In [23]:
# The case-insensitive set of unique words
set(lower_text)

{'be', 'not', 'or', 'to'}

## Processing free-text

In [24]:
# Tweet-like free text: a quoted sentence followed by a hashtag, a bare '@',
# a place name and a shortened URL. Adjacent string literals are joined by
# the parser, giving one single-line string.
text5 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)

# Whitespace-separated tokens and how many there are
text6 = text5.split()
len(text6)

21

In [53]:
# Show every token of the tweet on a single line
print(text6)

['"Ethics', 'are', 'built', 'right', 'into', 'the', 'ideals', 'and', 'objectives', 'of', 'the', 'United', 'Nations"', '#UNSG', '@', 'NY', 'Society', 'for', 'Ethical', 'Culture', 'bit.ly/2guVelr']


In [28]:
# 1. Check unique words: one fewer than the 21 tokens, because 'the' occurs twice
len(set(text6))

20

### 1. Finding hashtags ('#'):

In [30]:
# Tokens whose first character is '#'
[w for w in text6 if w.startswith('#')]

['#UNSG']

### 2. Finding callouts ('@'):

In [32]:
# Tokens whose first character is '@'; a bare '@' with no name after it
# also qualifies
[w for w in text6 if w.startswith('@')]

['@']

In [34]:
# Both hashtags and callouts: str.startswith() accepts a tuple of prefixes,
# so a single call covers '#' and '@'.
[w for w in text6 if w.startswith(('#', '@'))]

['#UNSG', '@']

In [38]:
# The same tweet, now prefixed with two real callouts (@UN and @UN_Women).
# Adjacent string literals are joined by the parser into one single-line string.
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
# split() with no argument (as used for the other texts) splits on any run of
# whitespace; split(' ') would yield empty strings wherever spaces repeat.
text8 = text7.split()

print(len(text8))
print(text8)

23
['@UN', '@UN_Women', '"Ethics', 'are', 'built', 'right', 'into', 'the', 'ideals', 'and', 'objectives', 'of', 'the', 'United', 'Nations"', '#UNSG', '@', 'NY', 'Society', 'for', 'Ethical', 'Culture', 'bit.ly/2guVelr']


In [39]:
# Hashtags and callouts in one pass: str.startswith() accepts a tuple of
# prefixes. A bare '@' still qualifies, since only the first character is checked.
[w for w in text8 if w.startswith(('#', '@'))]

['@UN', '@UN_Women', '#UNSG', '@']

### 3.  We can use regular expressions to help us with more complex parsing.

For example '@[A-Za-z0-9_]+' will return all words that:
* start with `'@'` and are followed by at least one: 
  * capital letter (`'A-Z'`)
  * lowercase letter (`'a-z'`) 
  * number (`'0-9'`)
  * or underscore (`'_'`)

In [41]:
# import re - a module that provides support for regular expressions
import re

In [43]:
# re.match() anchors the pattern at the start of each word, so only words that
# begin with '@' followed by at least one letter, digit or underscore are kept.
# (re.search() would also accept an '@' in the middle of a word.)
[w for w in text8 if re.match(r'@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

In [44]:
# Same idea for hashtags: re.match() only accepts words that begin with '#'
# followed by at least one letter, digit or underscore.
[w for w in text8 if re.match(r'#[A-Za-z0-9_]+', w)]

['#UNSG']

In [51]:
# Words that start with a capital letter ('^' anchors the match at the start
# of the word). '"Ethics' is not matched because its first character is the
# quotation mark.
[w for w in text8 if re.search('^[A-Z]', w)]

['United', 'Nations"', 'NY', 'Society', 'Ethical', 'Culture']

In [52]:
# Words that start with a lowercase letter
[w for w in text8 if re.search('^[a-z]', w)]

['are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'for',
 'bit.ly/2guVelr']

## Meta-characters: Character matches

*    .   : wildcard, matches a single character.
*    ^   : start of a string. ex: ^xyz
*  dollar sign   : end of a string. ex: xyz$
*  [  ] : matches one of the set of characters within [  ]
*  [ a-z ] : matches one of the range a to z
* [ ^abc ] : matches any character that is not a, b or c
* a | b : a or b
* ( ) : scoping for operators
* \ : Escape character for special characters (\t, \n, \b)

## Meta-characters: Character symbols

* \b : word boundary
<br><br>
* \d : Any digit, like [0-9]
* \D : Any non-digit, equivalent to [^0-9]
<br><br>
* \s : Any whitespace, equivalent to [ \t\n\r\f\v]
* \S : Any non-whitespace, equivalent to [^ \t\n\r\f\v]
<br><br>
* \w : Alphanumeric characters and underscore, equivalent to [a-zA-Z0-9_]
* \W : Non-Alphanumeric characters, equivalent to [^a-zA-Z0-9_]

## Meta-characters: Repetitions

* `*` : zero or more occurrences
* `+` : one or more occurrences
* `?` : zero or one occurrence
* `{n}` : exactly n repetitions
* `{n,}` : at least n repetitions
* `{,m}` : at most m repetitions
* `{n,m}` : at least n and at most m repetitions (write no spaces inside the braces)

## Regular expression for Dates

* Date variations for 23rd October 2002: 
  * 23-10-2002
  * 23/10/2002
  * 23/10/02
  * 10/23/2002
  * 23 Oct 2002
  * 23 October 2002
  * Oct 23, 2002
  * October 23, 2002

In [83]:
# One sample date per line: the variations of 23 October 2002 plus a
# single-digit day/month example ('2/2/2002').
date_text = (
    '2/2/2002\n'
    '23-10-2002\n'
    '23/10/2002\n'
    '23/10/02\n'
    '10/23/2002\n'
    '23 Oct 2002\n'
    '23 October 2002\n'
    'Oct 23, 2002\n'
    'October 23, 2002'
)
# Split back into one string per date
dates = date_text.split('\n')
dates

['2/2/2002',
 '23-10-2002',
 '23/10/2002',
 '23/10/02',
 '10/23/2002',
 '23 Oct 2002',
 '23 October 2002',
 'Oct 23, 2002',
 'October 23, 2002']

In [84]:
import re

In [85]:
# Two digits, '/' or '-', two digits, '/' or '-', then a four-digit year
re.findall(r'\d{2}[/-]\d{2}[/-]\d{4}', date_text)

['23-10-2002', '23/10/2002', '10/23/2002']

In [86]:
# Allow the year to have 2 to 4 digits, which adds '23/10/02'
re.findall(r'\d{2}[/-]\d{2}[/-]\d{2,4}', date_text)

['23-10-2002', '23/10/2002', '23/10/02', '10/23/2002']

In [87]:
# Allow 1 or 2 digits for the first two fields as well, which adds '2/2/2002'
re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', date_text)

['2/2/2002', '23-10-2002', '23/10/2002', '23/10/02', '10/23/2002']

### Now try the other date formats

In [92]:
# The last four entries write the month in letters, so they need a
# different pattern from the purely numeric dates
dates[-4:]

['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002']

In [91]:
# Two-digit day, three-letter month abbreviation, four-digit year.
# '(?:...)' groups the alternatives without capturing, so findall() returns
# the whole match rather than just the month.
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', date_text)

['23 Oct 2002']

In [93]:
# '[a-z]*' after the abbreviation lets the month continue with lowercase
# letters, so full names such as 'October' match too
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}', date_text)

['23 Oct 2002', '23 October 2002']

In [98]:
# Make the day optional both before the month ('23 Oct 2002') and after it
# ('Oct 23, 2002'), so one pattern covers both orders.
# NOTE(review): because both day groups are optional, a bare 'Oct 2002' would
# match as well, and only two-digit days are accepted; confirm this is intended.
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', date_text)

['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002']