## Regular Expression

In [54]:
import re
import warnings
warnings.filterwarnings(action="ignore")

<center>Cheat Sheet</center>

**Anchors**
- `\A` : Matches only at the beginning of the string
- `\Z` : Matches only at the end of the string.
- `^` :  Matches at the beginning of a line.
- `$` : Matches at the end of a line.
- `\n` : Matches a newline character.
- `re.MULTILINE` or `re.M` : Makes ^ and $ match the start and end of each line (not just the start and end of the string). 
- `\b` : Matches the boundary between a word and a non-word character.
- `\B` :  Matches positions where `\b` does not match (inside words, or between non-word characters).

In [55]:
# Examples 

# \A
print(re.search("\AHello", string="Hello World"))

# Doesnot consider the line break, begining or not. 
print(re.search("\AWorld", string="Hello\nWorld")) # Not Found
print(re.search("\AWorld", string="Hello\nWorld", flags=re.MULTILINE)) # Still not Found


# \Z
print(re.search(r'World!\Z', 'Hello World!'))

print(re.search(r'World!\Z', 'Hello\nWorld!')) # Match Found


# ^
print(re.search("^Hello", string="Hello World"))
print(re.search("^World", string="Hello\nWorld")) # No Match
print(re.search("^World", string="Hello\nWorld", flags=re.MULTILINE)) # Found


# $ 
print(re.search(r'World!$', 'Hello\nWorld!'))
print(re.search(r'Hello$', 'Hello\nWorld!')) # No Match
print(re.search(r'Hello$', 'Hello\nWorld!', re.MULTILINE)) # Found


# \b
print(re.search(r'\bHello\b', 'Hello World!')) # Match Found
print(re.search(r'\bHello\b', 'HelloWorld!')) # No Match

# \B
# Matches "word" only if it is not at a word boundary
print(re.search(r'\Bword\B', 'swordfish'))            # Match found
print(re.search(r'\Bword\B', 'a word in a sentence')) # No match
print(re.search(r'\Bll\B', 'Hello'))                  # Match found



<re.Match object; span=(0, 5), match='Hello'>
None
None
<re.Match object; span=(6, 12), match='World!'>
<re.Match object; span=(6, 12), match='World!'>
<re.Match object; span=(0, 5), match='Hello'>
None
<re.Match object; span=(6, 11), match='World'>
<re.Match object; span=(6, 12), match='World!'>
None
<re.Match object; span=(0, 5), match='Hello'>
<re.Match object; span=(0, 5), match='Hello'>
None
<re.Match object; span=(1, 5), match='word'>
None
<re.Match object; span=(2, 4), match='ll'>


**Features**

- `|` : Conditional OR, Combines multiple regular expressions as alternatives.Each alternative can have independent anchors.
- `(pat)` : Capturing Group, Groups a pattern or patterns. Also captures the matched substring for back-references.
- `(?:pat)` : Non-Capturing Group, Groups a pattern or patterns without capturing the matched substring. 
- `?P<name>pat` :  Named Capture Group, Groups a pattern and assigns a name to the captured substring. 
- `.` :  Matches any single character except the newline character.
- `[]`: Matches one character among many specified inside the brackets 

In [56]:

# | 
print(re.search(r'cat|dog', 'I have a cat'))
print(re.search(r'cat|dog', 'I have a fish') ) # No Match

# (pat)
match = re.search(r'(Hello) (World)', 'Hello World')
print(match.group(1))  # Outputs: Hello
print(match.group(2))  # Outputs: World

text = "Hello\nWorld"
match = re.search(r"(^H\w+)\n(^W\w+)", text, re.MULTILINE)
print(match.group(1))  # Outputs: Hello
print(match.group(2))  # Outputs: World


# (?:pat)
# capture chai gar but use chai na gar. 
#  is a non-capturing group used to group parts of the pattern together without capturing them as separate groups.
match = re.search(r'(?:Hello) (World)', 'Hello World')
print(match.group(0))  # Outputs: Hello World
print(match.groups())  # Outputs: ('World',)


# . 
print(re.search(r'H.llo', 'H\nllo')) # No match
print(re.search(r'H.llo', 'Hillo'))  # Match found

# []
print(re.search(r'[aeiou]', 'Hello'))  # Match found
print(re.search(r'[aeiou]', 'Sky'))    # No match
print(re.search(r'[a-z]', '123abc'))   # Match found

<re.Match object; span=(9, 12), match='cat'>
None
Hello
World
Hello
World
Hello World
('World',)
None
<re.Match object; span=(0, 5), match='Hillo'>
<re.Match object; span=(1, 2), match='e'>
None
<re.Match object; span=(3, 4), match='a'>


**Greedy Quantifiers**

- `*` : Matches the preceding element zero or more times.
- `+` : Matches the preceding element one or more times.
- `?` : Matches the preceding element zero or one time.
- `{m,n}` : Matches the preceding element at least m times, but not more than n times.
- `{m,}` : Matches the preceding element at least m times.
- `{,n}` : Matches the preceding element at most n times.
- `{n}` : Matches the preceding element exactly n times.
- `pat1.*pat2`:  Matches pat1 followed by any number of any characters (including none), and then pat2.
- `pat1.*pat2|pat2.*pat1` : Matches pat1 followed by any characters and then pat2, or pat2 followed by any characters and then pat1.

In [57]:
# * 

# Zero or more 'b' after 'a'
pattern = r'ab*'

print(re.search(pattern, 'a'))
print(re.search(pattern, 'ab'))
print(re.search(pattern, 'abbb'))
print("--"*25)

# +
# One or more 'b' after 'a'
pattern = r'ab+'

print(re.search(pattern, 'a')) # Match Not Found
print(re.search(pattern, 'ab'))
print(re.search(pattern, 'abbb'))
print("--"*25)

# Zero or one 'b' after 'a'
pattern = r'ab?'
print(re.search(pattern, 'a'))
print(re.search(pattern, 'ab'))
print(re.search(pattern, 'abbb'))
print("--"*25)


# {m,n} {m} {m,} {,n}
print(re.search(r'a{2,4}', 'a')) # No Match
print(re.search(r'a{2,4}', 'aa')) # Match found ("aa")
print(re.search(r'a{2,4}', 'aaaaa')) # # Match found ("aaaa")
print(re.search(r'a{,2}', 'aaaaa')) # # Match found ("aaaa")
print(re.search(r'a{2}', 'aaaaa')) # # Match found ("aaaa")
print("--"*25)

# pat1.*pat2
print(re.search(r'Hello.*World', 'Hello amazing World')) # Match Found


# pat1.*pat2|pat2.*pat1
print(re.search(r'cat.*dog|dog.*cat', 'cat and then dog'))



<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 4), match='abbb'>
--------------------------------------------------
None
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 4), match='abbb'>
--------------------------------------------------
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 2), match='ab'>
--------------------------------------------------
None
<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 4), match='aaaa'>
<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 2), match='aa'>
--------------------------------------------------
<re.Match object; span=(0, 19), match='Hello amazing World'>
<re.Match object; span=(0, 16), match='cat and then dog'>


**Character Classes**

- `[aeiou]` : Matches any one of the specified characters (vowels).
- `[^aeiou]` :  The ^ inverts the selection, so this matches any character except the specified ones (consonants in this case).
- `[a-f]`: Matches any one character in the specified range.
- `\d` : Matches any digit, equivalent to [0-9].
- `\D` :  Matches any non-digit, equivalent to `[^0-9]` or `[^\\d]`.
- `\w` : Matches any word character (alphanumeric and underscore), equivalent to [a-zA-Z0-9_].
- `\W`: Matches any non-word character, equivalent to [^a-zA-Z0-9_] or `[^\\w]`.
- `\s`: Matches any whitespace character, including space, tab, newline, carriage return, form feed, and vertical tab. Equivalent to `[\ \t\n\r\f\v]`.
- `\S`: Matches any character that is not a whitespace character. Equivalent to `[^\ \t\n\r\f\v]` or `[^\s]`.


In [58]:
print(re.findall(r'[aeiou]', 'hello world')) # Outputs: ['e', 'o', 'o']
print(re.findall(r'[^aeiou]', 'hello world')) # ['h', 'l', 'l', ' ', 'w', 'r', 'l', 'd']
print(re.findall(r'[a-f]', 'abcdefgxyz')) # ['a', 'b', 'c', 'd', 'e', 'f']
print(re.findall(r'\d', 'abc123xyz')) # ['1', '2', '3']
print(re.findall(r'\D', 'abc123xyz')) # ['a', 'b', 'c', 'x', 'y', 'z']
print(re.findall(r'\w', 'hello_world 123!')) # ['h', 'e', 'l', 'l', 'o', '_', 'w', 'o', 'r', 'l', 'd', '1', '2', '3']
print(re.findall(r'\W', 'hello_world 123!')) # [' ', '!']
print(re.findall(r'\s', 'hello world\t123\n'))  # [' ', '\t', '\n']
print(re.findall(r'\S', 'hello world!\t123\n')) # ['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd', '!', '1', '2', '3']

['e', 'o', 'o']
['h', 'l', 'l', ' ', 'w', 'r', 'l', 'd']
['a', 'b', 'c', 'd', 'e', 'f']
['1', '2', '3']
['a', 'b', 'c', 'x', 'y', 'z']
['h', 'e', 'l', 'l', 'o', '_', 'w', 'o', 'r', 'l', 'd', '1', '2', '3']
[' ', '!']
[' ', '\t', '\n']
['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd', '!', '1', '2', '3']


**Lookarounds**


- `(?=pat)` : Positive Lookahead Assertion, Asserts that the given pattern matches after the current position.

    ```python
    # Match 'foo' only if it is followed by 'bar'
    pattern = r'foo(?=bar)'
    ```

- `(?<=pat)` :  Positive Lookbehind Assertion, Asserts that the given pattern matches before the current position.

    ```python 
    # Match 'bar' only if it is preceded by 'foo'
    pattern = r'(?<=foo)bar'
    ```

- `(?!pat)` :  Negative Lookahead Assertion,  Asserts that the given pattern does not match after the current position.
  
    ```python
    # Match 'foo' only if it is not followed by 'bar'
    pattern = r'foo(?!bar)'

    ```

- `(?<!pat)` : Negative Lookbehind Assertion, Asserts that the given pattern does not match before the current position.

    ```python
    # Match 'bar' only if it is not preceded by 'foo'
    pattern = r'(?<!foo)bar', 

    ```


- `(?!pat1)(?=pat2)`: Multiple Assertions, 
  
    ```python 
    # Match 'foo' only if it is not followed by 'bar' and followed by 'baz'
    pattern = r'foo(?!bar)(?=baz)'


    ```

- `((?!pat).)*`: Negate a Grouping
  ```python 
    # Match any sequence of characters that does not contain 'foo'
    pattern = r'((?!foo).)*'

  ```

In [59]:
# Match 'foo' only if it is followed by 'bar'
pattern = r'foo(?=bar)'
print(re.search(pattern, 'foobar'))
print(re.search(pattern, 'foobaz'))

# Match 'bar' only if it is preceded by 'foo'
pattern = r'(?<=foo)bar'
print(re.search(pattern, 'foobar'))  # Match found: 'bar'
print(re.search(pattern, 'bazbar'))  # No match


# Match 'foo' only if it is not followed by 'bar'
pattern = r'foo(?!bar)'
print(re.search(pattern, 'foobar'))  # No match
print(re.search(pattern, 'foobaz'))  # Match found: 'foo')

# Match 'bar' only if it is not preceded by 'foo'
pattern = r'(?<!foo)bar'
print(re.search(pattern, 'foobar'))  # No match
print(re.search(pattern, 'bazbar'))  # Match found: 'bar'

# Match 'foo' only if it is not followed by 'bar' and followed by 'baz'
pattern = r'foo(?!bar)(?=baz)'
print(re.search(pattern, 'foobarbaz'))  # No match
print(re.search(pattern, 'foobaz'))     # Match found: 'foo'


# Match any sequence of characters that does not contain 'foo'
pattern = r'((?!foo).)*'
print(re.search(pattern, 'foobarbaz'))  # Matches '', stops before 'foo'
print(re.search(pattern, 'barbaz'))     # Matches 'barbaz'

<re.Match object; span=(0, 3), match='foo'>
None
<re.Match object; span=(3, 6), match='bar'>
None
None
<re.Match object; span=(0, 3), match='foo'>
None
<re.Match object; span=(3, 6), match='bar'>
None
<re.Match object; span=(0, 3), match='foo'>
<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(0, 6), match='barbaz'>


**Funcions**

| Function        | Description                                                                                                                                                        |
|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `re.search`     | Check if the given pattern is present anywhere in the input string. Output is a `re.Match` object, usable in conditional expressions. Prefer r-strings to define RE. Use byte pattern for byte input. Python also maintains a small cache of recent RE. |
| `re.fullmatch`  | Ensures pattern matches the entire input string.                                                                                                                   |
| `re.compile`    | Compile a pattern for reuse, outputs `re.Pattern` object.                                                                                                          |
| `re.sub`        | Search and replace. `re.sub(r'pat', f, s)` uses function `f` with `re.Match` object as argument.                                                                   |
| `re.escape`     | Automatically escape all metacharacters.                                                                                                                           |
| `re.split`      | Split a string based on RE. Text matched by the groups will be part of the output. Portion matched by pattern outside group won't be in output.                    |
| `re.findall`    | Returns all the matches as a list. If 1 capture group is used, only its matches are returned. If 1+ capture groups are used, each element will be a tuple of capture groups. Portion matched by pattern outside group won't be in output.            |
| `re.finditer`   | Iterator with `re.Match` object for each match.                                                                                                                    |
| `re.subn`       | Gives tuple of modified string and number of substitutions.                                                                                                        |


In [60]:
import re

######################### Search ###############################

# Match 'hello' followed by any characters and then 'world'
pattern = r'(^h\w+).*(w\w+)$'
match = re.search(pattern, 'hello crazy world')

if match:
    print(match.group(1))  # Outputs: hello
    print(match.group(2))  # Outputs: world
    print(match.group())  # Outputs: world
    



######################### Full Match ###############################

match = re.fullmatch(r'hello world', 'hello world')
if match:
    print(match.group())  # Outputs: hello world


######################### Compile ###############################

pattern = re.compile(r'\d+')
matches = pattern.findall('123 abc 456 def')
print(matches)  # Outputs: ['123', '456']


######################### Sub ###############################
result = re.sub(r'cat', 'dog', 'the cat sat on the cat')
print(result)  # Outputs: the dog sat on the dog


######################### Escape ###############################
escaped_string = re.escape('hello. how are you?')
print(escaped_string)  # Outputs: hello\. how are you\?

######################### Split ###############################
result = re.split(r'\d+', 'one1two2three3four')
print(result)

######################### Find Iter ###############################
matches = re.finditer(r'\b\w+\b', 'hello world')
for match in matches:
    print(match.group())  # Outputs: hello \n world

######################### subn ###############################
result = re.subn(r'cat', 'dog', 'the cat sat on the cat')
print(result) 

hello
world
hello crazy world
hello world
['123', '456']
the dog sat on the dog
hello\.\ how\ are\ you\?
['one', 'two', 'three', 'four']
hello
world
('the dog sat on the dog', 2)


In [61]:
# practice questios: 

sample_string = "The quick brown fox jumps over the lazy dog. Email: test.email+alex@leetcode.com or contact_us@company.org. Visit https://www.example.com/path?query=123&lang=en or http://short.url for more info. Today is 2024-06-21. Call us at (123) 456-7890 or 123-456-7890. Hex color codes: #1a2b3c, #FFF, #123456. IPv4 addresses: 192.168.1.1, 255.255.255.255. Use password P@ssw0rd123! and reset it by 12:34 PM."

In [62]:
# find all email address
pattern = r'((www\.)?[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-zA-Z]{2,})'
match = re.finditer(pattern,sample_string )
for m in match:
    print(m.group())

test.email+alex@leetcode.com
contact_us@company.org


In [63]:
# url 
pattern = r"https?:\/\/(www\.)?[^\s\/]+\.[a-zA-Z]{2,}(\/[\S]+)?"
match = re.finditer(pattern,sample_string )
for m in match:
    print(m.group())

https://www.example.com/path?query=123&lang=en
http://short.url


In [64]:
# phone number 

sample_text = "Here are some phone numbers: (123) 456-7890, +1 (234) 567-8901, 345.678.9012 4374383992"
pattern = r"(\+\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
match = re.finditer(pattern,sample_text )
for m in match:
    print(m.group())

 (123) 456-7890
+1 (234) 567-8901
 345.678.9012
 4374383992


In [65]:
# IP address
pattern = r'(?:\d{1,3}\.){3}\d{1,3}'
print(re.findall(pattern,sample_string))


pattern = r'(\d{1,3}\.){3}\d{1,3}'
print(re.findall(pattern,sample_string))

['192.168.1.1', '255.255.255.255']
['1.', '255.']



- **Capturing Group (`()`)**: Captures the matched text and stores it for later use or extraction using functions like `re.findall`.
- **Non-Capturing Group `(?:)`**: Matches the pattern enclosed in parentheses but does not capture or store the matched text. It is useful when you need to group elements for applying quantifiers or alternatives but don't need to extract them separately.


In [66]:
import re
text = "123-4567 hello 8910"

# Capturing 
pattern = r"(\d{3})?-?(\d{4})"
matches = re.findall(pattern, text)
print(matches)  # Output: [('123', '4567'), ('', '8910')]


# Non Capturing 
pattern = r"(?:\d{3})?-?(?:\d{4})"
matches = re.findall(pattern, text)
print(matches)  # Output: ['123-4567', '8910']


[('123', '4567'), ('', '8910')]
['123-4567', '8910']


## Text Wrangling

#### Bag of Words

- Turns arbitrary text into fixed-length vectors by counting how many times each word appears in the corpus. 
- This approach is very simple and flexible technique. It involves two things:
  - A vocabulary of known words
  - A measure of the presence of known words
- The feature vector representing each will be sparse in nature as the words in each document will represent only a small subset of words out of all words (bag-of-words) present in entire set of document.


Drawbacks: 
- The size of the vector increases with the size of the vocabulary which may cause sparsity.
- Discarding the word order ignores the context and in turn meaning of the words in documents.
- Cannot handle out of vocabulary (OOV) tokens.

In [97]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer

text = ["We love Natural Language Processing we.", "Guys, we need to work on the Natural Language Processing."]

vectorizer = CountVectorizer()
data = vectorizer.fit_transform(text)

bow = pd.DataFrame(data.toarray(), columns=vectorizer.get_feature_names_out())
bow

Unnamed: 0,guys,language,love,natural,need,on,processing,the,to,we,work
0,0,1,1,1,0,0,1,0,0,2,0
1,1,1,0,1,1,1,1,1,1,1,1


In [99]:
# print the vocabulary 
print(vectorizer.vocabulary_)

{'we': 9, 'love': 2, 'natural': 3, 'language': 1, 'processing': 6, 'guys': 0, 'need': 4, 'to': 8, 'work': 10, 'on': 5, 'the': 7}


In [None]:
# Some important arguments in Count Vectorizer; 
ngram_range=(1,3) # N-Gram of length 1-3
min_df=2 # Repeated atleast twice 
max_df=0.9 # Occurs in more than 90% of the document.

#### TF-IDF

- In bag of words all words in the text are treated equally and there’s no consideration that some words in the document are more important than others. 
- The TF-IDF technique aims to quantify the importance of a given word relative to other words in the document.
- The main intuition behind TF-IDF is that if a word $w$ appears much time in the document $d_i$ but doesn’t appear in the rest of the document $d_j$ then word w must be of great importance to the document $di$.
- The importance of $w$ should increase in proportion to its frequency in $d_i$, but at the same time, its importance should decrease in the proportion to other documents $d_j$.

<u>Term Frequency:</u> 

- The TF score measures how often a word occurs in a document.

$$\mathrm{TF}\left(t,d\right)=\frac{(\text{Number of occurrences of term }t\text{ in document }d)}{(\text{Total number of terms in the document }d)}$$

- In a given corpus, we can have different lengths of documents, and the occurrence of a word in a document may vary based on the length of the document.To normalize this, we divide the number of occurrences by the length of the document.

<u>Inverse Document Frequency</u>

- It measures the importance of a word across the corpus.
  
$$
\mathrm{IDF}\left(t\right)=\log_e\frac{(\text{Total number of documents in the corpus})}{(\text{Number of documents with term }t\text{ in them })}
$$

- The Term frequency(TF) gives equal weight to all words, but this may not be useful as stopwords like is, am, are, etc., are not important, even they occur frequently.

- To account for this IDF weights up the term that occurs less commonly in the corpus and weights down the words that occur very frequently in the corpus. 

Drawbacks: 
-  It is based on the bag of words (BOW) model, therefore it doesn’t capture the position in the text, semantics, etc.
- Cannot handle out of vocabulary (OOV) tokens.
- The feature vectors are sparse and high dimensional.

Example: 

![](https://miro.medium.com/v2/resize:fit:720/format:webp/1*-0g0HFp9BKdgYSJOUhlKOA.png)

![](https://miro.medium.com/v2/resize:fit:640/format:webp/1*ZXVW1MFpPGOgDne8spdawA.png)

![](https://miro.medium.com/v2/resize:fit:828/format:webp/1*owBBWicmILMFw7gHvE5rzw.png)

![](https://miro.medium.com/v2/resize:fit:828/format:webp/1*FCG4RqeoOIdttCVHh9qxcA.png)

In [103]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

text = ["Jupiter is the largest planet", "Mars is the fourth planet from the sun"]

vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(text)

tfidf = pd.DataFrame(data.toarray(), columns=vectorizer.get_feature_names_out())
tfidf

Unnamed: 0,fourth,from,is,jupiter,largest,mars,planet,sun,the
0,0.0,0.0,0.379303,0.533098,0.533098,0.0,0.379303,0.0,0.379303
1,0.376957,0.376957,0.268208,0.0,0.0,0.376957,0.268208,0.376957,0.536416


In [104]:
print(vectorizer.vocabulary_)

{'jupiter': 3, 'is': 2, 'the': 8, 'largest': 4, 'planet': 6, 'mars': 5, 'fourth': 0, 'from': 1, 'sun': 7}


In [None]:
# Some important arguments in TF-IDF Vectorizer; 
ngram_range=(1,3) # N-Gram of length 1-3
min_df=2 # Repeated atleast twice 
max_df=0.9 # Occurs in more than 90% of the document.

## Text Preprocessing

#### Puncutation Removal

In [67]:
import string
my_string = "Hello!Arjan!#^*(!*@)Ho*w are you F*cker?"

'''
translate() : retuns string where each char is mapped to corresponding char in 
translation table. 

translation table is formed using maketrans()

'''
puncts = string.punctuation
table = str.maketrans('','',puncts)
clean_str = my_string.translate(table)
print(clean_str)

HelloArjanHow are you Fcker


#### WhiteSpace Removal

In [68]:
# Remove leading and ending white spaces 

input_string = "\t Hello Rajan Ghimire  "
print(input_string)
print(input_string.strip())

	 Hello Rajan Ghimire  
Hello Rajan Ghimire


#### POS Tagging

In [69]:
import nltk
from textblob import TextBlob

# download 
# nltk.download("tagsets")
# nltk.download("averaged_perceptron_tagger")

text = ("I love to write programming blogs.""Those blogs are available at my portfolio site.")

tagger = TextBlob(text)

print(tagger.tags)


[('I', 'PRP'), ('love', 'VBP'), ('to', 'TO'), ('write', 'VB'), ('programming', 'VBG'), ('blogs.Those', 'JJ'), ('blogs', 'NNS'), ('are', 'VBP'), ('available', 'JJ'), ('at', 'IN'), ('my', 'PRP$'), ('portfolio', 'NN'), ('site', 'NN')]


In [70]:
# print the tags-set
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

#### Named Entity Recognization

(NER) seeks to locate and classify named entities in text into pre-defined categories such as the 
- names of persons, 
- organizations, 
- locations, 
- expressions of times, 
- quantities, 
- monetary values, 
- percentages, etc. 

NER is used in many fields in Natural Language Processing (NLP), and it can help answering many real-world questions:

Which companies were mentioned in the news article?
Were specified products mentioned in complaints or reviews?
Does the tweet contain the name of a person? Does the tweet contain this person’s location?


In [71]:
# nltk.download("maxent_ne_chunker")
# nltk.download("words")

In [72]:
from nltk.tokenize import sent_tokenize, word_tokenize

my_string = "Rajan Ghimire worked for Google and attended meeting in Kathmandu. I study in Lambton College in Toronto."

# apply sentence tokenizer 
sentences = sent_tokenize(my_string)

# apply word tokneizer 
words = [word_tokenize(x) for x in sentences]

# apply pos tagger ? Why pos 
pos = [nltk.pos_tag(x) for x in words]

# get ner chunks named entity : chunker to chunk the given list of tagged sentences
chunked_sentences = nltk.ne_chunk_sents(pos, binary=True)

# for sent in chunked_sentences:
#     print(sent)


In [73]:
# only print the named entities
# use hasattr(), if NE chunk then its a nltk object and 
# in that object it is stored in 'label' varibale. First check if object has label 
# attribute and if present, check if label is NE
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE":
            print(chunk)

(NE Rajan/NNP Ghimire/NNP)
(NE Google/NNP)
(NE Kathmandu/NNP)
(NE Lambton/NNP College/NNP)
(NE Toronto/NNP)


#### Collocations

-  It is a phrase consisting of more than one word but these words more commonly co-occur in a given context than its individual word parts.
-  For example, the phrase ‘CT scan’ is more likely to co-occur than do ‘CT’ and ‘scan’ individually. ‘CT scan’ is also a meaningful phrase.
-  **How do we make good selections for collocations?**
-  Co-occurences may not be sufficient as phrases such as ‘of the’ may co-occur frequently, but are not meaningful.
  
Methods to filter out the most meaningful collocations: 
- frequency counting, 
- Pointwise Mutual Information (PMI), and 
- hypothesis testing (t-test and chi-square).

Some uses for collocation identification are:
-  Keyword extraction
-  Bigrams/Trigrams can be concatenated (e.g. social media -> social_media) and counted as one word to improve insights analysis

In [74]:
# lets use a sample dataset provided by NLTK. 
# Load the data and convert the data into lowercase

import nltk
import pandas as pd

nltk.download('webtext')

from nltk.corpus import webtext

words = [w.lower() for w in webtext.words('grail.txt')]

print(words[-5:])

[']', 'cameraman', ':', 'christ', '!']


[nltk_data] Downloading package webtext to /home/rjn/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


Method 1: Counting frequencies of adjacent words with part of speech filters

In [75]:
# The simplest method is to rank the most frequent bigrams or trigrams:

from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures

##################Bigram##################

# Create a BigramCollocationFinder object
bigram_finder = BigramCollocationFinder.from_words(words)

# Apply frequency filter to remove infrequent bigrams
bigram_finder.apply_freq_filter(5)

# filter bigram using likelihood ratio
filtered = bigram_finder.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=5)

print(filtered)


##################Trigram##################

# Create a TrigramCollocationFinder object
trigram_finder = TrigramCollocationFinder.from_words(words)

# Apply frequency filter to remove infrequent trigrams
trigram_finder.apply_freq_filter(5)

# filter trigram using likelihood ratio
filtered = trigram_finder.nbest(score_fn=TrigramAssocMeasures.likelihood_ratio, n=5)

print(filtered)

[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't'), ('villager', '#')]
[('[', 'boom', ']'), ('[', 'singing', ']'), ('[', 'music', ']'), ('[', 'clang', ']'), ('.', 'arthur', ':')]


However, a common issue with this is adjacent spaces, stop words, articles, prepositions or pronouns are common and are not meaningful. Lets apply text cleaning and see the reults.

In [76]:
from nltk.corpus import stopwords

stopsets = set(stopwords.words('english'))

# if len < 3, remove else remove stopwords
filter_stop = lambda x : len(x) < 3 or x in stopsets


##################Bigram##################

# Create a BigramCollocationFinder object
bigram_finder = BigramCollocationFinder.from_words(words)

# apply text cleaner 
bigram_finder.apply_word_filter(fn=filter_stop)

# Apply frequency filter to remove infrequent bigrams
bigram_finder.apply_freq_filter(5)

# filter bigram using likelihood ratio
filtered = bigram_finder.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=5)

print(filtered)



##################Trigram##################

# Create a TrigramCollocationFinder object
trigram_finder = TrigramCollocationFinder.from_words(words)

# Apply frequency filter to remove infrequent bigrams
trigram_finder.apply_word_filter(fn=filter_stop)

# Apply frequency filter to remove infrequent trigrams
trigram_finder.apply_freq_filter(5)

# filter trigram using likelihood ratio
filtered = trigram_finder.nbest(score_fn=TrigramAssocMeasures.likelihood_ratio, n=5)

print(filtered)


[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble'), ('squeak', 'squeak')]
[('clop', 'clop', 'clop'), ('mumble', 'mumble', 'mumble'), ('squeak', 'squeak', 'squeak'), ('saw', 'saw', 'saw'), ('pie', 'iesu', 'domine')]


**Working Mechanims**

Words -> BigramCollocationFinder/TrigramCollocationFinder -> create frequencey distrbution -> apply filtering (word eliminator, frequency filter) -> apply generic sorting function

In [77]:
trigram_freq = trigram_finder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

trigramFreqTable.head()

Unnamed: 0,trigram,freq
6,"(squeak, squeak, squeak)",15
5,"(saw, saw, saw)",14
0,"(clop, clop, clop)",13
2,"(pie, iesu, domine)",10
9,"(mumble, mumble, mumble)",10


Method 2: Pointwise Mutual Information (PMI)
Bigram: 

$$PMI(w^1,w^2)=log_2\frac{P(w^1,w^2)}{P(w^1)P(w^2)}$$

Trigram:

$$PMI(w^1,w^2,w^3)=log_2\frac{P(w^1,w^2,w^3)}{P(w^1)P(w^2)P(w^3)}$$

- The main intuition is that it measures how much more likely the words co-occur than if they were independent.
- However, it is very sensitive to rare combination of words. 
- For example, if a random bigram ‘abc xyz’ appears, and neither ‘abc’ nor ‘xyz’ appeared anywhere else in the text, ‘abc xyz’ will be identified as highly significant bigram when it could just be a random misspelling or a phrase too rare to generalize as a bigram. Therefore, this method is often used with a frequency filter.

In [78]:
##################Bigram##################

# Create a BigramCollocationFinder object
bigram_finder = BigramCollocationFinder.from_words(words)

# apply text cleaner 
bigram_finder.apply_word_filter(fn=filter_stop)

# Apply frequency filter to remove infrequent bigrams
bigram_finder.apply_freq_filter(5)

# filter bigram using PMI
filtered = bigram_finder.nbest(score_fn=BigramAssocMeasures.pmi, n=5)

print(filtered)



##################Trigram##################

# Create a TrigramCollocationFinder object
trigram_finder = TrigramCollocationFinder.from_words(words)

# Apply frequency filter to remove infrequent bigrams
trigram_finder.apply_word_filter(fn=filter_stop)

# Apply frequency filter to remove infrequent trigrams
trigram_finder.apply_freq_filter(5)

# filter trigram using PMI
filtered = trigram_finder.nbest(score_fn=TrigramAssocMeasures.pmi, n=5)

print(filtered)


pd.DataFrame(list(trigram_finder.score_ngrams(TrigramAssocMeasures.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

[('dramatic', 'chord'), ('dona', 'eis'), ('eis', 'requiem'), ('hand', 'grenade'), ('angels', 'sing')]
[('dona', 'eis', 'requiem'), ('pie', 'iesu', 'domine'), ('saw', 'saw', 'saw'), ('clap', 'clap', 'clap'), ('squeak', 'squeak', 'squeak')]


Unnamed: 0,trigram,PMI
0,"(dona, eis, requiem)",22.100888
1,"(pie, iesu, domine)",21.457032
2,"(saw, saw, saw)",19.645854
3,"(clap, clap, clap)",18.702144
4,"(squeak, squeak, squeak)",17.433835
5,"(heh, heh, heh)",17.044521
6,"(mumble, mumble, mumble)",16.702144
7,"(clop, clop, clop)",15.945121
8,"(brave, sir, robin)",14.61355
9,"(boom, boom, boom)",13.947257


#### Synonym and Antonym

- Synonyms and Antonyms are part of the WordNet. WordNet is a large lexical database for the English language.
- We use synsets to extract the synonyms and antonyms. 
- Synset: It is also called as synonym set or collection of synonym words

Wordnet: 
- WordNet is a large lexical database of English. Nouns, verbs, adjectives and adverbs are grouped into sets of cognitive synonyms (synsets), each expressing a distinct concept. 
- Synsets are interlinked by means of `conceptual-semantic` and `lexical relations`.
- WordNet superficially resembles a thesaurus, in that it groups words together based on their meanings. But there are some differences: 
  
  - WordNet connects specific meanings of words, not just the words themselves. This means that words that are related in WordNet are clearly defined by their meanings, avoiding confusion about what each word means.
  - WordNet shows the relationships between words, like whether they are synonyms, antonyms, or related in other ways. In contrast, a thesaurus groups words together based only on similar meanings, without showing how they are related.

**Synset components:**
- `<lemma>` is the word’s morphological stem
- `<pos>` is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
  - n    NOUN
  - v    VERB
  - a    ADJECTIVE
  - s    ADJECTIVE SATELLITE
  - r    ADVERB 

- `<number>` is the sense number, counting from 0. This is used to disambiguate word meanings


In [87]:
from nltk.corpus import wordnet
syns = wordnet.synsets("good")
print(syns)
print(syns[1].lemmas())
print(syns[1].name())
print(syns[1].definition())
print(syns[1].examples())
print(syns[1].lemmas()[0].name())

# check if antonym
print(syns[1].lemmas()[0].antonyms())
print(syns[1].lemmas()[0].antonyms()[-1].name())

[Synset('good.n.01'), Synset('good.n.02'), Synset('good.n.03'), Synset('commodity.n.01'), Synset('good.a.01'), Synset('full.s.06'), Synset('good.a.03'), Synset('estimable.s.02'), Synset('beneficial.s.01'), Synset('good.s.06'), Synset('good.s.07'), Synset('adept.s.01'), Synset('good.s.09'), Synset('dear.s.02'), Synset('dependable.s.04'), Synset('good.s.12'), Synset('good.s.13'), Synset('effective.s.04'), Synset('good.s.15'), Synset('good.s.16'), Synset('good.s.17'), Synset('good.s.18'), Synset('good.s.19'), Synset('good.s.20'), Synset('good.s.21'), Synset('well.r.01'), Synset('thoroughly.r.02')]
[Lemma('good.n.02.good'), Lemma('good.n.02.goodness')]
good.n.02
moral excellence or admirableness
['there is much good to be found in people']
good
[Lemma('evil.n.03.evil')]
evil


In [80]:
def find_synonym_antonym(word):
    
    synonym = []
    antonym = []
    
    for synset_object in wordnet.synsets(word):
        for lemma in synset_object.lemmas():
            synonym.append(lemma.name())
            
            if lemma.antonyms():
                for ant in lemma.antonyms():
                    antonym.append(ant.name())
                    
    
    return set(synonym), set(antonym)

In [83]:
find_synonym_antonym("hell")

({'Hades',
  'Hell',
  'Inferno',
  'Scheol',
  'blaze',
  'hell',
  'hell_on_earth',
  'hellhole',
  'infernal_region',
  'inferno',
  'nether_region',
  'netherworld',
  'perdition',
  'pit',
  'sin',
  'snake_pit',
  'the_pits',
  'underworld'},
 {'Heaven'})

#### Fix word lengthening

In [88]:
import re 

def remove_lengthening(text):
    pattern = re.compile(r"(.)\1+")
    result = pattern.sub(r"\1\1", text)
    if len(result) > 1 and result[-1] == result[-2]:
        result = result[:-1]
    
    return result

print(remove_lengthening('Hellllloooooo'))  

Hello


#### Spell Checking

In [90]:
from autocorrect import Speller
spell = Speller(lang="en")
print(spell("mussage"))
print(spell("caaar"))

message
car


In [92]:
from pattern.en import suggest
print(suggest("mussage"))
print(suggest("survice"))
print(suggest("hte"))
print(suggest("caaar"))