# Natural Language Processing


## Regular Expressions

In [1]:
import re

In [2]:
text = "The text number of the Agent is 205-830-2801. Call soon!"

In [4]:
"number" in text

True

In [6]:
pattern = "number"

match = re.search(pattern, text)

In [7]:
match.span()

(9, 15)

In [8]:
match.start()

9

In [9]:
match.end()

15

In [10]:
text = "My phone is a new phone"

In [11]:
pattern = 'phone'

matches = re.findall(pattern, text)

In [12]:
print(matches)

['phone', 'phone']


In [13]:
# iteration over the matches

for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Exammple Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Exammple Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr>

<tr ><td><span >^</span></td><td>Starts with</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >'$'</span></td><td>Ends with</td><td>plurals?</td><td>plural</td></tr> </table>

#### Defining pattern for the phone number

In [16]:
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

text = 'My ne phone number is 205-830-2801.'

match = re.findall(pattern, text)

In [17]:
print(match)

['205-830-2801']


In [19]:
pattern = r'\d{3}-\d{3}-\d{4}'

match = re.findall(pattern, text)

print(match)

['205-830-2801']


#### Grouping together the small patterns in our original pattern

In [21]:
pattern = r'(\d{3})-(\d{3})-(\d{4})'

match = re.search(pattern, text)

print(match)

print(match.group(1))
print(match.group(2))

<re.Match object; span=(22, 34), match='205-830-2801'>
205
830


#### Using or '|' operator to mind either of the matching strings

In [22]:
re.search(r'man | woman', 'This man is so notorious')

<re.Match object; span=(5, 9), match='man '>

#### Wildcard operators

*Find all words in text ending with at and having a single letter before*

In [23]:
text = "This men has cat who eat rat and she's very fat"


match = re.findall(r'.at', text)

print(match)

['cat', 'eat', 'rat', 'fat']


In [24]:
# for string having more preceding characters

text = "This men has cat who eat rat and she's very fat and splat"


match = re.findall(r'.at', text)

print(match)

['cat', 'eat', 'rat', 'fat', 'lat']


In [25]:
re.search(r'\d$', "My phone number is 205-830-2801")

<re.Match object; span=(30, 31), match='1'>

In [26]:
# Excluding digits

re.findall(r'[^\d]', "There are 3 numbers 34 inside 5 sentences")

['T',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 's']

In [27]:
# get everything that's not a digits

re.findall(r'[^\d]+', "There are 3 numbers 34 inside 5 sentences")


#This can be used for removing punctuations from the strings

['There are ', ' numbers ', ' inside ', ' sentences']

In [29]:
text = "This is a string! which has punctuation. how to remove it ?"

re.findall(r'[^\W]+', text)

['This',
 'is',
 'a',
 'string',
 'which',
 'has',
 'punctuation',
 'how',
 'to',
 'remove',
 'it']

In [31]:
# Extracting hypened words

text = "only find the hyphen-words. which are long-ish ?"

match = re.findall(r'[\w]+-[\w]+', text)

print(match)

['hyphen-words', 'long-ish']


## Bag Of Words Model (Vector based model)

#### Numpy Implementation

In [1]:
import numpy as np

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a list of unique words (vocabulary)
words = set(" ".join(documents).split())

# Initialize an empty NumPy array to store the count vectors
count_matrix = np.zeros((len(documents), len(words)), dtype=int)

# Loop through each document and count word occurrences
for i, doc in enumerate(documents):
    for j, word in enumerate(words):
        count_matrix[i, j] = doc.split().count(word)

# Print the count_matrix
print(count_matrix)

[[0 0 0 1 1 0 0 1 1 1 0 0 0]
 [1 0 0 1 0 0 0 1 1 1 0 0 1]
 [0 1 0 1 0 1 1 0 1 0 1 0 0]
 [0 1 1 0 1 0 0 0 1 0 0 1 0]]


#### SciPy Implementation

*Preffered because of sparse matrix*

In [2]:
from scipy.sparse import dok_matrix

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a list of unique words (vocabulary)
words = set(" ".join(documents).split())

# Initialize an empty dictionary of keys (DOK) matrix
count_matrix = dok_matrix((len(documents), len(words)), dtype=int)

# Loop through each document and count word occurrences
for i, doc in enumerate(documents):
    for j, word in enumerate(words):
        count_matrix[i, j] = doc.split().count(word)

# Convert the DOK matrix to a Compressed Sparse Row (CSR) matrix
count_matrix = count_matrix.tocsr()

# Print the count_matrix
print(count_matrix.toarray())

[[0 0 0 1 1 0 0 1 1 1 0 0 0]
 [1 0 0 1 0 0 0 1 1 1 0 0 1]
 [0 1 0 1 0 1 1 0 1 0 1 0 0]
 [0 1 1 0 1 0 0 0 1 0 0 1 0]]


#### Sklearn Implementation

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
train_documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

test_documents = [
    
    "is this the third one?",
    "this one is the first document",
]

vectorizer = CountVectorizer()

train_vectorizer = vectorizer.fit_transform(train_documents)

test_vectorizer = vectorizer.transform(test_documents)

print(train_vectorizer)

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1
