# Importing Libraries

In [3]:
import re

In [9]:
text = """Diwali (English: /dɪˈwɑːliː/; Deepavali[3] (IAST: dīpāvalī) or Divali;[a]) is the Hindu festival of lights with its variations also celebrated in other Indian religions. It symbolises the spiritual "victory of light over darkness, good over evil, and knowledge over ignorance".[4][5][6][7] Diwali is celebrated during the Hindu lunisolar months of Ashvin (according to the amanta tradition) and Kartika—between around mid-September and mid-November.[8][9][10][11] The celebrations generally last five or six days.[12][13]
Diwali is connected to various religious events, deities and personalities, such as being the day Rama returned to his kingdom in Ayodhya with his wife Sita and his brother Lakshmana after defeating the demon king Ravana.[14] It is also widely associated with Lakshmi, the goddess of prosperity, and Ganesha, the god of wisdom and the remover of obstacles.[15] Other regional traditions connect the holiday to Vishnu, Krishna, Durga, Shiva, Kali, Hanuman, Kubera, Yama, Yami, Dhanvantari, or Vishvakarman."""

# Sentence Segmentation

In [6]:
def segment_sentences(text):
  """Split ``text`` into sentences.

  A boundary is any run of whitespace (spaces, tabs, newlines) that comes
  immediately after '.', '!' or '?'.

  Args:
    text: The string to segment.

  Returns:
    A list of sentences with surrounding whitespace stripped. Empty pieces
    (from empty input or trailing whitespace) are omitted.

  Limitation: the terminator must sit directly before the whitespace, so a
  sentence ending in something like ``.[4]`` or ``."`` is not split there.
  """
  # \s+ instead of " +": a sentence that ends a line/paragraph must also be
  # separated from the one that starts on the next line.
  pieces = re.split(r"(?<=[.!?])\s+", text)
  stripped = (piece.strip() for piece in pieces)
  # Drop the empty strings re.split yields for trailing whitespace / "".
  return [sentence for sentence in stripped if sentence]

In [10]:
segment_sentences(text)

['Diwali (English: /dɪˈwɑːliː/; Deepavali[3] (IAST: dīpāvalī) or Divali;[a]) is the Hindu festival of lights with its variations also celebrated in other Indian religions.',
 'It symbolises the spiritual "victory of light over darkness, good over evil, and knowledge over ignorance".[4][5][6][7] Diwali is celebrated during the Hindu lunisolar months of Ashvin (according to the amanta tradition) and Kartika—between around mid-September and mid-November.[8][9][10][11] The celebrations generally last five or six days.[12][13]\nDiwali is connected to various religious events, deities and personalities, such as being the day Rama returned to his kingdom in Ayodhya with his wife Sita and his brother Lakshmana after defeating the demon king Ravana.[14] It is also widely associated with Lakshmi, the goddess of prosperity, and Ganesha, the god of wisdom and the remover of obstacles.[15] Other regional traditions connect the holiday to Vishnu, Krishna, Durga, Shiva, Kali, Hanuman, Kubera, Yama, Y

# Tokenization

In [8]:
def tokenize_document(document):
  """Break ``document`` into tokens.

  The text is split on runs of whitespace and the punctuation characters
  ``. , ; ! ? ( )``; those delimiters are discarded. Any other character
  (quotes, square brackets, colons, hyphens, ...) stays attached to its token.

  Args:
    document: The string to tokenize.

  Returns:
    A list of non-empty token strings in document order.
  """
  pieces = re.split(r"[\s.,;!?()]+|[.,;!?()]", document)
  # re.split emits '' when the document starts or ends with a delimiter;
  # filter(None, ...) removes exactly those falsy entries.
  return list(filter(None, pieces))

In [11]:
tokenize_document(text)

['Diwali',
 'English:',
 '/dɪˈwɑːliː/',
 'Deepavali[3]',
 'IAST:',
 'dīpāvalī',
 'or',
 'Divali',
 '[a]',
 'is',
 'the',
 'Hindu',
 'festival',
 'of',
 'lights',
 'with',
 'its',
 'variations',
 'also',
 'celebrated',
 'in',
 'other',
 'Indian',
 'religions',
 'It',
 'symbolises',
 'the',
 'spiritual',
 '"victory',
 'of',
 'light',
 'over',
 'darkness',
 'good',
 'over',
 'evil',
 'and',
 'knowledge',
 'over',
 'ignorance"',
 '[4][5][6][7]',
 'Diwali',
 'is',
 'celebrated',
 'during',
 'the',
 'Hindu',
 'lunisolar',
 'months',
 'of',
 'Ashvin',
 'according',
 'to',
 'the',
 'amanta',
 'tradition',
 'and',
 'Kartika—between',
 'around',
 'mid-September',
 'and',
 'mid-November',
 '[8][9][10][11]',
 'The',
 'celebrations',
 'generally',
 'last',
 'five',
 'or',
 'six',
 'days',
 '[12][13]',
 'Diwali',
 'is',
 'connected',
 'to',
 'various',
 'religious',
 'events',
 'deities',
 'and',
 'personalities',
 'such',
 'as',
 'being',
 'the',
 'day',
 'Rama',
 'returned',
 'to',
 'his',
 'kingd

# Stemming

In [12]:
import nltk
from nltk.stem import PorterStemmer

In [13]:
stemmer = PorterStemmer()


In [14]:
def stem_words(words):
  """Return the stem of every word in ``words``, preserving order.

  Each word is passed to the ``stem`` method of the notebook-level
  ``stemmer`` object.

  Args:
    words: An iterable of word strings.

  Returns:
    A list with one stemmed string per input word.
  """
  stems = []
  for word in words:
    stems.append(stemmer.stem(word))
  return stems

In [15]:
words = ["running", "flies", "happily", "denied"]
stemmed_words = stem_words(words)
stemmed_words

['run', 'fli', 'happili', 'deni']

# Lemmatization

In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [17]:
def lemmatize_sentence(sentence):
  """Return the lemma of each token in ``sentence``.

  The sentence is processed with the notebook-level ``nlp`` pipeline and the
  ``lemma_`` attribute of every resulting token is collected.

  Args:
    sentence: The text to lemmatize.

  Returns:
    A list of lemma strings, one per token, in token order.
  """
  lemmas = []
  for token in nlp(sentence):
    lemmas.append(token.lemma_)
  return lemmas

In [18]:
sentence = "The cats are running faster than the dogs"
lemmas = lemmatize_sentence(sentence)
lemmas

['the', 'cat', 'be', 'run', 'fast', 'than', 'the', 'dog']

# Stopwords

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [20]:
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')
# Punkt tokenizer data for word_tokenize. Older NLTK releases load 'punkt';
# recent releases look up 'punkt_tab' instead, so fetch both to keep the
# notebook runnable on a fresh environment with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
def identify_stopwords(sentence):
  """Return the tokens of ``sentence`` that are English stop words.

  The sentence is tokenized with ``word_tokenize`` and each token is compared
  case-insensitively (via ``lower()``) against NLTK's English stop-word list.

  Args:
    sentence: The text to inspect.

  Returns:
    A list of the matching tokens in their original casing and order;
    repeated stop words appear once per occurrence.
  """
  english_stopwords = set(stopwords.words('english'))
  matches = []
  for token in word_tokenize(sentence):
    if token.lower() in english_stopwords:
      matches.append(token)
  return matches


In [22]:
sentence = "This is a sample sentence and it contains some stop words"
identify_stopwords(sentence)

['This', 'is', 'a', 'and', 'it', 'some']

# Parts of Speech Tagging

In [23]:
import spacy

In [24]:
nlp = spacy.load("en_core_web_sm")

In [25]:
def pos_tagging(sentences):
  """Tag every token of the given text with its part of speech.

  Args:
    sentences: The text to tag, as a single string (one or more sentences).

  Returns:
    A list of ``(token_text, pos_label)`` tuples in token order, where
    ``pos_label`` is the token's ``pos_`` attribute from the notebook-level
    ``nlp`` pipeline.
  """
  # Use the argument itself. The previous body read the notebook-global
  # `sentence`, which ignored the caller's input and failed on a fresh kernel.
  doc = nlp(sentences)
  return [(token.text, token.pos_) for token in doc]

In [26]:
sentence = "The quick brown fox jumps over the lazy dog"
tags = pos_tagging(sentence)
for word, pos in tags:
  print(f"{word} -> {pos}")

The -> DET
quick -> ADJ
brown -> ADJ
fox -> NOUN
jumps -> VERB
over -> ADP
the -> DET
lazy -> ADJ
dog -> NOUN


# Dependency Parsing

- Identify how words relate to each other in terms of grammatical functions
- A sentence is viewed as a directed graph
  - Words are nodes
  - Grammatical relationships between words are directed edges

In [27]:
import spacy
from spacy import displacy

In [28]:
nlp = spacy.load('en_core_web_sm')

In [31]:
def parse_dependencies(sentence):
  """Draw the dependency parse of ``sentence`` in the notebook.

  The sentence is processed with the notebook-level ``nlp`` pipeline and the
  result is handed to ``displacy.render`` in "dep" style with
  ``jupyter=True``.

  Args:
    sentence: The text to parse and visualize.

  Returns:
    None; the function is called for its rendering side effect.
  """
  displacy.render(nlp(sentence), style="dep", jupyter=True)

In [32]:
sentence = "The cat sat on the mat."
parse_dependencies(sentence)

# Ambiguity and Challenges in NLP

There are 3 kinds of Ambiguity

1. Lexical Ambiguity
- Lexical ambiguity exists when a single word in a sentence has two or more possible meanings.
- Example: Tom is looking for a **match**
  - It could be that either "Tom is looking for a partner" or "Tom is looking to play a game."

2. Syntactic Ambiguity
- Syntactic ambiguity exists when the grammatical structure of a sentence allows two or more possible interpretations.
- Example: I saw Nancy with binoculars
  - It could be that either Nancy had the binoculars, or I used binoculars to see Nancy

3. Referential Ambiguity
- Referential ambiguity exists when it is unclear which person or thing a pronoun refers to.
- Example: Jay went to meet Tina. She said, "I am hungry"
  - Who is hungry - Tina or Jay?