**Shibu Mohapatra MSC AI**

# UNIT 1: Introduction to Natural Language Processing

## Text processing terminology

### Noise removal

In [None]:
import re
 
text = "Five fantastic fish flew off to find faraway functions. Maybe find another five fantastic fish? Find my fish with a function please!"

# Character class listing the punctuation marks treated as noise.
punctuation_pattern = re.compile(r'[\.\?\!\,\:\;\"]')

# Delete every match; words and the spaces between them are left as they are.
result = punctuation_pattern.sub('', text)
print(result)

Five fantastic fish flew off to find faraway functions Maybe find another five fantastic fish Find my fish with a function please


### Text Normalizing - Stemming

In [None]:
from nltk.stem import PorterStemmer
 
# Pre-tokenized sample sentence; the stemmer works on one token at a time.
tokenized = ["So", "many", "squids", "are", "jumping"]

stemmer = PorterStemmer()

# Stem every token in order. Stems are not always dictionary words
# (for example "many" becomes "mani").
stemmed = list(map(stemmer.stem, tokenized))

print(stemmed)

['So', 'mani', 'squid', 'are', 'jump']


### Lemmatization

In [None]:
import nltk
# Download the WordNet corpus, which the lemmatization cell below relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
 
# Two inflected sample tokens to reduce to their dictionary form.
tokenized = ["squids","jumps"]

lemmatizer = WordNetLemmatizer()

# No part-of-speech tag is passed, so the lemmatizer's default is used for each token.
lemmatized = list(map(lemmatizer.lemmatize, tokenized))

print(lemmatized)

['squid', 'jump']


### Stopword removal

In [None]:
import nltk
# Stop-word lists read through nltk.corpus.stopwords in the next cell.
nltk.download('stopwords')
# Punkt tokenizer data, downloaded ahead of the tokenizer calls in the next cell.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
 
example = "This is a sample sentence, showing off the stop words filtration."
# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

# Keep only the tokens that are not stop words. The filter has to run over
# `word_tokens`, not over `example`: iterating the raw string would yield
# single characters instead of words. The membership test is case-sensitive
# and punctuation tokens are left untouched.
statement_no_stop = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(statement_no_stop)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### TF-IDF Model

In [None]:
# Sample paragraph for the pre-processing cells below (same text as the noise-removal example).
paragraph = "Five fantastic fish flew off to find faraway functions. Maybe find another five fantastic fish? Find my fish with a function please!"

In [None]:
# Light text pre-processing: turn each sentence of `paragraph` into a cleaned,
# stemmed, space-separated string and collect the strings in `corpus`.
# Uses `re`, `nltk`, `stopwords` and `stemmer` from the earlier cells.

sentences = nltk.sent_tokenize(paragraph)

# Build the stop-word set once instead of once per word.
english_stop_words = set(stopwords.words('english'))

corpus = []

for raw_sentence in sentences:
  letters_only = re.sub('[^a-zA-Z]', ' ', raw_sentence)  # replace every non-letter character with a space
  words = letters_only.lower().split()  # lower-case, then split on whitespace into words
  stems = [stemmer.stem(word) for word in words if word not in english_stop_words]  # drop stop words, stem the rest
  corpus.append(' '.join(stems))  # store the sentence as one space-separated string

In [None]:
#creating TF-IDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer()
# Fit the vectorizer on `corpus` and transform it in one step.
tf_idf_matrix = tf_idf.fit_transform(corpus)
# Convert the result to an array and keep it as `tf_idf_vector`.
tf_idf_vector = tf_idf_matrix.toarray()

In [None]:
from gensim.models import Word2Vec

# Same light pre-processing as in the TF-IDF section, but lemmatizing instead
# of stemming.
# NOTE: the results are appended to the existing `corpus` (it is not reset
# here), so `corpus` ends up holding the earlier stemmed sentences followed by
# the lemmatized ones. Because `corpus` is rebound to token lists at the end,
# re-run the TF-IDF pre-processing cell before re-running this one.

# Build the stop-word set once instead of once per word.
english_stop_words = set(stopwords.words('english'))

for raw_sentence in sentences:
  letters_only = re.sub('[^a-zA-Z]', ' ', raw_sentence)  # replace every non-letter character with a space
  words = letters_only.lower().split()  # lower-case, then split on whitespace into words
  lemmas = [lemmatizer.lemmatize(word) for word in words if word not in english_stop_words]  # drop stop words, lemmatize the rest
  corpus.append(' '.join(lemmas))  # store the sentence as one space-separated string

# Split every sentence string in `corpus` into a list of word tokens.
corpus = [nltk.word_tokenize(sentence_text) for sentence_text in corpus]

In [None]:
# Display the combined corpus: the stemmed sentences first, then the lemmatized ones.
corpus

[['five', 'fantast', 'fish', 'flew', 'find', 'faraway', 'function'],
 ['mayb', 'find', 'anoth', 'five', 'fantast', 'fish'],
 ['find', 'fish', 'function', 'pleas'],
 ['five', 'fantastic', 'fish', 'flew', 'find', 'faraway', 'function'],
 ['maybe', 'find', 'another', 'five', 'fantastic', 'fish'],
 ['find', 'fish', 'function', 'please']]

## Tokenization

In [None]:
import nltk
# Punkt tokenizer data for the tokenizer calls in the next cell (also downloaded in the Stopword section).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
 
text1 = "This is a text to tokenize. This is also a sentence."

# Split the same text two ways: into word-level tokens and into sentences.
tokenized = word_tokenize(text1)
sentence = sent_tokenize(text1)

# Word tokens on the first output line, sentences on the second.
print(tokenized, sentence, sep="\n")

['This', 'is', 'a', 'text', 'to', 'tokenize', '.', 'This', 'is', 'also', 'a', 'sentence', '.']
['This is a text to tokenize.', 'This is also a sentence.']


## Sentence Segmentation

In [None]:
#import spacy library
import spacy
  
# Load the English pipeline named "en_core_web_sm" and keep it as `nlp` for the cells below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# The text is passed as a plain string literal; no u"" prefix is used.
sample_text = "I like Artificial Intelligence. I taught myself many concepts of Natural Language Processing. I have a vision to become a Data Scientist."
doc = nlp(sample_text)

# Print each sentence of the processed document on its own line.
for sent in doc.sents:
  print(sent)

I like Artificial Intelligence.
I taught myself many concepts of Natural Language Processing.
I have a vision to become a Data Scientist.


In [None]:
# doc.sents is a generator, so collect its items into a list to allow indexing.
doc1 = [sent for sent in doc.sents]

# Random access now works: index 1 is the second sentence.
doc1[1]

I taught myself many concepts of Natural Language Processing.