## import the libraries

In [None]:
import nltk
import string
from nltk import word_tokenize
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag


In [None]:
# Read the whole file into memory. The context manager closes the handle
# as soon as the read finishes, even if it raises.
with open("sample.txt", 'r') as file:
    text = file.read()

print(text)

### 1) Trim unwanted spaces

In [None]:
# str.strip() removes leading and trailing whitespace only; spacing inside
# the text is left untouched.
trimmed_text = text.strip()
print(trimmed_text)

### 2) Convert to lower or upper case

In [None]:
"""If we are trying to understand 
the sentiment of a given tweet, we 
may not convert the tweet to lowercase 
because uppercase is often used to emphasize sentiment i.e. 
'AWESOME'and 'awesome' have different levels of emphasis."""

In [None]:
# Normalise the trimmed text to lower case before tokenizing it.
converted_text = trimmed_text.lower()
print(converted_text)

### Step 3: Tokenize the text and determine its vocabulary

In [None]:
# Tokenize with NLTK's word_tokenize and report the token count and tokens.
# NOTE(review): word_tokenize is a general-purpose tokenizer; if smileys and
# hashtags must stay intact, nltk's TweetTokenizer is presumably the better
# fit - confirm.

tokenized_list = word_tokenize(converted_text)
print(len(tokenized_list))
print(tokenized_list)

In [None]:
# Alternative tokenization with wordpunct_tokenize; show how many tokens it
# produced, then the tokens themselves.
punct_token_list = wordpunct_tokenize(converted_text)
print(len(punct_token_list))
print(punct_token_list)

In [None]:
# The vocabulary is the set of distinct tokens produced by word_tokenize.
vocab_set = set()
vocab_set.update(tokenized_list)
print(len(vocab_set))
print(vocab_set)

### Step 4: Remove stop words from the text

In [None]:
# Keep only the vocabulary entries that are not English stop words.
set_wo_stopwords = vocab_set.difference(stopwords.words("english"))

In [None]:
set_wo_stopwords

In [None]:
# Remove punctuation. Tokenizers also emit multi-character punctuation tokens
# (for example "..." or "``"), which a set of single punctuation characters
# never matches, so drop every token made up solely of punctuation characters.
set_wo_punct = {
    token
    for token in set_wo_stopwords
    if not all(char in punctuation for char in token)
}
print(set_wo_punct)

### Step 6: Normalize the text using stemming and/or lemmatization

#### Stemming

In [None]:
# Reduce every remaining vocabulary word to its stem with the English
# Snowball stemmer.
stemObj = SnowballStemmer("english")
stemmed_list = [stemObj.stem(vocab_word) for vocab_word in set_wo_punct]

print(stemmed_list)

#### Lemmatization

In [None]:
# Parts of speech. pos_tag tags a *sequence*: the tag chosen for a word
# depends on its neighbours, and a set has no stable order, so tagging the
# set directly gives context-free, run-dependent tags. Tag the tokens in text
# order instead and keep the pairs whose word survived the stop-word and
# punctuation filters (one pair per occurrence).
pos_tag_list = [
    (word, tag)
    for word, tag in pos_tag(tokenized_list)
    if word in set_wo_punct
]
print(pos_tag_list)

In [None]:
# Import kept from the original cell.
from nltk import wordnet

# First letter of the POS tag -> part-of-speech code passed to the lemmatizer
# (adjective, verb, noun, adverb). Any other tag leaves the word unchanged.
_wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

# Lemmatization
lemmaObj = WordNetLemmatizer()
lemma_list = []
for word, pos in pos_tag_list:
    if pos[:1] in _wordnet_pos_by_prefix:
        lemma = lemmaObj.lemmatize(word, _wordnet_pos_by_prefix[pos[:1]])
    else:
        lemma = word
    lemma_list.append(lemma)

print(lemma_list)

### Step 7: Create n-grams from text 

In [None]:
from nltk import ngrams

# N-grams are runs of *adjacent* tokens, so they must be built from an ordered
# sequence; a set has no defined order. Walk the tokens in text order and keep
# those that survived the stop-word and punctuation filters.
ordered_tokens = [token for token in tokenized_list if token in set_wo_punct]
bigrams = ngrams(ordered_tokens, 2)
print(list(bigrams))

## Regular Expressions 

In [None]:
import re

### Search

The search method scans the whole input string for the pattern given in the raw string r"" and returns a match object if it finds one; otherwise it returns None. Search returns only the first match in the string.

In [None]:
# Report the first run of lowercase letters found in the sentence.
sent = "1947 was when India got their independence. "
print(" Occurances of a-z: ", re.compile(r"[a-z]+").search(sent))

Only the string "1947" is matched here, because our character set ranges only from 0 to 9 and search returns just the first match.

In [None]:
# Search the sentence defined in this cell (sent1) for the first run of digits.
sent1 = "1947 was when India became independent."
print(" Occurances of 0-9: ", re.search(r"[0-9]+", sent1))

Our desired set is namely a-z, A-Z, 0-9, '_' and a space character. 

In [None]:
# Report the first run made of word characters (\w) and spaces.
sent1 = "1947_was when India became independent."
print("Occurances of w and space: ", re.compile(r"[\w ]+").search(sent1))

# substitution

Sub is substitution of a substring with another string in the given input string. So understandably it takes three parameters. 

The first argument is the pattern to search for, the second argument is the replacement string, and the last argument is the input string.

In [None]:
# Replace every occurrence of "coffee" in the sentence with "tea".
sent = "I like coffee"
print(re.compile(r"coffee").sub("tea", sent))

## Findall

Findall parses our input string from left to right and returns all the substrings matching with our raw string as a list.

In [None]:

# re.findall returns the matches as a list, so its length is the number of
# occurrences of "coffee" in sent.
print(re.findall(r"coffee",sent))

print(len(re.findall(r"coffee",sent)))



# Chunking

In [None]:
from nltk import ne_chunk
# Download the NLTK resources requested by this cell.
nltk.download('words')
nltk.download('maxent_ne_chunker')
# Sample text used as input for chunking.
barack = """Barack Hussein Obama (born August 4, 1961) is an American politician 
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the first African American to assume the presidency 
and previously served as a United States Senator from Illinois (2005–2008)."""

# Pipeline: tokenize -> POS-tag -> chunk. ne_chunk receives the (word, tag) pairs.
token_barack = word_tokenize(barack)
pos_list = pos_tag(token_barack)
print(ne_chunk(pos_list))

The below code demonstrates the usage of RegexpParser which can give a more desirable result in comparison to the default NE Chunker. Here we need to configure how entities are determined, i.e. what kind of POS combinations results in a specific named entity.

In [None]:
from nltk import RegexpParser
# Chunk grammar: each rule names a chunk label and the POS-tag sequence that
# forms it (Place = NNP followed by one or more NNPS; Date = NNP CD , CD;
# Person = one or more NNP).
grammar = r"""Place: {<NNP><NNPS>+}
           Date: {<NNP><CD><,><CD>}
           Person: {<NNP>+}"""
regParser = RegexpParser(grammar)
# Parse the POS-tagged tokens of the barack text with the grammar above.
reg_lines = regParser.parse(pos_list)
print(reg_lines)

In [None]:
trump = """Donald John Trump (born June 14, 1946) is the 45th and current President of the United States.
Before entering politics, he was a businessman and television personality. 
Trump was born and raised in the New York City borough of Queens, and received an economics degree from the
 Wharton School of the University of Pennsylvania. 
He took charge of his family's real estate business in 1971, renamed it The Trump Organization, and expanded 
it from Queens and Brooklyn into Manhattan. 
The company built or renovated skyscrapers, hotels, casinos, and golf courses. 
Trump later started various side ventures, including licensing his name for real estate and consumer products.
He managed the company until his 2017 inauguration. 
He co-authored several books, including The Art of the Deal. He owned the Miss Universe and Miss USA beauty 
pageants from 1996 to 2015, and he produced and hosted the reality television show The Apprentice from 2003 to 2015.
Forbes estimates his net worth to be $3.1 billion."""

# Tagging

In [None]:
# "race" appears once as a noun and once as a verb. The sentence has to be
# spelled correctly: the demonstration is about the tagger's handling of
# "race", and misspelled neighbours ("offcials", "roday") are words the tagger
# has presumably never seen, which muddies the result.
sent1 = " The race officials refused to permit the team to race today"
print(pos_tag(word_tokenize(sent1)))

Here, both occurrences of "race" are tagged as nouns, which is incorrect: the second one is a verb. This shows that the tagger can make mistakes.

## Default Tagger

In [None]:
from nltk.corpus import brown
nltk.download('brown')
# brown is a corpus

# Find the tag that occurs most often across the tagged words of the corpus

tags = [pair[1] for pair in brown.tagged_words()]
most_common_tag = nltk.FreqDist(tags).max()
print(most_common_tag)


In [None]:
# Use the most common tag as the tag handed to the default tagger,
# then tag the tokens of the barack text with it
from nltk import DefaultTagger
default_tag = DefaultTagger(most_common_tag)
def_tagged_barack = default_tag.tag(token_barack)
print(def_tagged_barack)

## Lookup taggers

An NgramTagger tags a word based on the word itself and the tags of the words that precede it (n-1 of them for an n-gram tagger).

In [None]:
# Build a tiny tagged training sentence by running pos_tag on its tokens.
sent1 = "the quick brown fox jumps over the lazy dog"
training = pos_tag(word_tokenize(sent1))
print(training)

In [None]:
# Train the n-gram tagger with n=2 on the single tagged training sentence
ngram_tagger = nltk.NgramTagger(n=2, train = [training])


In [None]:
# Tag a second sentence that reuses the training words in a different order.
sent2 = "the lazy dog was jumped over by the quick brown fox"
tag = ngram_tagger.tag(word_tokenize(sent2))
print(tag)

This looking up of occurrence of words in the sequence appearing in the training set can be considered as the context.

Therefore, we can now understand that a NgramTagger tags words that appear in context, and the context is defined by the window 'n' which is the number of tokens to consider together.

## Example of tagging pipeline

### <b> UnigramTagger --> RegexpTagger --> DefaultTagger. Note that --> indicates backoff.</b>

In [None]:
# Last resort: anything no other tagger recognises is tagged as a noun.
default_tag = DefaultTagger('NN')
# Patterns are tried in order. Each one is matched from the start of the
# token, so a pattern that must cover the whole token needs its own "$".
patterns = [
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*es$', 'VBZ'),
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # dot escaped: a literal decimal point
    (r'[Aa][Nn][Dd]$', 'CC'),  # end anchor: "and" itself, not "Andrew"
    (r'.*ed$', 'VBD'),
    (r',', ','),
    (r'.*ould$', 'MD'),
    (r'.*ing$', 'VBG'),
    (r'.*s$', 'NNS'),
]
# Backoff chain: UnigramTagger -> RegexpTagger -> DefaultTagger.
regexp_tag = nltk.RegexpTagger(patterns, backoff=default_tag)
unigram_tag = nltk.UnigramTagger(train=[pos_list], backoff=regexp_tag)
trump_tag = unigram_tag.tag(word_tokenize(trump))
print(trump_tag)


### In the above code, the UnigramTagger is first invoked to tag the tokens in the Trump article. Whichever words are tagged None by this UnigramTagger are then sent as backoff to the RegexpTagger. The RegexpTagger then tags the words based on the patterns rule it is fed. Any words that are still left untagged are then sent to the DefaultTagger as backoff. 

# Corpus

Let us now deal with a collection of documents.
A collection of documents is called a <b>Corpus</b>.

In [None]:
from nltk.corpus import brown 

# The Brown corpus is a tagged corpus: each word in each file of the corpus
# is associated with a POS tag

# Display the ids of all the files in the corpus
print(brown.fileids())

In [None]:
# Display the contents of file 'ck11' three ways:
# as paragraphs, as sentences and as individual words
print(brown.paras('ck11'))
print("----------------------------------------")
print(brown.sents('ck11'))
print("----------------------------------------")
print(brown.words('ck11'))


In [None]:
# Display the POS tag for each word in a specific file of the corpus
print(brown.tagged_words('ck11'))

In [None]:
nltk.download('conll2000')

In [None]:
from nltk.corpus import conll2000
print(conll2000.fileids())

In [None]:
print(conll2000.words('train.txt'))

In [None]:
print(conll2000.chunked_sents('train.txt'))

1. What are the different types of Chunking in NLP?
Group of words make up phrases and there are five major categories.
  - Noun Phrase (NP)
  - Verb phrase (VP)
  - Adjective phrase (ADJP)
  - Adverb phrase (ADVP)
  - Prepositional phrase (PP)

# Make our own corpora

In [None]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
# NOTE(review): hard-coded local directory; point this at the folder that
# holds your own text files before running.
path =  "E:/Natural Language Processing/text_doc/"
# ".*" selects every file found under path.
president_corpus = PlaintextCorpusReader(path, ".*")
print(president_corpus.fileids())


In [None]:
# Display the sentences in a specific file
print((president_corpus.sents('barack.txt')))

In [None]:
# Display the sentences of every file in the corpus (no file id given)
print(president_corpus.sents())

In [None]:
# Display the words in a specific file
print(president_corpus.words('barack.txt'))

In [None]:
print(president_corpus.words())

Note that the PlaintextCorpusReader automatically splits the text data into paragraphs, sentences and words using appropriate tokenizers.

# Vectorizing textual data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Both documents are defined here so the matrix is built from these two
# sentences and not from a value left over from an earlier cell.
sent1 = "India is a republic country. We are proud Indians."
sent2 = "The current Prime Minister of India is Shri. Narendra Modi."
count_vectorizer = CountVectorizer()
# DTM = document-term matrix: one row per document, one column per term
dtm = count_vectorizer.fit_transform([sent1, sent2])
print(type(dtm))
print(type(dtm.toarray()))
print(pd.DataFrame(data=dtm.toarray(), columns=count_vectorizer.get_feature_names_out()))

The similarity between the two documents can now be found using commonly used distance metrics like Euclidean distance or cosine distance.

The below code demonstrates the use of cosine distance. The smaller this value, the more similar the two documents.

In [None]:
from scipy.spatial.distance import cosine
# cosine only accepts 1D vectors, so flatten each single-row matrix first
print(cosine(dtm[0].toarray().ravel(), dtm[1].toarray().ravel()))

The value is very high, so the two documents are not very similar.

In [None]:
# We will use the TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Vectorize two short documents with TF-IDF weights and show the result as a
# table with one column per vocabulary term.
sent1 = "India is a republic country. We are proud Indians."
sent2 = "The current Prime Minister of India is Shri. Narendra Modi."
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform([sent1,sent2])
print(pd.DataFrame(data = tfidf_vectors.toarray(), columns = tfidf_vectorizer.get_feature_names_out()))

In [None]:
# Cosine distance between the two TF-IDF rows, each flattened to 1D first
from scipy.spatial.distance import cosine
print(cosine(tfidf_vectors[0].toarray().ravel(), tfidf_vectors[1].toarray().ravel()))

Since the value is high, the two documents are not similar.

In [None]:
#  Let us now determine the Tf-IDF vectors and similarities between the 3 documents
#  in our corpus (president_corpus)
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


path = "E:/Natural Language Processing/text_doc/"
president_corpus = PlaintextCorpusReader(path, ".*", encoding = "utf-8")

# input='filename' makes the vectorizer read each document from the path given.
tf_idf = TfidfVectorizer(input='filename')
files = [path + filename for filename in president_corpus.fileids()]
tf_idf_matrix = tf_idf.fit_transform(raw_documents = files)
print(president_corpus.fileids())
# Densify the sparse matrix once instead of once per document.
dense_rows = tf_idf_matrix.toarray()
# Rows follow the order of fileids() printed above.
# NOTE(review): the names below assume that order is barack, bush, trump - confirm.
# From here on barack and trump are TF-IDF vectors, no longer the raw texts.
barack = dense_rows[0]
bush = dense_rows[1]
trump = dense_rows[2]

In [None]:
# Calculate the cosine distance between each pair of document vectors
from scipy.spatial.distance import cosine
print("Distance between articles on barack and bush is: ",cosine(barack, bush))

print("Distance between articles on bush and trump is: ", cosine(bush, trump))

print("Distance between articles on barack and trump is: ", cosine(barack, trump))


The most related documents are bush and trump

In [None]:
from numpy import sqrt
def euclidean_distance(x,y):
    """Return the Euclidean distance between the sequences x and y.

    Elements are paired with zip, so any extra trailing elements of the
    longer input are ignored.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(x, y)]
    return sqrt(sum(squared_gaps))
print("Distance between articles on barack and trump is: ", euclidean_distance(barack, trump))


# Find about jaccard similarity

Jaccard Similarity is used to find similarities between sets. The Jaccard similarity measures similarity between finite sample sets and is defined as the cardinality of the intersection of sets divided by the cardinality of the union of the sample sets.
Suppose you want to find Jaccard similarity between two sets A and B, it is the ratio of the cardinality of A ∩ B and A ∪ B.
Jaccard Similarity J(A,B) = |A∩B|/|A∪B|

In [None]:
from math import *
def jaccard(x, y):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Args:
        x: Iterable of hashable items; duplicates are ignored.
        y: Iterable of hashable items; duplicates are ignored.

    Returns:
        A float between 0.0 (no common members) and 1.0 (same members).
        Two empty collections are treated as identical and give 1.0.
    """
    first, second = set(x), set(y)
    union_card = len(first | second)
    if union_card == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    return len(first & second) / float(union_card)

# jaccard returns a similarity (1 = same members), not a distance.
# NOTE(review): barack and trump hold TF-IDF weight vectors at this point, so
# this compares the sets of distinct weight values, not the sets of words in
# the two articles - confirm that is what is intended.
print("Jaccard similarity between articles on barack and trump is: ", jaccard(barack, trump))


If two datasets share the exact same members, their Jaccard Similarity Index will be 1. Conversely, if they have no members in common then their similarity will be 0.

# Wordlist

## detect unusual words in text

In [None]:
import nltk 
sent1 = """Just forced myself to eat a slice. I'm really not hungry tho. 
           Mark is getting worried. He knows I'm sick when I turn down pizza. Lol"""
sent2 = "I call you later, don't have nw. If urgnt, sms me."
sent3 = "Watching a telugu movie..wat abt u?"
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_vocab():
    """Return the lower-cased NLTK word list, built once and then reused."""
    return frozenset(w.lower() for w in nltk.corpus.words.words())


def find_unusual_words(text, english_vocab=None):
    """Return the sorted alphabetic words of text missing from a vocabulary.

    Args:
        text: Iterable of tokens. Tokens that are not purely alphabetic are
            skipped; the rest are lower-cased.
        english_vocab: Optional iterable of known lower-case words. When
            omitted, the NLTK word list is used (and cached, so repeated
            calls do not rebuild it).

    Returns:
        A sorted list of the lower-cased tokens not found in the vocabulary.
    """
    # Vocabulary of the text itself: alphabetic tokens, lower-cased
    text_vocab_set = {w.lower() for w in text if w.isalpha()}

    if english_vocab is None:
        english_vocab = _english_vocab()
    return sorted(text_vocab_set.difference(english_vocab))

print(find_unusual_words(nltk.wordpunct_tokenize(sent1)))
print(find_unusual_words(nltk.wordpunct_tokenize(sent2)))
print(find_unusual_words(nltk.wordpunct_tokenize(sent3)))

## Detect Possible Mistakes

If a word cannot be found in the word list, it is probably a spelling mistake.

The below code, compares the unusual words with known words and suggests possible words based on edit distance. Edit distance is the measure of how similar or dissimilar two words are. 

In [None]:
unusual_words_found = ['knows', 'lol', 'nw', 'sms', 'urgnt', 'abt']
from nltk.metrics import edit_distance
possible_suggestions = {}
english_vocab_set = set(w.lower() for w in nltk.corpus.words.words())
for unusual_word in unusual_words_found:
    # A known word is suggested when its edit distance to the unusual word is
    # below half the unusual word's length.
    max_dist = len(unusual_word) / 2

    for word in english_vocab_set:
        # The edit distance is never smaller than the difference in length,
        # so such candidates cannot qualify; skip them without running the
        # expensive distance computation.
        if abs(len(word) - len(unusual_word)) >= max_dist:
            continue

        if edit_distance(unusual_word, word) < max_dist:
            possible_suggestions.setdefault(unusual_word, []).append(word)

print(possible_suggestions["lol"])
                
        

# Detect the names of the people 

In [None]:
nltk.download('names')
def names_in_text(text, known_names=None):
    """Return the distinct alphabetic tokens of text that are known names.

    Args:
        text: Iterable of tokens. Tokens that are not purely alphabetic are
            skipped. Matching is case-sensitive.
        known_names: Optional collection of names to match against. When
            omitted, the NLTK names corpus ('male.txt' and 'female.txt') is
            used.

    Returns:
        A list of the matching tokens, each once, in order of first
        appearance in text.
    """
    if known_names is None:
        # Build one set so each membership test is a hash lookup instead of
        # a scan over the whole name list.
        known_names = set(nltk.corpus.names.words('male.txt'))
        known_names.update(nltk.corpus.names.words('female.txt'))

    # dict.fromkeys removes duplicates while keeping first-appearance order.
    candidates = dict.fromkeys(w for w in text if w.isalpha())
    return [w for w in candidates if w in known_names]

sent1 = "John and Mary go to the church every Sunday"
sent2 = "No man has ever seen the dark side of the Moon"
print(names_in_text(word_tokenize(sent1)))
print(names_in_text(word_tokenize(sent2)))

# Wordnet

In [None]:
from nltk.corpus import wordnet as wn

# Get all the synsets (possible meanings) of the word "dog"
print(wn.synsets("dog"))

In [None]:
# To get the all lemma names of "dog"