In [35]:
# 1: Segmentation & Tokenization
import nltk
import spacy

# $ python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

document = '''I Love Coding.. Geeks for Geeks helped me in this regard very much. 
I Love Geeks for Geeks..'''

# Segmentation
doc = nlp(document)
for i in doc.sents:
    print(i)


# Tokenization
doc = nlp(document)
for sentence in doc.sents:
    words = [word.text for word in sentence]
    print(words)


I Love Coding..
Geeks for Geeks helped me in this regard very much. 

I Love Geeks for Geeks..
['I', 'Love', 'Coding', '..']
['Geeks', 'for', 'Geeks', 'helped', 'me', 'in', 'this', 'regard', 'very', 'much', '.', '\n']
['I', 'Love', 'Geeks', 'for', 'Geeks', '..']


In [10]:
# 2: Stemming & Lemmatization
import nltk

# nltk.download('punkt')
# nltk.download('wordnet')

words = ['eating', 'eats', 'eaten', 'eat' ]
words = ['studies', 'studying', 'cries', 'cry']
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

stemmed_words = [stemmer.stem(word) for word in words]
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print(stemmed_words)
print(lemmatized_words)


['studi', 'studi', 'cri', 'cri']
['study', 'studying', 'cry', 'cry']


In [23]:
# 3. NGram
import nltk
from nltk.util import ngrams, trigrams
text = "The flame that burns Twice as bright burns half as long"
words = nltk.word_tokenize(text)
trigrams = ngrams(words, 3)
for trigram in trigrams:
    print(trigram)

['The', 'flame', 'that', 'burns', 'Twice', 'as', 'bright', 'burns', 'half', 'as', 'long']
('The', 'flame', 'that')
('flame', 'that', 'burns')
('that', 'burns', 'Twice')
('burns', 'Twice', 'as')
('Twice', 'as', 'bright')
('as', 'bright', 'burns')
('bright', 'burns', 'half')
('burns', 'half', 'as')
('half', 'as', 'long')


In [25]:
# 4. POS Tagging
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
text = "Joe waited for the train, but the train was late"
words = nltk.word_tokenize(text)
hmm_tagged = nltk.pos_tag(words)
nn_tagged = nltk.pos_tag(words, tagset='universal')
print("PoS tagging using HMM:", hmm_tagged)
print("PoS tagging using NN:", nn_tagged)

PoS tagging using HMM: [('Joe', 'NNP'), ('waited', 'VBD'), ('for', 'IN'), ('the', 'DT'), ('train', 'NN'), (',', ','), ('but', 'CC'), ('the', 'DT'), ('train', 'NN'), ('was', 'VBD'), ('late', 'JJ')]
PoS tagging using NN: [('Joe', 'NOUN'), ('waited', 'VERB'), ('for', 'ADP'), ('the', 'DET'), ('train', 'NOUN'), (',', '.'), ('but', 'CONJ'), ('the', 'DET'), ('train', 'NOUN'), ('was', 'VERB'), ('late', 'ADJ')]


In [28]:
# 5. Syntactic Parsing
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('treebank')
text = "I ate hot ice-cream ,before match start"
words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(words)
syntactic_tree = nltk.ne_chunk(tagged_words, binary=True)
print("Syntactic tree:", syntactic_tree)

Syntactic tree: (S I/PRP ate/VBP hot/JJ ice-cream/NN ,/, before/IN match/JJ start/NN)


In [30]:
# 6. Dependency Parsing
import spacy
# $ python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
text = "John likes Mary because she is beautiful."
doc = nlp(text)
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,[child for child in token.children])

John nsubj likes VERB []
likes ROOT likes VERB [John, Mary, is, .]
Mary dobj likes VERB []
because mark is AUX []
she nsubj is AUX []
is advcl likes VERB [because, she, beautiful]
beautiful acomp is AUX []
. punct likes VERB []


In [60]:
# 7. Named Entity Recognition
import spacy
# $ python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
text = "Nitin is studying at Indian Institute of technology Bombay."
doc = nlp(text)
for entity in doc.ents:
    print(entity.label_, entity.text)

PERSON Nitin
ORG Indian Institute of technology
GPE Bombay


In [67]:
# 8. Text Summarization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from heapq import nlargest

# nltk.download('stopwords')

text = """
Natural language processing (NLP) is a branch of artificial intelligence that
focuses on the
interaction between computers and human language. NLP has been around for
several decades, but
recent advances in machine learning and deep learning have dramatically
improved its capabilities.
NLP is used in a wide range of applications, from virtual assistants like Siri and
Alexa to
sentiment analysis, machine translation, and even content generation. NLP
involves a range of
techniques, including tokenization, part-of-speech tagging, named entity
recognition, and
sentiment analysis, among others. These techniques can be used to analyze and
understand human
language in a variety of contexts, from social media posts to scientific literature.
Despite its many
successes, NLP remains a challenging field, as natural language is complex and
often ambiguous.
As NLP continues to evolve, it has the potential to transform the way we
interact with technology
and with each other, opening up new possibilities for communication,
collaboration, and creativity.
"""

num_sentences = 2
sentences = sent_tokenize(text)
words = word_tokenize(text.lower())  # Convert all words to lowercase
stop_words = set(stopwords.words('english'))
word_freq = {}
for word in words:
    if word not in stop_words:
        if word not in word_freq:
            word_freq[word] = 1
        else:
            word_freq[word] += 1

max_freq = max(word_freq.values())
        
for word in word_freq.keys():
    word_freq[word] = (word_freq[word] / max_freq)

sent_scores = {}
for sentence in sentences:
    for word in word_tokenize(sentence.lower()):
        if word in word_freq.keys():
            if len(sentence.split()) < 30:  # Use `split()` instead of `split(' ')`
                if sentence not in sent_scores.keys():
                    sent_scores[sentence] = word_freq[word]
                else:
                    sent_scores[sentence] += word_freq[word]
                    
summary_sentences = nlargest(num_sentences, sent_scores, key=sent_scores.get)
summary = ' '.join(summary_sentences)
print(summary)


NLP
involves a range of
techniques, including tokenization, part-of-speech tagging, named entity
recognition, and
sentiment analysis, among others. NLP is used in a wide range of applications, from virtual assistants like Siri and
Alexa to
sentiment analysis, machine translation, and even content generation.


[nltk_data] Downloading package stopwords to /home/me/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
