In [1]:
import textblob

In [2]:
from textblob import TextBlob

In [6]:
blob = TextBlob("John is learning natural language processing")

In [7]:
for np in blob.noun_phrases:
    print(np)

john
natural language processing


# SIMILARITY IN THE TEXT

In [8]:
documents = (
"I like NLP",
"I am exploring NLP",
"I am a beginner in NLP",
"I want to learn NLP",
"I like advanced NLP"
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

In [12]:
tfidf_matrix.shape

(5, 10)

In [15]:
cosine_similarity(tfidf_matrix[0:3], tfidf_matrix)

array([[1.        , 0.17682765, 0.14284054, 0.13489366, 0.68374784],
       [0.17682765, 1.        , 0.37765328, 0.09223325, 0.12090552],
       [0.14284054, 0.37765328, 1.        , 0.07450559, 0.09766691]])

# PART OF SPEECH 

In [16]:
Text = "I love NLP and I will learn NLP in 2 month"

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = stopwords.subdirwords('english')

In [19]:
tokens = sent_tokenize(Text)

In [20]:
tokens

['I love NLP and I will learn NLP in 2 month']

In [21]:
for i in tokens:
    words = nltk.word_tokenize(i)
    words = [w for w in words if not w in stop_words]
    tags = nltk.pos_tag(words)

In [22]:
tags

[('I', 'PRP'),
 ('love', 'VBP'),
 ('NLP', 'NNP'),
 ('I', 'PRP'),
 ('learn', 'VBP'),
 ('NLP', 'RB'),
 ('2', 'CD'),
 ('month', 'NN')]

# EXTRACTING ENTITIES FROM THE TEXT

In [23]:
sent = "John is studying at Stanford University in California"

In [24]:
from nltk import ne_chunk, word_tokenize

In [25]:
ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary= False)

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('PERSON', [('John', 'NNP')]), ('is', 'VBZ'), ('studying', 'VBG'), ('at', 'IN'), Tree('ORGANIZATION', [('Stanford', 'NNP'), ('University', 'NNP')]), ('in', 'IN'), Tree('GPE', [('California', 'NNP')])])

using spacy

In [28]:
import spacy

In [35]:
nlp = spacy.load('en_core_web_sm')

In [36]:
doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')

In [38]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char,ent.label_)

Apple 0 5 ORG
10000 42 47 MONEY
New york 51 59 GPE


# Extracting topics from text

In [54]:
doc1 = "I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning"

In [55]:
doc2 = "My father is a data scientist and he is nlp expert"
doc3 = "My sister has good exposure into android development"

In [56]:
doc_complete = [doc1, doc2, doc3]
doc_complete

['I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning',
 'My father is a data scientist and he is nlp expert',
 'My sister has good exposure into android development']

cleaning and processing

In [57]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [58]:
stop = stopwords.words('english')

In [59]:
exclude = set(string.punctuation)

In [60]:
lemma = WordNetLemmatizer()

In [78]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized

In [79]:
doc_clean = [clean(doc).split() for doc in doc_complete]

In [80]:
doc_clean

[['learning',
  'nlp',
  'interesting',
  'exciting',
  'includes',
  'machine',
  'learning',
  'deep',
  'learning'],
 ['father', 'data', 'scientist', 'nlp', 'expert'],
 ['sister', 'good', 'exposure', 'android', 'development']]

Preparing document term matrix

In [85]:
import gensim
from gensim import corpora

In [82]:
dictionary = corpora.Dictionary(doc_clean)

In [83]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]

In [86]:
Lda = gensim.models.ldamodel.LdaModel

In [87]:
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word =dictionary, passes=50)

In [88]:
print(ldamodel.print_topics())

[(0, '0.233*"learning" + 0.093*"deep" + 0.093*"includes" + 0.093*"interesting" + 0.093*"machine" + 0.093*"exciting" + 0.093*"nlp" + 0.023*"father" + 0.023*"scientist" + 0.023*"data"'), (1, '0.129*"nlp" + 0.129*"father" + 0.129*"data" + 0.129*"scientist" + 0.129*"expert" + 0.032*"exposure" + 0.032*"android" + 0.032*"good" + 0.032*"development" + 0.032*"sister"'), (2, '0.129*"sister" + 0.129*"good" + 0.129*"exposure" + 0.129*"development" + 0.129*"android" + 0.032*"nlp" + 0.032*"father" + 0.032*"scientist" + 0.032*"data" + 0.032*"expert"')]
