In [5]:
corpus=['The smell of fresh-baked bread fills the air',
        'Learning a new language can be challenging but rewarding',
        'The scientist conducted experiments to test her hypothesis',
        'The book, which was published last year, became an instant bestseller',
        'The long wait for the bus frustrated commuters, leading to complaints from many of them',
        'The river wound its way down the canyon toward the sea'
        'The Eiffel Tower is a famous landmark in Paris, France',
        'In 1969, Neil Armstrong became the first person to walk on the moon']

preprocessing

In [6]:
#preprocessing
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Shruti
[nltk_data]     Nathavani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Shruti
[nltk_data]     Nathavani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#stopwords for english language
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
#make function to tokenize
def tokenize(doc_text):
    tokens=nltk.word_tokenize(doc_text)
    return tokens


In [15]:
def remove_stopwords(doc_text):
    cleaned_text = []
    for words in doc_text:
        if words not in stopwords:
            cleaned_text.append(words)
    return cleaned_text

In [24]:
#have the stemming process 
def stemming_process(doc_tokens):
    stemmer=nltk.stem.PorterStemmer()
    stemmed_words=[]
    for word in doc_tokens:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words
        

In [25]:
text=corpus[0]    
text

'The smell of fresh-baked bread fills the air'

In [26]:
tokens=tokenize(text)
print(tokens)
cleaned_text=remove_stopwords(tokens)
print(cleaned_text)
stemmed_tokens=stemming_process(tokens)
print(stemmed_tokens)
new_text=" ".join(stemmed_tokens)
print(new_text)

['The', 'smell', 'of', 'fresh-baked', 'bread', 'fills', 'the', 'air']
['The', 'smell', 'fresh-baked', 'bread', 'fills', 'air']
['the', 'smell', 'of', 'fresh-bak', 'bread', 'fill', 'the', 'air']
the smell of fresh-bak bread fill the air


In [31]:
#preprocessing the corpus
cleaned_corpus = []
for docs in corpus:
    tokens=tokenize(docs)
    cleaned_tokens=remove_stopwords(tokens)
    stemmed_tokens=stemming_process(cleaned_tokens)
    new_text=" ".join(stemmed_tokens)
    cleaned_corpus.append(new_text)
print(cleaned_corpus)

['the smell fresh-bak bread fill air', 'learn new languag challeng reward', 'the scientist conduct experi test hypothesi', 'the book , publish last year , becam instant bestsel', 'the long wait bu frustrat commut , lead complaint mani', 'the river wound way canyon toward seath eiffel tower famou landmark pari , franc', 'in 1969 , neil armstrong becam first person walk moon']


Vectorise the Document

In [32]:
#tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
vectorizer.fit(cleaned_corpus)
doc_vectors=vectorizer.transform(cleaned_corpus)


In [33]:
doc_vectors.shape

(7, 52)

In [34]:
doc_vectors

<7x52 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [35]:
vectorizer.get_feature_names_out()

array(['1969', 'air', 'armstrong', 'bak', 'becam', 'bestsel', 'book',
       'bread', 'bu', 'canyon', 'challeng', 'commut', 'complaint',
       'conduct', 'eiffel', 'experi', 'famou', 'fill', 'first', 'franc',
       'fresh', 'frustrat', 'hypothesi', 'in', 'instant', 'landmark',
       'languag', 'last', 'lead', 'learn', 'long', 'mani', 'moon', 'neil',
       'new', 'pari', 'person', 'publish', 'reward', 'river', 'scientist',
       'seath', 'smell', 'test', 'the', 'toward', 'tower', 'wait', 'walk',
       'way', 'wound', 'year'], dtype=object)

In [36]:
import pandas as pd

In [37]:
vector_frame=pd.DataFrame(doc_vectors.toarray(),
                          columns=vectorizer.get_feature_names_out())

In [38]:
vector_frame

Unnamed: 0,1969,air,armstrong,bak,becam,bestsel,book,bread,bu,canyon,...,smell,test,the,toward,tower,wait,walk,way,wound,year
0,0.0,0.398689,0.0,0.398689,0.0,0.0,0.0,0.398689,0.0,0.0,...,0.398689,0.0,0.215139,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.434734,0.234589,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.314187,0.378499,0.378499,0.0,0.0,0.0,...,0.0,0.0,0.204244,0.0,0.0,0.0,0.0,0.0,0.0,0.378499
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34729,0.0,...,0.0,0.0,0.187403,0.0,0.0,0.34729,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285235,...,0.0,0.0,0.153917,0.285235,0.285235,0.0,0.0,0.285235,0.285235,0.0
6,0.339245,0.0,0.339245,0.0,0.281603,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.339245,0.0,0.0,0.0


Cosine Similarity with Query

In [46]:
query="Neil Armstrong"
tokens=tokenize(query)
cleaned_tokens=remove_stopwords(tokens)
stemmed_tokens=stemming_process(cleaned_tokens)
cleaned_query=" ".join(stemmed_tokens)
query_vector=vectorizer.transform([cleaned_query])

In [47]:
query_vector

<1x52 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [48]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_doc_query=cosine_similarity(doc_vectors,query_vector).flatten()

In [51]:
cosine_doc_query

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.47976552])

In [52]:
#recommand top-k documents for a given query
document_index=cosine_doc_query.argsort()[:-10:-1]
document_index

array([6, 5, 4, 3, 2, 1, 0], dtype=int64)

In [53]:
for docs_recommend in document_index:
    print(corpus[docs_recommend])

In 1969, Neil Armstrong became the first person to walk on the moon
The river wound its way down the canyon toward the seaThe Eiffel Tower is a famous landmark in Paris, France
The long wait for the bus frustrated commuters, leading to complaints from many of them
The book, which was published last year, became an instant bestseller
The scientist conducted experiments to test her hypothesis
Learning a new language can be challenging but rewarding
The smell of fresh-baked bread fills the air
