In [1]:
#Prepare data for use in this exercise:
#read example.txt, tokenize it, and clean the tokens step by step.
#Later cells rely on token_list4 (cleaned tokens) and token_list6 (lemmatized tokens).

import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Download the NLTK resources used below
#(punkt: tokenizer models, stopwords: stop word lists, wordnet: lemmatizer data)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#Read the base file into a single string.
#The with-block closes the file even if read() raises, and the explicit
#encoding keeps the result independent of the platform default.
with open(os.path.join(os.getcwd(), "example.txt"), 'rt', encoding='utf-8') as base_file:
    raw_text = base_file.read()

#Execute the same pre-processing done in module 3

#Step 1: split the raw text into tokens
token_list = nltk.word_tokenize(raw_text)

#Step 2: drop the tokens that PunktToken classifies as punctuation
token_list2 = [token for token in token_list
               if nltk.tokenize.punkt.PunktToken(token).is_non_punct]

#Step 3: convert every token to lower case
token_list3 = [word.lower() for word in token_list2]

#Step 4: remove English stop words.
#The stop word list is loaded once into a set instead of calling
#stopwords.words() and scanning a list for every single token.
english_stopwords = set(stopwords.words('english'))
token_list4 = [token for token in token_list3 if token not in english_stopwords]

#Step 5: lemmatize the remaining tokens
lemmatizer = WordNetLemmatizer()
token_list6 = [lemmatizer.lemmatize(word) for word in token_list4]

print("\n Total Tokens : ",len(token_list6))

[nltk_data] Downloading package punkt to /Users/moo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/moo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/moo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



 Total Tokens :  155


## 1. Build ngrams

In [2]:

from nltk.util import ngrams
from collections import Counter

#Bigrams: slide a window of 2 tokens over the lemmatized tokens,
#count each distinct pair and show the 5 most frequent ones
bigrams = ngrams(token_list6, 2)
print("Most common bigrams : ")
bigram_counts = Counter(bigrams)
print(bigram_counts.most_common(5))

#Trigrams: same idea with a window of 3 tokens
trigrams = ngrams(token_list6, 3)
print(" \n Most common trigrams : " )
trigram_counts = Counter(trigrams)
print(trigram_counts.most_common(5))

Most common bigrams : 
[(('le', 'miserables'), 2), (('hate', 'job'), 2), (('abandon', 'hope'), 1), (('hope', 'ye'), 1), (('ye', 'enter'), 1)]
 
 Most common trigrams : 
[(('abandon', 'hope', 'ye'), 1), (('hope', 'ye', 'enter'), 1), (('ye', 'enter', 'scrawled'), 1), (('enter', 'scrawled', 'blood'), 1), (('scrawled', 'blood', 'red'), 1)]


## 2. Parts-of-Speech Tagging

Some examples of Parts-of-Speech abbreviations (Penn Treebank tag set):

- NN : noun, singular or mass
- NNS : noun, plural
- VBP : verb, non-3rd-person singular present

In [3]:
#Download the tagger package required by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')

#Tag the whole token list, then display only the first 10 (token, tag) pairs
#NOTE(review): token_list4 is lower-cased and has stop words removed, so the
#tagger sees the words without their original context - confirm whether the
#unprocessed token_list should be tagged instead.
tagged_tokens = nltk.pos_tag(token_list4)
tagged_tokens[:10]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/moo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('abandon', 'NNS'),
 ('hope', 'VBP'),
 ('ye', 'RB'),
 ('enter', 'RB'),
 ('scrawled', 'VBN'),
 ('blood', 'NN'),
 ('red', 'JJ'),
 ('lettering', 'JJ'),
 ('side', 'NN'),
 ('chemical', 'NN')]

## 3. TF-IDF Analysis

Term Frequency-Inverse Document Frequency (TF-IDF) can be used to identify important terms within documents.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Join the lemmatized tokens back into one string: the vectorizer expects
#a list of documents, and here the whole text is the only document.
#NOTE(review): with a single document every term gets the same IDF, so the
#ranking below reflects term frequency only - confirm this is intended.
single_document = " ".join(token_list6)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([single_document])
feature_names = vectorizer.get_feature_names_out()

#Map each vocabulary term to its weight in the flattened matrix
term_weights = tfidf_matrix.toarray().flatten()
tfidf_scores = {term: weight for term, weight in zip(feature_names, term_weights)}

#Order the terms from highest to lowest weight and show the first 10
sorted_scores = sorted(
    tfidf_scores.items(),
    key=lambda term_and_weight: term_and_weight[1],
    reverse=True,
)
print("Top TF-IDF Scores:", sorted_scores[:10])

Top TF-IDF Scores: [('price', 0.26490647141300877), ('word', 0.26490647141300877), ('bus', 0.19867985355975656), ('hate', 0.19867985355975656), ('side', 0.19867985355975656), ('american', 0.13245323570650439), ('another', 0.13245323570650439), ('cab', 0.13245323570650439), ('driver', 0.13245323570650439), ('enough', 0.13245323570650439)]


## 4. Building TF-IDF matrix

In [5]:
#Build a TF-IDF matrix for a toy corpus with scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#Three short documents, small enough to read the resulting matrix by eye
vector_corpus = [
    'NBA is a Basketball league',
    'Basketball is popular in America.',
    'TV in America telecast BasketBall.',
]

#Vectorizer configured to drop English stop words
vectorizer = TfidfVectorizer(stop_words='english')

#Learn the vocabulary from the corpus and compute the document-term matrix
tfidf = vectorizer.fit_transform(vector_corpus)

#The learned vocabulary: one entry per column of the matrix
corpus_features = vectorizer.get_feature_names_out()
print("Tokens used as features are : ", corpus_features, sep="\n")

#Matrix dimensions as (number of documents, number of features)
matrix_shape = tfidf.shape
print("\n Size of array. Each row represents a document. Each column represents a feature/token",
      matrix_shape, sep="\n")

#Convert to a dense array so the cell displays the actual weights
print("\n Actual TF-IDF array")
tfidf.toarray()


Tokens used as features are : 
['america' 'basketball' 'league' 'nba' 'popular' 'telecast' 'tv']

 Size of array. Each row represents a document. Each column represents a feature/token
(3, 7)

 Actual TF-IDF array


array([[0.        , 0.38537163, 0.65249088, 0.65249088, 0.        ,
        0.        , 0.        ],
       [0.54783215, 0.42544054, 0.        , 0.        , 0.72033345,
        0.        , 0.        ],
       [0.44451431, 0.34520502, 0.        , 0.        , 0.        ,
        0.5844829 , 0.5844829 ]])

## 5. Topic Modeling

Topic modeling is an unsupervised technique for identifying topics in a collection of documents.

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

#Fit a two-topic LDA model on the TF-IDF matrix built in the previous section.
#random_state is fixed so repeated runs use the same seed.
#NOTE(review): the model is fitted on TF-IDF weights; scikit-learn's own LDA
#examples use raw term counts - confirm TF-IDF input is intended.
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(tfidf)

#Look up the vocabulary once instead of on every loop iteration
topic_feature_names = vectorizer.get_feature_names_out()

#lda.components_ has one row per topic, with one weight per feature
for idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (idx+1))
    #argsort() sorts ascending, so the last 10 indices are the 10 largest
    #weights, printed from the smallest of them to the largest
    print([topic_feature_names[i] for i in topic.argsort()[-10:]])

Topic 1:
['telecast', 'tv', 'america', 'nba', 'league', 'popular', 'basketball']
Topic 2:
['league', 'nba', 'popular', 'basketball', 'america', 'tv', 'telecast']


## 6. Document Similarity

Check how similar two documents are using cosine similarity.

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

#Compare every row (document) of the TF-IDF matrix with every other row.
#Entry [i][j] of the result is the cosine similarity between documents i and j,
#so the diagonal compares each document with itself.
similarity = cosine_similarity(tfidf)
print(similarity)

[[1.         0.16395271 0.13303222]
 [0.16395271 1.         0.39038344]
 [0.13303222 0.39038344 1.        ]]
