In [1]:
users = [
    {"id": 1, "name": "Raj", "review": "Airtel network is amazing. I never face call drops in my area."},
    {"id": 2, "name": "Priya", "review": "Internet speed is slow sometimes, but customer support is helpful."},
    {"id": 3, "name": "Ankit", "review": "Very satisfied with Airtel recharge offers and plans."},
    {"id": 4, "name": "Sneha", "review": "The app often crashes, making it hard to check balance."},
    {"id": 5, "name": "Vikram", "review": "Airtel service is okay, but the billing is confusing at times."},
    {"id": 6, "name": "Neha", "review": "I love the network coverage, but sometimes calls get disconnected."},
]


In [2]:
# Create the corpus
corpus = [user['review'] for user in users]
print("Corpus:\n", corpus)


Corpus:
 ['Airtel network is amazing. I never face call drops in my area.', 'Internet speed is slow sometimes, but customer support is helpful.', 'Very satisfied with Airtel recharge offers and plans.', 'The app often crashes, making it hard to check balance.', 'Airtel service is okay, but the billing is confusing at times.', 'I love the network coverage, but sometimes calls get disconnected.']


In [3]:
# Convert all text to lowercase and split words
all_words = [word.lower() for review in corpus for word in review.split()]
print("all_words\n", all_words)
print("\nNumber of words:", len(all_words))

print()
# Remove duplicates
vocab = list(set(all_words))
print("\nVocabulary:\n", vocab)
print("\nNumber of unique words:", len(vocab))


all_words
 ['airtel', 'network', 'is', 'amazing.', 'i', 'never', 'face', 'call', 'drops', 'in', 'my', 'area.', 'internet', 'speed', 'is', 'slow', 'sometimes,', 'but', 'customer', 'support', 'is', 'helpful.', 'very', 'satisfied', 'with', 'airtel', 'recharge', 'offers', 'and', 'plans.', 'the', 'app', 'often', 'crashes,', 'making', 'it', 'hard', 'to', 'check', 'balance.', 'airtel', 'service', 'is', 'okay,', 'but', 'the', 'billing', 'is', 'confusing', 'at', 'times.', 'i', 'love', 'the', 'network', 'coverage,', 'but', 'sometimes', 'calls', 'get', 'disconnected.']

Number of words: 61


Vocabulary:
 ['never', 'drops', 'hard', 'coverage,', 'in', 'sometimes', 'okay,', 'get', 'billing', 'face', 'my', 'customer', 'making', 'airtel', 'but', 'sometimes,', 'recharge', 'calls', 'times.', 'network', 'satisfied', 'offers', 'area.', 'at', 'the', 'often', 'speed', 'check', 'i', 'service', 'support', 'love', 'disconnected.', 'balance.', 'amazing.', 'plans.', 'slow', 'is', 'and', 'crashes,', 'call', 'it',

In [4]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sentence tokenization
sent_tokens = [sent_tokenize(review) for review in corpus]
print("\nSentence Tokens:\n", sent_tokens)

# Word tokenization
word_tokens = [word_tokenize(review.lower()) for review in corpus]
print("\nWord Tokens:\n", word_tokens)



Sentence Tokens:
 [['Airtel network is amazing.', 'I never face call drops in my area.'], ['Internet speed is slow sometimes, but customer support is helpful.'], ['Very satisfied with Airtel recharge offers and plans.'], ['The app often crashes, making it hard to check balance.'], ['Airtel service is okay, but the billing is confusing at times.'], ['I love the network coverage, but sometimes calls get disconnected.']]

Word Tokens:
 [['airtel', 'network', 'is', 'amazing', '.', 'i', 'never', 'face', 'call', 'drops', 'in', 'my', 'area', '.'], ['internet', 'speed', 'is', 'slow', 'sometimes', ',', 'but', 'customer', 'support', 'is', 'helpful', '.'], ['very', 'satisfied', 'with', 'airtel', 'recharge', 'offers', 'and', 'plans', '.'], ['the', 'app', 'often', 'crashes', ',', 'making', 'it', 'hard', 'to', 'check', 'balance', '.'], ['airtel', 'service', 'is', 'okay', ',', 'but', 'the', 'billing', 'is', 'confusing', 'at', 'times', '.'], ['i', 'love', 'the', 'network', 'coverage', ',', 'but', 'so

In [9]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\default.DESKTOP-
[nltk_data]     GAN0M7C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\default.DESKTOP-
[nltk_data]     GAN0M7C\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Remove stopwords and non-alphabetic tokens
filtered_words = [
    [word for word in tokens if word.isalpha() and word not in stop_words]
    for tokens in word_tokens
]
print("\nFiltered Words (Stopwords removed):\n", filtered_words)



Filtered Words (Stopwords removed):
 [['airtel', 'network', 'amazing', 'never', 'face', 'call', 'drops', 'area'], ['internet', 'speed', 'slow', 'sometimes', 'customer', 'support', 'helpful'], ['satisfied', 'airtel', 'recharge', 'offers', 'plans'], ['app', 'often', 'crashes', 'making', 'hard', 'check', 'balance'], ['airtel', 'service', 'okay', 'billing', 'confusing', 'times'], ['love', 'network', 'coverage', 'sometimes', 'calls', 'get', 'disconnected']]


In [6]:
from nltk.stem import PorterStemmer


stemmer = PorterStemmer()
stemmed_words = [[stemmer.stem(word) for word in tokens] for tokens in filtered_words]
print("\nStemmed Words:\n", stemmed_words)



Stemmed Words:
 [['airtel', 'network', 'amaz', 'never', 'face', 'call', 'drop', 'area'], ['internet', 'speed', 'slow', 'sometim', 'custom', 'support', 'help'], ['satisfi', 'airtel', 'recharg', 'offer', 'plan'], ['app', 'often', 'crash', 'make', 'hard', 'check', 'balanc'], ['airtel', 'servic', 'okay', 'bill', 'confus', 'time'], ['love', 'network', 'coverag', 'sometim', 'call', 'get', 'disconnect']]


In [7]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_words = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in filtered_words]
print("\nLemmatized Words:\n", lemmatized_words)



Lemmatized Words:
 [['airtel', 'network', 'amazing', 'never', 'face', 'call', 'drop', 'area'], ['internet', 'speed', 'slow', 'sometimes', 'customer', 'support', 'helpful'], ['satisfied', 'airtel', 'recharge', 'offer', 'plan'], ['app', 'often', 'crash', 'making', 'hard', 'check', 'balance'], ['airtel', 'service', 'okay', 'billing', 'confusing', 'time'], ['love', 'network', 'coverage', 'sometimes', 'call', 'get', 'disconnected']]


In [8]:
pos_tags = [nltk.pos_tag(tokens) for tokens in filtered_words]
print("\nPOS Tags:\n", pos_tags)



POS Tags:
 [[('airtel', 'NN'), ('network', 'NN'), ('amazing', 'VBG'), ('never', 'RB'), ('face', 'VBP'), ('call', 'JJ'), ('drops', 'NNS'), ('area', 'NN')], [('internet', 'JJ'), ('speed', 'NN'), ('slow', 'JJ'), ('sometimes', 'RB'), ('customer', 'NN'), ('support', 'NN'), ('helpful', 'NN')], [('satisfied', 'JJ'), ('airtel', 'NN'), ('recharge', 'NN'), ('offers', 'NNS'), ('plans', 'NNS')], [('app', 'RB'), ('often', 'RB'), ('crashes', 'VBZ'), ('making', 'VBG'), ('hard', 'JJ'), ('check', 'NN'), ('balance', 'NN')], [('airtel', 'NN'), ('service', 'NN'), ('okay', 'IN'), ('billing', 'VBG'), ('confusing', 'VBG'), ('times', 'NNS')], [('love', 'NN'), ('network', 'NN'), ('coverage', 'NN'), ('sometimes', 'RB'), ('calls', 'VBZ'), ('get', 'NN'), ('disconnected', 'VBN')]]


In [9]:
nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\default.DESKTOP-
[nltk_data]     GAN0M7C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to C:\Users\default.DESKTOP-
[nltk_data]     GAN0M7C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [12]:
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\default.DESKTOP-
[nltk_data]     GAN0M7C\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker_tab.zip.


True

In [13]:
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

for review in corpus:
    tokens = word_tokenize(review)
    pos = pos_tag(tokens)
    ner = ne_chunk(pos)
    
    print("\nReview:", review)
    print("NER Tree:", ner)



Review: Airtel network is amazing. I never face call drops in my area.
NER Tree: (S
  (GPE Airtel/NNP)
  network/NN
  is/VBZ
  amazing/VBG
  ./.
  I/PRP
  never/RB
  face/VBP
  call/JJ
  drops/NNS
  in/IN
  my/PRP$
  area/NN
  ./.)

Review: Internet speed is slow sometimes, but customer support is helpful.
NER Tree: (S
  Internet/NNP
  speed/NN
  is/VBZ
  slow/JJ
  sometimes/RB
  ,/,
  but/CC
  customer/NN
  support/NN
  is/VBZ
  helpful/JJ
  ./.)

Review: Very satisfied with Airtel recharge offers and plans.
NER Tree: (S
  Very/RB
  satisfied/JJ
  with/IN
  (PERSON Airtel/NNP)
  recharge/NN
  offers/NNS
  and/CC
  plans/NNS
  ./.)

Review: The app often crashes, making it hard to check balance.
NER Tree: (S
  The/DT
  app/NN
  often/RB
  crashes/VBZ
  ,/,
  making/VBG
  it/PRP
  hard/JJ
  to/TO
  check/VB
  balance/NN
  ./.)

Review: Airtel service is okay, but the billing is confusing at times.
NER Tree: (S
  (GPE Airtel/NNP)
  service/NN
  is/VBZ
  okay/JJ
  ,/,
  but/CC
  the/DT
 

In [14]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# small vocabulary example
vocab = sorted(list(set(all_words)))
word_to_index = {word: i for i, word in enumerate(vocab)}

# Example: encode a sentence
sentence = word_tokens[0]
ohe_matrix = np.zeros((len(sentence), len(vocab)))

for i, word in enumerate(sentence):
    if word in word_to_index:
        ohe_matrix[i][word_to_index[word]] = 1

print("\nOHE matrix shape:", ohe_matrix.shape)



OHE matrix shape: (14, 49)


In [15]:
ohe_matrix

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)

print("\nBoW Vocabulary:", vectorizer.get_feature_names_out())
print("\nBoW Matrix:\n", bow.toarray())



BoW Vocabulary: ['airtel' 'amazing' 'and' 'app' 'area' 'at' 'balance' 'billing' 'but'
 'call' 'calls' 'check' 'confusing' 'coverage' 'crashes' 'customer'
 'disconnected' 'drops' 'face' 'get' 'hard' 'helpful' 'in' 'internet' 'is'
 'it' 'love' 'making' 'my' 'network' 'never' 'offers' 'often' 'okay'
 'plans' 'recharge' 'satisfied' 'service' 'slow' 'sometimes' 'speed'
 'support' 'the' 'times' 'to' 'very' 'with']

BoW Matrix:
 [[1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 2 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 1 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
  1 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 1 0 1 0 0]
 [1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0
  0 1 0 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
  0 

Each word becomes a vector with one “1”.

In [19]:
ngram_vectorizer = CountVectorizer(ngram_range=(2,2))  # bigrams
ngrams = ngram_vectorizer.fit_transform(corpus)

print("\nBigrams:", ngram_vectorizer.get_feature_names_out())
print("\nBigram Matrix:\n", ngrams.toarray())



Bigrams: ['airtel network' 'airtel recharge' 'airtel service' 'amazing never'
 'and plans' 'app often' 'at times' 'billing is' 'but customer'
 'but sometimes' 'but the' 'call drops' 'calls get' 'check balance'
 'confusing at' 'coverage but' 'crashes making' 'customer support'
 'drops in' 'face call' 'get disconnected' 'hard to' 'in my'
 'internet speed' 'is amazing' 'is confusing' 'is helpful' 'is okay'
 'is slow' 'it hard' 'love the' 'making it' 'my area' 'network coverage'
 'network is' 'never face' 'offers and' 'often crashes' 'okay but'
 'recharge offers' 'satisfied with' 'service is' 'slow sometimes'
 'sometimes but' 'sometimes calls' 'speed is' 'support is' 'the app'
 'the billing' 'the network' 'to check' 'very satisfied' 'with airtel']

Bigram Matrix:
 [[1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0]
 [0 1 

In [20]:
ngram_range=(1,3)  # uni + bi + trigrams


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus)

print("\nTF-IDF Features:", tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n", tfidf.toarray())



TF-IDF Features: ['airtel' 'amazing' 'and' 'app' 'area' 'at' 'balance' 'billing' 'but'
 'call' 'calls' 'check' 'confusing' 'coverage' 'crashes' 'customer'
 'disconnected' 'drops' 'face' 'get' 'hard' 'helpful' 'in' 'internet' 'is'
 'it' 'love' 'making' 'my' 'network' 'never' 'offers' 'often' 'okay'
 'plans' 'recharge' 'satisfied' 'service' 'slow' 'sometimes' 'speed'
 'support' 'the' 'times' 'to' 'very' 'with']

TF-IDF Matrix:
 [[0.22308279 0.32222849 0.         0.         0.32222849 0.
  0.         0.         0.         0.32222849 0.         0.
  0.         0.         0.         0.         0.         0.32222849
  0.32222849 0.         0.         0.         0.32222849 0.
  0.22308279 0.         0.         0.         0.32222849 0.26423197
  0.32222849 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.