In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re

In [3]:
sample = """
Narendra Damodardas Modi[a] (born 17 September 1950) is an Indian politician who has served as the prime minister of India since 2014. Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindutva paramilitary volunteer organisation. He is the longest-serving prime minister outside the Indian National Congress.

Modi was born and raised in Vadnagar, Bombay State (present-day Gujarat), where he completed his secondary education. He was introduced to the RSS at the age of eight, becoming a full-time worker for the organisation in Gujarat in 1971. The RSS assigned him to the BJP in 1985, and he rose through the party hierarchy, becoming general secretary in 1998.[b] In 2001, Modi was appointed chief minister of Gujarat and elected to the legislative assembly soon after. His administration is considered complicit in the 2002 Gujarat riots[c] and has been criticised for its management of the crisis. According to official records, a little over 1,000 people were killed, three-quarters of whom were Muslim; independent sources estimated 2,000 deaths, mostly Muslim.[4] A Special Investigation Team appointed by the Supreme Court of India in 2012 found no evidence to initiate prosecution proceedings against him.[d] While his policies as chief minister were credited for encouraging economic growth, his administration was criticised for failing to significantly improve health, poverty and education indices in the state.[e]

In the 2014 Indian general election, Modi led the BJP to a parliamentary majority, the first for a party since 1984. His administration increased direct foreign investment and reduced spending on healthcare, education, and social-welfare programs. Modi began a high-profile sanitation campaign and weakened or abolished environmental and labour laws. His demonetisation of banknotes in 2016 and introduction of the Goods and Services Tax in 2017 sparked controversy. Modi's administration launched the 2019 Balakot airstrike against an alleged terrorist training camp in Pakistan; the airstrike failed,[5][6] but the action had nationalist appeal.[7] Modi's party won the 2019 general election which followed. In its second term, his administration revoked the special status of Jammu and Kashmir and introduced the Citizenship Amendment Act, prompting widespread protests and spurring the 2020 Delhi riots in which Muslims were brutalised and killed by Hindu mobs.[8][9][10] Three controversial farm laws led to sit-ins by farmers across the country, eventually causing their formal repeal. Modi oversaw India's response to the COVID-19 pandemic, during which, according to the World Health Organization, 4.7 million Indians died.[11][12] In the 2024 general election, Modi's party lost its majority in the lower house of Parliament and formed a government leading the National Democratic Alliance coalition. Following a terrorist attack in Indian-administered Jammu and Kashmir, Modi presided over the 2025 India–Pakistan conflict, which resulted in a ceasefire.
"""

In [4]:
# Split the raw text into a list of sentence strings with NLTK's sentence tokenizer.
# NOTE(review): the recorded output shows footnote markers that follow a full stop
# (e.g. "[b]") landing at the start of the *next* sentence, and the first sentence
# keeps the leading newline from the triple-quoted string.
sentences = sent_tokenize(sample)

In [5]:
len(sentences)

23

In [6]:
sentences

['\nNarendra Damodardas Modi[a] (born 17 September 1950) is an Indian politician who has served as the prime minister of India since 2014.',
 'Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi.',
 'He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindutva paramilitary volunteer organisation.',
 'He is the longest-serving prime minister outside the Indian National Congress.',
 'Modi was born and raised in Vadnagar, Bombay State (present-day Gujarat), where he completed his secondary education.',
 'He was introduced to the RSS at the age of eight, becoming a full-time worker for the organisation in Gujarat in 1971.',
 'The RSS assigned him to the BJP in 1985, and he rose through the party hierarchy, becoming general secretary in 1998.',
 '[b] In 2001, Modi was appointed chief minister of Gujarat and elected to the legislative assembly soon after.',
 'His administration is

In [7]:
# Turn every sentence into a cleaned string: letters only, lower-cased,
# English stop words removed, remaining words lemmatised, re-joined with spaces.
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. The original rebuilt
# set(stopwords.words('english')) for every single token, which is
# loop-invariant work.
stop_words = set(stopwords.words('english'))

corpus = []

for sentence in sentences:
    # Replace anything that is not an ASCII letter (digits, punctuation,
    # brackets) with a space so it acts as a token separator.
    # NOTE(review): single letters from footnote markers such as "[b]" / "[c]"
    # survive this step and show up as tokens in the corpus output.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(lemmas))
    

In [8]:
corpus

['narendra damodardas modi born september indian politician served prime minister india since',
 'modi chief minister gujarat member parliament mp varanasi',
 'member bharatiya janata party bjp rashtriya swayamsevak sangh rss right wing hindutva paramilitary volunteer organisation',
 'longest serving prime minister outside indian national congress',
 'modi born raised vadnagar bombay state present day gujarat completed secondary education',
 'introduced rss age eight becoming full time worker organisation gujarat',
 'rss assigned bjp rose party hierarchy becoming general secretary',
 'b modi appointed chief minister gujarat elected legislative assembly soon',
 'administration considered complicit gujarat riot c criticised management crisis',
 'according official record little people killed three quarter muslim independent source estimated death mostly muslim',
 'special investigation team appointed supreme court india found evidence initiate prosecution proceeding',
 'policy chief mini

## Count Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words counts restricted to word trigrams (ngram_range=(3, 3)).
cv = CountVectorizer(ngram_range=(3, 3))
# Fit on the cleaned sentences and densify: one row per entry in `corpus`,
# one column per trigram in the learned vocabulary.
X = cv.fit_transform(corpus).toarray()
# Display the learned mapping from trigram to column index.
cv.vocabulary_

{'narendra damodardas modi': 153,
 'damodardas modi born': 51,
 'modi born september': 144,
 'born september indian': 32,
 'september indian politician': 198,
 'indian politician served': 106,
 'politician served prime': 172,
 'served prime minister': 199,
 'prime minister india': 176,
 'minister india since': 138,
 'modi chief minister': 145,
 'chief minister gujarat': 38,
 'minister gujarat member': 137,
 'gujarat member parliament': 87,
 'member parliament mp': 133,
 'parliament mp varanasi': 164,
 'member bharatiya janata': 132,
 'bharatiya janata party': 26,
 'janata party bjp': 115,
 'party bjp rashtriya': 166,
 'bjp rashtriya swayamsevak': 28,
 'rashtriya swayamsevak sangh': 183,
 'swayamsevak sangh rss': 213,
 'sangh rss right': 195,
 'rss right wing': 194,
 'right wing hindutva': 188,
 'wing hindutva paramilitary': 226,
 'hindutva paramilitary volunteer': 94,
 'paramilitary volunteer organisation': 162,
 'longest serving prime': 127,
 'serving prime minister': 201,
 'prime min

In [11]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 1 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Term Frequency - Inverse Document Frequency (TF-IDF)

In [12]:
# TF-IDF vectorizer with default settings; the feature names shown in a later
# cell are single words.
tfidf = TfidfVectorizer()

In [13]:
# Fit TF-IDF on the cleaned sentences. `X` is rebound here (it previously held
# the CountVectorizer array) and is left un-densified: no .toarray() call.
X = tfidf.fit_transform(corpus)

In [14]:
# Show the vocabulary terms learned by the fitted TF-IDF vectorizer.
tfidf.get_feature_names_out()

array(['abolished', 'according', 'across', 'act', 'action',
       'administered', 'administration', 'age', 'airstrike', 'alleged',
       'alliance', 'amendment', 'appeal', 'appointed', 'assembly',
       'assigned', 'attack', 'balakot', 'banknote', 'becoming', 'began',
       'bharatiya', 'bjp', 'bombay', 'born', 'brutalised', 'camp',
       'campaign', 'causing', 'ceasefire', 'chief', 'citizenship',
       'coalition', 'completed', 'complicit', 'conflict', 'congress',
       'considered', 'controversial', 'controversy', 'country', 'court',
       'covid', 'credited', 'crisis', 'criticised', 'damodardas', 'day',
       'death', 'delhi', 'democratic', 'demonetisation', 'died', 'direct',
       'economic', 'education', 'eight', 'elected', 'election',
       'encouraging', 'environmental', 'estimated', 'eventually',
       'evidence', 'failed', 'failing', 'farm', 'farmer', 'first',
       'followed', 'following', 'foreign', 'formal', 'formed', 'found',
       'full', 'general', 'good', 

In [15]:
# Densify only the first sentence's TF-IDF row so its weights can be inspected.
X[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.29357975,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.33223493, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [16]:
# TF-IDF over word trigrams only, with the vocabulary capped at 100 features.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(3, 3))  # keep the 100 trigrams with the highest term frequency across the corpus
# Rebinds `X` to a dense array; the recorded output reports shape (23, 100).
X = tfidf.fit_transform(corpus).toarray()
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.57735027, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(23, 100))

In [17]:
# TF-IDF over word unigrams and bigrams (ngram_range=(1, 2)), vocabulary capped
# at 100 features.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
# Rebinds `X` to the dense TF-IDF matrix for this configuration.
X = tfidf.fit_transform(corpus).toarray()
# Display the retained terms and their column indices.
tfidf.vocabulary_

{'modi': np.int64(67),
 'born': np.int64(19),
 'indian': np.int64(55),
 'prime': np.int64(76),
 'minister': np.int64(65),
 'india': np.int64(54),
 'since': np.int64(80),
 'modi born': np.int64(68),
 'prime minister': np.int64(77),
 'chief': np.int64(25),
 'gujarat': np.int64(52),
 'member': np.int64(64),
 'parliament': np.int64(74),
 'varanasi': np.int64(99),
 'chief minister': np.int64(26),
 'minister gujarat': np.int64(66),
 'party': np.int64(75),
 'bjp': np.int64(18),
 'rss': np.int64(79),
 'organisation': np.int64(72),
 'national': np.int64(71),
 'vadnagar': np.int64(97),
 'state': np.int64(82),
 'completed': np.int64(30),
 'education': np.int64(47),
 'vadnagar bombay': np.int64(98),
 'completed secondary': np.int64(31),
 'introduced': np.int64(56),
 'becoming': np.int64(17),
 'time': np.int64(93),
 'time worker': np.int64(94),
 'general': np.int64(50),
 'appointed': np.int64(16),
 'administration': np.int64(12),
 'complicit': np.int64(32),
 'riot': np.int64(78),
 'criticised': np.