In [10]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

In [11]:
# Fetch every NLTK resource this notebook relies on, not just the tokenizer
# model: the stop-word list, WordNet (for the lemmatizer) and the perceptron
# POS tagger are used in later cells and raise LookupError when missing.
# nltk.download() is a no-op for packages that are already up to date.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead; add them here if a LookupError
# names them.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Sample text used by the preprocessing cells: a single Reuters news sentence.
doc = """
AHMEDABAD, India, Jan 12 (Reuters) - German Chancellor Friedrich Merz floated on Monday the possibility that the European Union and India could sign a landmark free trade agreement by the end of January, a move that could reshape global trade ties as protectionism rises and U.S.-India talks remain stalled.
"""

# Baseline size: number of whitespace-separated chunks (punctuation still attached).
whitespace_chunks = doc.split()
print(len(whitespace_chunks))

49


In [13]:
from nltk.tokenize import word_tokenize
from string import punctuation

# Word-level tokenisation; preserve_line=True keeps the text as one line
# instead of sentence-splitting it first.
token = word_tokenize(doc, language='english', preserve_line=True)

# `punctuation` is a plain str, so `tok not in punctuation` would be a
# substring test: multi-character punctuation tokens such as '--' or '...'
# would be kept, while tokens like '()' would be dropped by coincidence.
# Instead, drop a token only when every character in it is punctuation.
punctuation_chars = set(punctuation)
token = [
    tok for tok in token
    if not all(ch in punctuation_chars for ch in tok)
]

print(len(token))

48


In [14]:
from nltk.corpus import stopwords

# Requires the NLTK 'stopwords' corpus to be installed locally.
# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension would reload the whole list for every token, and a set gives
# constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# The NLTK English list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words (e.g. a sentence-initial "The") slip through.
token = [tok for tok in token if tok.lower() not in english_stopwords]
print(len(token))

35


In [15]:
# Inspect the tokens that remain after punctuation and stop-word removal.
token

['AHMEDABAD',
 'India',
 'Jan',
 '12',
 'Reuters',
 'German',
 'Chancellor',
 'Friedrich',
 'Merz',
 'floated',
 'Monday',
 'possibility',
 'European',
 'Union',
 'India',
 'could',
 'sign',
 'landmark',
 'free',
 'trade',
 'agreement',
 'end',
 'January',
 'move',
 'could',
 'reshape',
 'global',
 'trade',
 'ties',
 'protectionism',
 'rises',
 'U.S.-India',
 'talks',
 'remain',
 'stalled']

In [16]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance, reused by the stemming examples that follow.
ps = PorterStemmer()

In [17]:
# Stem a few example words; the results are truncated stems, not necessarily
# dictionary words (e.g. 'university' -> 'univers', 'his' -> 'hi').
for example_word in ('university', 'his', 'boys', 'species'):
    print(ps.stem(example_word))

univers
hi
boy
speci


In [18]:
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the WordNet POS constant the lemmatizer expects.

    Only the tag's leading letter is examined.

    Args:
        treebank_tag: POS tag string as produced by ``pos_tag``
            (e.g. ``'JJ'``, ``'VBD'``, ``'NNP'``, ``'RB'``).

    Returns:
        ``wordnet.ADJ`` for tags starting with ``J``, ``wordnet.VERB`` for
        ``V``, ``wordnet.NOUN`` for ``N``, ``wordnet.ADV`` for ``R``, and
        ``wordnet.NOUN`` for every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos

    # Unrecognised tags (numbers, determiners, ...) are treated as nouns.
    return wordnet.NOUN

def lemmatize_tokens(tokens):
    """Lemmatize a list of word tokens using POS-aware WordNet lookups.

    The whole list is tagged in a single ``pos_tag`` call, each tag is
    converted with ``get_wordnet_pos``, and the word is lemmatized under
    that part of speech.

    Args:
        tokens: List of word strings to lemmatize.

    Returns:
        A new list with one lemma per ``(word, tag)`` pair returned by
        ``pos_tag``, in the same order.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []

    # Tag the complete list at once rather than word by word.
    for word, tag in pos_tag(tokens):
        wn_pos = get_wordnet_pos(tag)
        lemmas.append(wn_lemmatizer.lemmatize(word, wn_pos))

    return lemmas

# Lemmatize the filtered tokens and display the result.
# NOTE: this rebinds `token` to its own lemmatized version, so re-running the
# cell feeds already-lemmatized tokens back in instead of the original ones.
token = lemmatize_tokens(token)
token

['AHMEDABAD',
 'India',
 'Jan',
 '12',
 'Reuters',
 'German',
 'Chancellor',
 'Friedrich',
 'Merz',
 'float',
 'Monday',
 'possibility',
 'European',
 'Union',
 'India',
 'could',
 'sign',
 'landmark',
 'free',
 'trade',
 'agreement',
 'end',
 'January',
 'move',
 'could',
 'reshape',
 'global',
 'trade',
 'tie',
 'protectionism',
 'rise',
 'U.S.-India',
 'talk',
 'remain',
 'stalled']

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: four short review sentences.
d1 = "The flavors in each dish were perfectly balanced, and the ingredients tasted incredibly fresh."
d2 = "Bad plot, bad dialogue, bad acting, and an idiotic director; I was very disappointed."
d3 = "The speakers produce okay sound quality for the price, but the output is not great."
d4 = "This app is a total game-changer for my daily routine, and I highly recommend it!"
corpus = [d1, d2, d3, d4]

# One label per document, aligned with `corpus`
# (presumably 1 = positive, 0 = negative sentiment; not used in this cell).
y = [1, 0, 0, 1]

# Bag-of-words counts with the vectorizer's built-in English stop-word list;
# fit_transform learns the vocabulary and returns the document-term matrix.
cv = CountVectorizer(stop_words='english')
data = cv.fit_transform(corpus)

In [25]:
# Vocabulary learned by the CountVectorizer; the same names are used as the
# column labels of the count table built from `data`.
cv.get_feature_names_out()

array(['acting', 'app', 'bad', 'balanced', 'changer', 'daily', 'dialogue',
       'director', 'disappointed', 'dish', 'flavors', 'fresh', 'game',
       'great', 'highly', 'idiotic', 'incredibly', 'ingredients', 'okay',
       'output', 'perfectly', 'plot', 'price', 'produce', 'quality',
       'recommend', 'routine', 'sound', 'speakers', 'tasted', 'total'],
      dtype=object)

In [27]:
import pandas as pd

# Dense document-term count table: one row per document in `corpus`, one
# column per vocabulary term.
# NOTE(review): `data` is rebound to the TF-IDF matrix in a later cell, so
# re-run the CountVectorizer cell before re-running this one.
pd.DataFrame(data.toarray(), columns = cv.get_feature_names_out())

Unnamed: 0,acting,app,bad,balanced,changer,daily,dialogue,director,disappointed,dish,...,plot,price,produce,quality,recommend,routine,sound,speakers,tasted,total
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,0,3,0,0,0,1,1,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same four-review toy corpus as in the CountVectorizer cell, redefined here
# so the cell is self-contained.
d1 = "The flavors in each dish were perfectly balanced, and the ingredients tasted incredibly fresh."
d2 = "Bad plot, bad dialogue, bad acting, and an idiotic director; I was very disappointed."
d3 = "The speakers produce okay sound quality for the price, but the output is not great."
d4 = "This app is a total game-changer for my daily routine, and I highly recommend it!"
corpus = [d1, d2, d3, d4]

# One label per document, aligned with `corpus`
# (presumably 1 = positive, 0 = negative sentiment; not used in this cell).
y = [1, 0, 0, 1]

# TF-IDF weighting with the vectorizer's built-in English stop-word list.
# NOTE: this rebinds `data`, which previously held the CountVectorizer matrix.
tfidf = TfidfVectorizer(stop_words='english')
data = tfidf.fit_transform(corpus)

In [29]:
import pandas as pd

# Dense TF-IDF table: one row per document in `corpus`, one column per
# vocabulary term learned by `tfidf`.
pd.DataFrame(data.toarray(), columns = tfidf.get_feature_names_out())

Unnamed: 0,acting,app,bad,balanced,changer,daily,dialogue,director,disappointed,dish,...,plot,price,produce,quality,recommend,routine,sound,speakers,tasted,total
0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.353553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0
1,0.258199,0.0,0.774597,0.0,0.0,0.0,0.258199,0.258199,0.258199,0.0,...,0.258199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.353553,0.353553,0.353553,0.0,0.0,0.353553,0.353553,0.0,0.0
3,0.0,0.353553,0.0,0.0,0.353553,0.353553,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.353553,0.353553,0.0,0.0,0.0,0.353553
