In [1]:
import re
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy corpus: three short sentences separated only by '.' (no spaces after
# the periods).  Adjacent string literals are concatenated by Python, so the
# value is a single string.
sentences = (
    "The food is good."
    "The food is bad."
    "Pizza is amazing."
)

In [3]:
# WordNetLemmatizer loads the WordNet corpus lazily and raises LookupError on
# the first lemmatize() call when that corpus is not installed.  Fetch it here
# so the notebook survives Restart & Run All on a fresh environment; the call
# does nothing further when the corpus is already present and up to date.
nltk.download("wordnet", quiet=True)
lemmatizer = WordNetLemmatizer()

In [4]:
# Split the corpus on '.', normalise each fragment, and lemmatize its words.
# The result, `lemmatizer_sentences`, is a list of cleaned sentences, each a
# single space-joined string of lowercase lemmas.
sentences_list = sentences.split('.')
lemmatizer_sentences = []
for raw_sentence in sentences_list:
    # Replace every non-letter character with a space, then lowercase.
    cleaned = re.sub(r'[^A-Za-z]', ' ', raw_sentence).lower()
    tokens = cleaned.split()
    # Test the token list, not the cleaned string: a fragment made only of
    # digits/punctuation/whitespace becomes a run of spaces, which is truthy
    # as a string and would otherwise be appended as an empty sentence ''.
    if not tokens:
        continue
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatizer_sentences.append(' '.join(lemmas))
print(lemmatizer_sentences)



['the food is good', 'the food is bad', 'pizza is amazing']


In [5]:
# Flatten the cleaned sentences into one list of words, keeping order and
# duplicates: join everything with single spaces, then split on whitespace.
all_words = ' '.join(lemmatizer_sentences).split()
print(all_words)

['the', 'food', 'is', 'good', 'the', 'food', 'is', 'bad', 'pizza', 'is', 'amazing']


In [7]:
# Wrap each word in its own one-element list so the data is 2-D
# (one row per word occurrence, one column) before it is passed to the encoder.
all_words_reshaped = [
    [token]
    for token in all_words
]
print(all_words_reshaped)

[['the'], ['food'], ['is'], ['good'], ['the'], ['food'], ['is'], ['bad'], ['pizza'], ['is'], ['amazing']]


In [8]:
# Initialize the OneHotEncoder (it is fitted in the next cell).
# sparse_output=False asks the encoder to return a dense array rather than a
# sparse matrix, which keeps the printed encodings readable.
encoder= OneHotEncoder(sparse_output=False)


In [9]:
# Learn the categories from the word column and encode every word occurrence
# in a single step; row i of the result is the encoding of all_words[i].
one_hot_encoder_words=encoder.fit_transform(all_words_reshaped)

In [10]:
# Get the vocabulary.  categories_ holds one array per input column; the
# input here has a single column (each row is [word]), hence index 0.
encoded_vocab = encoder.categories_[0]
print("\nOneHotEncoder Vocabulary:", encoded_vocab)



OneHotEncoder Vocabulary: ['amazing' 'bad' 'food' 'good' 'is' 'pizza' 'the']


In [11]:
# Pair each word occurrence with its encoded row.  Repeated words simply
# overwrite their earlier entry, so the dict ends up with one entry per
# distinct word, ordered by first appearance in the corpus.
word_to_one_hot = dict(zip(all_words, one_hot_encoder_words))
vocabulary = encoder.get_feature_names_out()
print(vocabulary)

# Show every distinct word next to its vector, converted from a NumPy row to
# a plain Python list for readability.
print("\nOne-Hot Encoded Words (Expanded):")
for token, row in word_to_one_hot.items():
    print(f"Word: '{token}' -> One-Hot: {row.tolist()}")

['x0_amazing' 'x0_bad' 'x0_food' 'x0_good' 'x0_is' 'x0_pizza' 'x0_the']

One-Hot Encoded Words (Expanded):
Word: 'the' -> One-Hot: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
Word: 'food' -> One-Hot: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
Word: 'is' -> One-Hot: [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Word: 'good' -> One-Hot: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Word: 'bad' -> One-Hot: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Word: 'pizza' -> One-Hot: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
Word: 'amazing' -> One-Hot: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
