In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from autocorrect import Speller

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')  # Required for WordNet compatibility

# Initialize processing tools
regex_tokenizer = RegexpTokenizer(r'\w+')
stopword_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
speller = Speller()

def create_vocab(sentences):
    """
    Preprocess the sentences and construct a unique vocabulary:
    1. Convert text to lowercase.
    2. Remove punctuation.
    3. Apply lemmatization.
    4. Perform spell correction.
    5. Exclude stopwords.
    """
    combined_text = " ".join(sentences)
    lower_text = combined_text.lower()
    tokens = regex_tokenizer.tokenize(lower_text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    filtered_words = [speller(word) for word in lemmatized_words if word not in stopword_list]
    unique_vocab = list(set(filtered_words))
    return unique_vocab

def generate_sentence_vector(text, vocab):
    """
    Convert a given text into a vector representation based on a vocabulary.
    """
    words = nltk.word_tokenize(text.lower())
    processed_words = [lemmatizer.lemmatize(speller(word)) for word in words if word not in stopword_list]
    return [1 if vocab_word in processed_words else 0 for vocab_word in vocab]

# Input text (Updated)
text1 = "The cat is sleeping on the warm couch"
text2 = "I enjoy baking cookies on a cold afternoon"
text3 = "Tomorrow we will go hiking in the mountains"
texts = [text1, text2, text3]

# Build the vocabulary
vocabulary = create_vocab(texts)
print("Generated Vocabulary:", vocabulary)

# Convert each sentence into its corresponding vector
for text in texts:
    print(f"Text: '{text}'")
    print("Vector Representation:", generate_sentence_vector(text, vocabulary))


Generated Vocabulary: ['warm', 'baking', 'enjoy', 'afternoon', 'hiking', 'mountain', 'tomorrow', 'cat', 'cook', 'go', 'sleeping', 'couch', 'cold']
Text: 'The cat is sleeping on the warm couch'
Vector Representation: [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]
Text: 'I enjoy baking cookies on a cold afternoon'
Vector Representation: [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Text: 'Tomorrow we will go hiking in the mountains'
Vector Representation: [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shamb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shamb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shamb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
