# In [51]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin


class CarTextTransformation(BaseEstimator, TransformerMixin):
    """Bag-of-words transformer based on NLTK tokenization.

    ``fit`` collects the vocabulary of a corpus; ``transform`` maps every
    document to a ``{word: count}`` dictionary that has one entry per
    vocabulary word (0 when the word is absent from the document).

    Tokens are lowercased, and any token that is a stopword or is not purely
    alphabetic is dropped.

    Parameters
    ----------
    language : str, default='english'
        Name of the NLTK stopword list used for filtering.

    Attributes
    ----------
    language : the constructor argument, kept verbatim.
    vocab : set of str collected by the most recent ``fit`` (empty before).
    stopwords : set of str loaded once in ``__init__`` for ``language``.
    """

    def __init__(self, language='english'):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params(), clone() and repr() can read it back.
        self.language = language
        self.vocab = set()
        # Loaded here (not in fit) to keep the attribute available right
        # after construction; changing ``language`` later does not reload it.
        self.stopwords = set(stopwords.words(language))

    def _tokenize(self, doc):
        """Return the lowercase, alphabetic, non-stopword tokens of ``doc``."""
        return [
            token
            for token in word_tokenize(doc.lower())
            if token.isalpha() and token not in self.stopwords
        ]

    def fit(self, X, y=None):
        """Learn the vocabulary from the documents in ``X``.

        Parameters
        ----------
        X : iterable of str
            The documents to scan.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build into a fresh set so a second fit replaces the previous
        # vocabulary instead of silently accumulating words from older data.
        vocab = set()
        for doc in X:
            vocab.update(self._tokenize(doc))
        self.vocab = vocab
        return self

    def transform(self, X, y=None):
        """Convert each document in ``X`` to a word-count dictionary.

        Parameters
        ----------
        X : iterable of str
            The documents to convert.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        list of dict
            One ``{word: count}`` dictionary per document, with one key for
            every word in ``self.vocab``; keys are in sorted order.
        """
        # Sets of strings iterate in a hash-dependent order; sorting once
        # gives every document (and every run) the same key order.
        ordered_vocab = sorted(self.vocab)
        transformed_X = []
        for doc in X:
            # Counter returns 0 for words absent from the document.
            word_counts = Counter(self._tokenize(doc))
            transformed_X.append({w: word_counts[w] for w in ordered_vocab})
        return transformed_X

if __name__ == '__main__':
    # Demo: learn a vocabulary from three short sentences, then print the
    # bag-of-words dictionary produced for each of them.
    #
    # NOTE(review): the sentences are French while the transformer is built
    # with its default language ('english'), so French function words such
    # as 'est' and 'et' are kept as features - confirm whether
    # language='french' was intended.
    transformer = CarTextTransformation()

    car_text = [
        'La voiture est noire et spacieuse.',
        'Le bateau est blanc et rapide.',
        'il y a deux voiture jeep rouges',
    ]

    # fit() returns the transformer itself, so the two steps can be chained.
    bow_vectors = transformer.fit(car_text).transform(car_text)

    print(bow_vectors)


# Output (dictionary key order may differ between runs):
# [{'bateau': 0, 'noire': 1, 'deux': 0, 'il': 0, 'blanc': 0, 'spacieuse': 1, 'jeep': 0, 'est': 1, 'et': 1, 'le': 0, 'la': 1, 'voiture': 1, 'rapide': 0, 'rouges': 0}, {'bateau': 1, 'noire': 0, 'deux': 0, 'il': 0, 'blanc': 1, 'spacieuse': 0, 'jeep': 0, 'est': 1, 'et': 1, 'le': 1, 'la': 0, 'voiture': 0, 'rapide': 1, 'rouges': 0}, {'bateau': 0, 'noire': 0, 'deux': 1, 'il': 1, 'blanc': 0, 'spacieuse': 0, 'jeep': 1, 'est': 0, 'et': 0, 'le': 0, 'la': 0, 'voiture': 1, 'rapide': 0, 'rouges': 1}]
