In [24]:
# Toy corpus used by the vectoriser demos in this notebook: four short
# sentences that share most of their vocabulary.
text_documents = [
    # Contains a contraction ("Let's") and an "-ing" form ("documenting").
    "This is the first document. Let's continue documenting",
    # Uses "document" twice.
    "This document is the second document.",
    # Uses the plural "documents".
    "And this is the third one from all documents.",
    # Re-uses words from the first entry, phrased as a question.
    "Is this the first document?",
]

def countVectoriserDefault(text_documents):
    """Vectorise a corpus with an unconfigured ``CountVectorizer`` and print the result.

    Args:
        text_documents: The corpus to vectorise; it is handed straight to
            ``CountVectorizer.fit_transform`` (the notebook passes a list
            of strings).

    Returns:
        None. The learned feature names and the dense token-count matrix
        are written to stdout.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    # All-default vectoriser: learn the vocabulary and count tokens in one call.
    bag_of_words = CountVectorizer()
    doc_term_counts = bag_of_words.fit_transform(text_documents)

    # The feature names label the columns of the count matrix.
    vocabulary = bag_of_words.get_feature_names_out()
    print('Corpus tokens:\n', vocabulary)

    # Convert to a dense array so every cell of the matrix is printed.
    dense_counts = doc_term_counts.toarray()
    print('Vectorised Documents:\n', dense_counts)

In [25]:
def countVectoriserStemming(text_documents, token_regex=r'\b\w+\b'):
    """Vectorise a corpus with ``CountVectorizer`` after Porter-stemming each token, and print the result.

    Each document is split into word tokens with ``token_regex``, every
    token is lower-cased and reduced to its Porter stem, and the stems are
    what ``CountVectorizer`` counts.

    Args:
        text_documents: The corpus to vectorise; it is handed straight to
            ``CountVectorizer.fit_transform`` (the notebook passes a list
            of strings).
        token_regex: Regular expression used with ``findall`` to pull
            tokens out of a document; it should not contain capturing
            groups. The default, ``r'\\b\\w+\\b'``, keeps one-character
            tokens, so a contraction such as "Let's" contributes a
            stand-alone ``'s'`` feature. Pass ``r'\\b\\w\\w+\\b'`` to require
            at least two word characters, which mirrors the default
            ``token_pattern`` of ``CountVectorizer`` and makes the output
            differ from the un-stemmed vectoriser only by the stemming.

    Returns:
        None. The learned feature names and the dense token-count matrix
        are written to stdout.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.stem import PorterStemmer
    import re

    stemmer = PorterStemmer()
    # Compile once here rather than resolving the pattern on every tokenizer call.
    word_re = re.compile(token_regex)

    def tokenize_and_stem(text):
        """Return the Porter stems of the lower-cased word tokens found in ``text``."""
        return [stemmer.stem(word.lower()) for word in word_re.findall(text)]

    # The built-in token_pattern is not used once a custom tokenizer is given;
    # passing None states that explicitly (scikit-learn warns otherwise).
    vectorizer = CountVectorizer(tokenizer=tokenize_and_stem, token_pattern=None)

    # Learn the stemmed vocabulary and count its occurrences per document.
    matrix = vectorizer.fit_transform(text_documents)

    # The feature names label the columns of the count matrix.
    print('Corpus tokens:\n', vectorizer.get_feature_names_out())

    # Convert to a dense array so every cell of the matrix is printed.
    print('Vectorised Documents:\n', matrix.toarray())

In [26]:
# Default tokenisation: print the vocabulary and count matrix for the sample corpus.
countVectoriserDefault(text_documents)

Corpus tokens:
 ['all' 'and' 'continue' 'document' 'documenting' 'documents' 'first'
 'from' 'is' 'let' 'one' 'second' 'the' 'third' 'this']
Vectorised Documents:
 [[0 0 1 1 1 0 1 0 1 1 0 0 1 0 1]
 [0 0 0 2 0 0 0 0 1 0 0 1 1 0 1]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1 1]
 [0 0 0 1 0 0 1 0 1 0 0 0 1 0 1]]


In [27]:
# Same corpus, with every token Porter-stemmed before counting.
countVectoriserStemming(text_documents)

Corpus tokens:
 ['all' 'and' 'continu' 'document' 'first' 'from' 'is' 'let' 'one' 's'
 'second' 'the' 'thi' 'third']
Vectorised Documents:
 [[0 0 1 2 1 0 1 1 0 1 0 1 1 0]
 [0 0 0 2 0 0 1 0 0 0 1 1 1 0]
 [1 1 0 1 0 1 1 0 1 0 0 1 1 1]
 [0 0 0 1 1 0 1 0 0 0 0 1 1 0]]
