In [1]:
def vectorize_data(X_train_processed, X_test_processed, vectorizer=None):
    """
    Convert preprocessed train and test text into numerical feature matrices.

    The vectorizer is fitted on the training text only and is then reused,
    unchanged, to transform the test text, so both outputs share one vocabulary
    (the same columns). With the default CountVectorizer, each column holds the
    count of one token learned from the training text.

    Parameters:
    X_train_processed (iterable of str, e.g. pd.Series): Preprocessed (for example
        lemmatized or stemmed) documents used to learn the vocabulary.
    X_test_processed (iterable of str, e.g. pd.Series): Preprocessed documents that
        are transformed with the vocabulary learned from the training documents.
    vectorizer (optional): Any object exposing ``fit_transform`` and ``transform``,
        such as a configured ``CountVectorizer`` or a ``TfidfVectorizer``.
        Defaults to None, in which case a ``CountVectorizer`` with default
        settings is created. The object passed in is fitted in place, so the
        caller can keep a reference to it to inspect the vocabulary or to
        transform further data later.

    Returns:
    train_X: Vectorized training data (a scipy.sparse matrix when a scikit-learn
        text vectorizer is used).
    test_X: Vectorized testing data, produced by the same fitted vectorizer.

    Most scikit-learn estimators accept the sparse output directly, so it usually
    does not need to be converted to a dense array.

    """
    if vectorizer is None:
        # Imported lazily: scikit-learn is only required for the default path.
        from sklearn.feature_extraction.text import CountVectorizer

        vectorizer = CountVectorizer()

    # Fit on the training documents only; the test documents are projected onto
    # the vocabulary learned here and never influence it.
    train_X = vectorizer.fit_transform(X_train_processed)
    test_X = vectorizer.transform(X_test_processed)

    return train_X, test_X


In [2]:
import pandas as pd
import numpy as np

# Smoke test for vectorize_data on a tiny, already-stemmed corpus.
# Two training documents supply the vocabulary; the single test document repeats
# the first training document, so all of its tokens are already in that vocabulary.
train_test = pd.Series(
    ['favorit book android dream electr sheep', 'arriv 1 00pm 4 30pm'],
    index=[0, 1],
)
test_test = pd.Series(['favorit book android dream electr sheep'], index=[0])

X_train, X_test = vectorize_data(train_test, test_test)

# Print the train matrix, a blank line, then the test matrix.
print(X_train, X_test, sep="\n\n")

  (0, 7)	1
  (0, 4)	1
  (0, 2)	1
  (0, 5)	1
  (0, 6)	1
  (0, 8)	1
  (1, 3)	1
  (1, 0)	1
  (1, 1)	1

  (0, 2)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
