# **Sparse Vectorization Representations**

In [None]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Load the cleaned review dataset. The path lives in one named constant so it
# can be changed in a single place (the default points at a Colab location).
DATA_PATH = '/content/dataset_cleaned.csv'  # update path as needed
df = pd.read_csv(DATA_PATH)

# Ensure the dataset contains the necessary column
if 'review_text' not in df.columns:
    raise ValueError("Dataset must contain a 'review_text' column.")

# Fail early on missing reviews: read_csv loads empty fields as NaN by default,
# and the text vectorizers fitted later on this column cannot process NaN
# documents, so a clear error here beats an opaque failure during fitting.
n_missing_reviews = int(df['review_text'].isna().sum())
if n_missing_reviews:
    raise ValueError(
        f"'review_text' contains {n_missing_reviews} missing value(s); "
        "drop or fill them before vectorizing."
    )

print(f"Number of reviews in the dataset: {len(df)}")

Number of reviews in the dataset: 11809


# 1. **Count Vectorizer**

In [98]:
# Configure the bag-of-words vectorizer: the vocabulary is capped at 3000 terms,
# and min_df / max_df prune terms by document frequency (see the scikit-learn
# documentation for their exact semantics).
count_vectorizer = CountVectorizer(
    max_features=3000,
    min_df=5,
    max_df=0.8,
)

# Learn the vocabulary from the reviews and encode them as a sparse count matrix
count_matrix = count_vectorizer.fit_transform(df['review_text'])

In [99]:
# Report the dimensions of the count matrix (one row per review, one column per term)
count_shape_message = f"Count Vectorizer Matrix Shape: {count_matrix.shape}"
print(count_shape_message)

Count Vectorizer Matrix Shape: (11809, 3000)


In [100]:
# Preview only the first ten vocabulary terms rather than the whole vocabulary
count_feature_preview = count_vectorizer.get_feature_names_out()[:10]
print(" Features from Count Vectorizer:", count_feature_preview)


 Features from Count Vectorizer: ['10pm' '1st' '2nd' '30am' '30pm' '3rd' '4th' '5am' '5min' '5th']


In [101]:
# Sparsity = percentage of matrix cells that hold no stored value.
# nnz is the number of stored entries; rows * cols is the total cell count.
count_rows, count_cols = count_matrix.shape
count_total_cells = float(count_rows * count_cols)
sparsity_count = (1 - count_matrix.nnz / count_total_cells) * 100
print(f"Count Vectorizer Sparsity: {sparsity_count:.2f}%")

Count Vectorizer Sparsity: 98.71%


In [102]:
# One-line recap of the count representation: matrix dimensions and sparsity
count_summary = f"Count Vectorizer: Shape = {count_matrix.shape}, Sparsity = {sparsity_count:.2f}%"
print(count_summary)


Count Vectorizer: Shape = (11809, 3000), Sparsity = 98.71%


# 2. **TF-IDF Vectorizer**

In [103]:
# Configure the TF-IDF vectorizer with the same vocabulary limits as the count
# vectorizer (3000-term cap, min_df / max_df document-frequency pruning) so the
# two representations are directly comparable.
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000,
    min_df=5,
    max_df=0.8,
)

# Learn the vocabulary from the reviews and encode them as a sparse TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review_text'])

In [104]:
# Report the dimensions of the TF-IDF matrix (one row per review, one column per term)
tfidf_shape_message = f"TF-IDF Vectorizer Matrix Shape: {tfidf_matrix.shape}"
print(tfidf_shape_message)

TF-IDF Vectorizer Matrix Shape: (11809, 3000)


In [105]:
# Preview only the first ten vocabulary terms rather than the whole vocabulary
tfidf_feature_preview = tfidf_vectorizer.get_feature_names_out()[:10]
print(" Features from TF-IDF Vectorizer:", tfidf_feature_preview)

 Features from TF-IDF Vectorizer: ['10pm' '1st' '2nd' '30am' '30pm' '3rd' '4th' '5am' '5min' '5th']


In [106]:
# Sparsity = percentage of matrix cells that hold no stored value.
# nnz is the number of stored entries; rows * cols is the total cell count.
tfidf_rows, tfidf_cols = tfidf_matrix.shape
tfidf_total_cells = float(tfidf_rows * tfidf_cols)
sparsity_tfidf = (1 - tfidf_matrix.nnz / tfidf_total_cells) * 100
print(f"TF-IDF Vectorizer Sparsity: {sparsity_tfidf:.2f}%")

TF-IDF Vectorizer Sparsity: 98.71%


In [107]:
# One-line recap of the TF-IDF representation: matrix dimensions and sparsity
tfidf_summary = f"TF-IDF Vectorizer: Shape = {tfidf_matrix.shape}, Sparsity = {sparsity_tfidf:.2f}%"
print(tfidf_summary)

TF-IDF Vectorizer: Shape = (11809, 3000), Sparsity = 98.71%
