In [None]:
# Efficient Methods for Visualizing the Sparse Term-Document Matrix

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

print("=== Method 1: Sparsity Pattern Visualization ===")
# Show where the non-zero counts sit in a small corner of the matrix, then
# zoom in on the most frequent terms for the first few documents.
# X_counts is used through the sparse-matrix interface (.toarray(), .nnz);
# presumably it is the output of count_vect — confirm against the cell that
# builds it.

# Clamp every window size to the matrix shape so the slices, titles and tick
# labels stay consistent on a corpus smaller than the default windows.
n_docs_total, n_terms_total = X_counts.shape
n_spy_docs = min(100, n_docs_total)
n_spy_terms = min(200, n_terms_total)
n_heat_docs = min(20, n_docs_total)
n_top_terms = min(50, n_terms_total)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Only this small window is converted to a dense array.
sample_matrix = X_counts[:n_spy_docs, :n_spy_terms].toarray()

# Method 1a: spy plot — one marker per non-zero entry of the window
ax1.spy(sample_matrix, markersize=0.5)
ax1.set_title(f'Sparsity Pattern ({n_spy_docs} docs × {n_spy_terms} terms)')
ax1.set_xlabel('Term Index')
ax1.set_ylabel('Document Index')

# Method 1b: heatmap of the most frequent terms
# Column sums give the total count of each term over all documents.
term_sums = np.array(X_counts.sum(axis=0)).flatten()
# argsort is ascending, so the tail holds the most frequent terms, ordered
# from least to most frequent.
top_term_indices = np.argsort(term_sums)[-n_top_terms:]
feature_names = count_vect.get_feature_names_out()
top_terms = feature_names[top_term_indices]

heatmap_data = X_counts[:n_heat_docs, top_term_indices].toarray()
sns.heatmap(heatmap_data,
            xticklabels=top_terms,
            yticklabels=[f'Doc {i}' for i in range(n_heat_docs)],
            cmap='YlOrRd',
            ax=ax2,
            cbar_kws={'label': 'Term Frequency'})
ax2.set_title(f'Heatmap: Top {n_top_terms} Terms × {n_heat_docs} Documents')
ax2.set_xlabel('Terms')
ax2.set_ylabel('Documents')
# Rotate the term labels on the heatmap axes explicitly instead of relying on
# whichever axes pyplot currently treats as active.
ax2.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

# The share of non-zero cells is the density; sparsity is its complement.
density_pct = X_counts.nnz / (n_docs_total * n_terms_total) * 100
print(f"Matrix density: {density_pct:.2f}% non-zero elements ({100 - density_pct:.2f}% sparse)")
print(f"Total vocabulary size: {len(feature_names)}")
print(f"Total documents: {n_docs_total}")

In [None]:
print("\n=== Method 2: Dimensionality Reduction Visualization ===")
# Project every document onto two latent components and colour the points by
# newsgroup category to see how well the categories separate.
#
# TruncatedSVD is used instead of PCA because it accepts the sparse matrix
# directly. The PCA version required X_counts.toarray(), which allocates a
# dense (documents × vocabulary) array. TruncatedSVD does not centre the
# features, so the axes are SVD (LSA) components rather than strict principal
# components, and the labels below say so.
svd = TruncatedSVD(n_components=2, random_state=42)
X_svd = svd.fit_transform(X_counts)

# Keep the names this cell previously defined, in case other cells use them.
pca, X_pca = svd, X_svd

target_names = twenty_train.target_names
colors = plt.cm.Set3(np.linspace(0, 1, len(target_names)))

fig_proj, ax_proj = plt.subplots(figsize=(12, 8))
for label_id, (color, target_name) in enumerate(zip(colors, target_names)):
    # twenty_train.target is compared against the position of the category
    # name, so each pass selects the documents of one category.
    mask = twenty_train.target == label_id
    ax_proj.scatter(X_svd[mask, 0], X_svd[mask, 1],
                    c=[color], label=target_name, alpha=0.7, s=30)

variance_ratio = svd.explained_variance_ratio_
ax_proj.set_xlabel(f'First SVD Component (explains {variance_ratio[0]:.1%} variance)')
ax_proj.set_ylabel(f'Second SVD Component (explains {variance_ratio[1]:.1%} variance)')
ax_proj.set_title('Document Clustering via Truncated SVD\n(Each point is a document, colors represent newsgroup categories)')
# Anchor the legend outside the axes so it does not cover the points.
ax_proj.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig_proj.tight_layout()
plt.show()

print(f"Truncated SVD explains {variance_ratio.sum():.1%} of total variance with 2 components")

In [None]:
print("\n=== Method 3: Term Frequency Analysis ===")
# Rank the vocabulary by total count over the corpus and chart the top terms.

# Column sums of the matrix give one corpus-wide total per vocabulary term.
term_frequencies = np.array(X_counts.sum(axis=0)).flatten()
feature_names = count_vect.get_feature_names_out()

# One row per term, most frequent first.
term_freq_df = pd.DataFrame({
    'term': feature_names,
    'frequency': term_frequencies
}).sort_values('frequency', ascending=False)

# Horizontal bar chart of the 20 most frequent terms
top_20_terms = term_freq_df.head(20)
bar_positions = range(len(top_20_terms))

fig_terms, ax_terms = plt.subplots(figsize=(12, 6))
ax_terms.barh(bar_positions, top_20_terms['frequency'])
ax_terms.set_yticks(bar_positions)
ax_terms.set_yticklabels(top_20_terms['term'])
ax_terms.set_xlabel('Total Frequency Across All Documents')
ax_terms.set_title('Top 20 Most Frequent Terms in the Corpus')
ax_terms.invert_yaxis()  # Put the highest frequency at the top
fig_terms.tight_layout()
plt.show()

# Count matching terms with a vectorised NumPy call; the builtin sum() would
# walk the boolean array one element at a time in Python.
n_singleton_terms = np.count_nonzero(term_frequencies == 1)
n_frequent_terms = np.count_nonzero(term_frequencies > 100)

print("Top 10 most frequent terms:")
print(term_freq_df.head(10).to_string(index=False))
print(f"\nTotal unique terms in vocabulary: {len(feature_names)}")
print(f"Terms appearing only once: {n_singleton_terms}")
print(f"Terms appearing more than 100 times: {n_frequent_terms}")

In [None]:
print("\n=== Method 4: Document Length and Sparsity Analysis ===")
# Per-document statistics taken from the rows of X_counts.

# Row sums give the total count per document; counting the positive entries
# of each row gives the number of distinct terms in that document.
doc_lengths = np.array(X_counts.sum(axis=1)).flatten()
doc_sparsity = np.array((X_counts > 0).sum(axis=1)).flatten()

# Summary numbers used by both the plots and the printed report.
mean_length = np.mean(doc_lengths)
mean_unique_terms = np.mean(doc_sparsity)
correlation = np.corrcoef(doc_lengths, doc_sparsity)[0, 1]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax1, ax2, ax3 = axes

# The two histograms share a layout and differ only in data, colour and text.
histogram_specs = [
    (ax1, doc_lengths, mean_length, 'skyblue',
     'Document Length (Total Words)', 'Distribution of Document Lengths'),
    (ax2, doc_sparsity, mean_unique_terms, 'lightcoral',
     'Unique Terms per Document', 'Distribution of Vocabulary Diversity'),
]
for axis, values, mean_value, fill_color, x_label, heading in histogram_specs:
    axis.hist(values, bins=50, alpha=0.7, color=fill_color, edgecolor='black')
    axis.set_xlabel(x_label)
    axis.set_ylabel('Number of Documents')
    axis.set_title(heading)
    # Dashed red line marks the mean of the plotted values.
    axis.axvline(mean_value, color='red', linestyle='--',
                 label=f'Mean: {mean_value:.1f}')
    axis.legend()

# Third panel: total count against distinct terms, one point per document
ax3.scatter(doc_lengths, doc_sparsity, alpha=0.6, s=20)
ax3.set_xlabel('Document Length (Total Words)')
ax3.set_ylabel('Unique Terms per Document')
ax3.set_title('Document Length vs Vocabulary Diversity')

# Correlation coefficient shown in a box in the upper-left corner of the axes
text_box = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.8}
ax3.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
         transform=ax3.transAxes, verticalalignment='top',
         bbox=text_box)

plt.tight_layout()
plt.show()

print(f"Average document length: {mean_length:.1f} words")
print(f"Average unique terms per document: {mean_unique_terms:.1f}")
print(f"Correlation between length and diversity: {correlation:.3f}")