In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from scipy.sparse import csr_matrix

In [2]:
# ------------------------------------------------------------
# Assignment 3: Text Documents, TF-IDF, CSR, SVD and KMeans
# ------------------------------------------------------------
# This script:
# 1. Defines 5 short text documents.
# 2. Builds the vocabulary and prints the index of each term.
# 3. Constructs the term-document frequency matrix using CountVectorizer.
# 4. Computes the TF-IDF matrix (dense and CSR/sparse forms).
# 5. Applies 4 different (SVD + KMeans) pipelines and prints the clusters.
# All steps use only tools and style consistent with the class notes.
# ------------------------------------------------------------

In [3]:
# 1) Corpus: five short text documents, plus a display label for each one.
docs = [
    "This session talks about CSR",
    "SVD works with CSR",
    "Kmeans clustering can be used in a pipeline",
    "pipeline can have any clustering approach",
    "TFIDF needs CSR",
]

# Labels Doc1..Doc5, positionally aligned with `docs`.
titles = [f"Doc{position}" for position in range(1, len(docs) + 1)]

print("=== DOCUMENTS ===")
for label, text in zip(titles, docs):
    print(label, text, sep=": ")
print("\n")

=== DOCUMENTS ===
Doc1: This session talks about CSR
Doc2: SVD works with CSR
Doc3: Kmeans clustering can be used in a pipeline
Doc4: pipeline can have any clustering approach
Doc5: TFIDF needs CSR




In [4]:
# ------------------------------------------------------------
# 2) Term index using TF-IDF vocabulary
# ------------------------------------------------------------
# Fitting a TfidfVectorizer on the corpus learns the vocabulary and returns
# the TF-IDF weights in the same call. A term's position in
# get_feature_names_out() is its column index in that matrix.

tf = TfidfVectorizer()
tfidf_csr = tf.fit_transform(docs)      # sparse (CSR) TF-IDF matrix
terms = tf.get_feature_names_out()      # vocabulary, ordered by column index

print("=== TERM INDEX (from TfidfVectorizer) ===")
for column, token in enumerate(terms):
    print(column, "->", token)
print("\n")

=== TERM INDEX (from TfidfVectorizer) ===
0 -> about
1 -> any
2 -> approach
3 -> be
4 -> can
5 -> clustering
6 -> csr
7 -> have
8 -> in
9 -> kmeans
10 -> needs
11 -> pipeline
12 -> session
13 -> svd
14 -> talks
15 -> tfidf
16 -> this
17 -> used
18 -> with
19 -> works




In [5]:
# ------------------------------------------------------------
# 3) Term-Document Frequency Matrix (CountVectorizer)
# ------------------------------------------------------------
# CountVectorizer produces raw term counts.
# Layout: one row per document, one column per vocabulary term.

cv = CountVectorizer()
freq_csr = cv.fit_transform(docs)

freq_terms = cv.get_feature_names_out()   # column labels for the counts
freq_array = freq_csr.toarray()           # dense copy, for printing

# Print each section as: heading, content, blank spacer.
for heading, payload in (
    ("=== COUNT VECTORIZER VOCABULARY ===", cv.vocabulary_),
    ("=== FREQUENCY MATRIX (array) ===", freq_array),
):
    print(heading)
    print(payload)
    print("\n")

=== COUNT VECTORIZER VOCABULARY ===
{'this': 16, 'session': 12, 'talks': 14, 'about': 0, 'csr': 6, 'svd': 13, 'works': 19, 'with': 18, 'kmeans': 9, 'clustering': 5, 'can': 4, 'be': 3, 'used': 17, 'in': 8, 'pipeline': 11, 'have': 7, 'any': 1, 'approach': 2, 'tfidf': 15, 'needs': 10}


=== FREQUENCY MATRIX (array) ===
[[1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1]
 [0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0]
 [0 1 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0]]




In [6]:
# Label the raw counts (rows = document titles, columns = terms) so the
# frequency table is readable.
freq_df = pd.DataFrame(data=freq_array, index=titles, columns=freq_terms)

# One call, newline-separated: heading, table, then the blank spacer.
print("=== FREQUENCY MATRIX (DataFrame) ===", freq_df, "\n", sep="\n")

=== FREQUENCY MATRIX (DataFrame) ===
      about  any  approach  be  can  clustering  csr  have  in  kmeans  needs  \
Doc1      1    0         0   0    0           0    1     0   0       0      0   
Doc2      0    0         0   0    0           0    1     0   0       0      0   
Doc3      0    0         0   1    1           1    0     0   1       1      0   
Doc4      0    1         1   0    1           1    0     1   0       0      0   
Doc5      0    0         0   0    0           0    1     0   0       0      1   

      pipeline  session  svd  talks  tfidf  this  used  with  works  
Doc1         0        1    0      1      0     1     0     0      0  
Doc2         0        0    1      0      0     0     0     1      1  
Doc3         1        0    0      0      0     0     1     0      0  
Doc4         1        0    0      0      0     0     0     0      0  
Doc5         0        0    0      0      1     0     0     0      0  




In [7]:
# ------------------------------------------------------------
# 4) TF-IDF Matrix: Dense + CSR (Sparse) Representations
# ------------------------------------------------------------
# tfidf_csr is the sparse matrix returned by TfidfVectorizer. It is shown
# in two forms:
# - a dense, labelled DataFrame (easy to read and interpret),
# - the sparse matrix printed directly (to emphasize the CSR representation).

tfidf_dense = tfidf_csr.toarray()
tfidf_df = pd.DataFrame(data=tfidf_dense, index=titles, columns=terms)

# Print each view as: heading, content, blank spacer.
for heading, payload in (
    ("=== TF-IDF MATRIX (Dense, DataFrame) ===", tfidf_df),
    ("=== TF-IDF MATRIX (CSR / Sparse representation) ===", tfidf_csr),
):
    print(heading)
    print(payload)
    print("\n")

=== TF-IDF MATRIX (Dense, DataFrame) ===
         about       any  approach        be       can  clustering       csr  \
Doc1  0.474125  0.000000  0.000000  0.000000  0.000000    0.000000  0.317527   
Doc2  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000  0.360638   
Doc3  0.000000  0.000000  0.000000  0.409865  0.330677    0.330677  0.000000   
Doc4  0.000000  0.449342  0.449342  0.000000  0.362526    0.362526  0.000000   
Doc5  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000  0.427993   

          have        in    kmeans    needs  pipeline   session       svd  \
Doc1  0.000000  0.000000  0.000000  0.00000  0.000000  0.474125  0.000000   
Doc2  0.000000  0.000000  0.000000  0.00000  0.000000  0.000000  0.538498   
Doc3  0.000000  0.409865  0.409865  0.00000  0.330677  0.000000  0.000000   
Doc4  0.449342  0.000000  0.000000  0.00000  0.362526  0.000000  0.000000   
Doc5  0.000000  0.000000  0.000000  0.63907  0.000000  0.000000  0.000000   

         talks 

In [8]:
# Round trip: rebuild a CSR matrix from the dense TF-IDF array to illustrate
# that the same sparse structure comes back.
tfidf_csr_explicit = csr_matrix(tfidf_dense)

# One call, newline-separated: heading, matrix, then the blank spacer.
print(
    "=== EXPLICIT CSR FROM DENSE TF-IDF (should match above structurally) ===",
    tfidf_csr_explicit,
    "\n",
    sep="\n",
)

=== EXPLICIT CSR FROM DENSE TF-IDF (should match above structurally) ===
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 25 stored elements and shape (5, 20)>
  Coords	Values
  (0, 0)	0.4741246485558491
  (0, 6)	0.31752680284846835
  (0, 12)	0.4741246485558491
  (0, 14)	0.4741246485558491
  (0, 16)	0.4741246485558491
  (1, 6)	0.3606383263504801
  (1, 13)	0.5384979101064753
  (1, 18)	0.5384979101064753
  (1, 19)	0.5384979101064753
  (2, 3)	0.40986538560224284
  (2, 4)	0.33067681238156543
  (2, 5)	0.33067681238156543
  (2, 8)	0.40986538560224284
  (2, 9)	0.40986538560224284
  (2, 11)	0.33067681238156543
  (2, 17)	0.40986538560224284
  (3, 1)	0.4493418549869351
  (3, 2)	0.4493418549869351
  (3, 4)	0.36252617931707143
  (3, 5)	0.36252617931707143
  (3, 7)	0.4493418549869351
  (3, 11)	0.36252617931707143
  (4, 6)	0.42799292268317357
  (4, 10)	0.6390704413963749
  (4, 15)	0.6390704413963749




In [9]:
# ------------------------------------------------------------
# 5) SVD + KMeans Pipelines (4 scenarios)
# ------------------------------------------------------------
# We now reduce dimensionality of the TF-IDF matrix using TruncatedSVD,
# then cluster the documents using KMeans.
#
# Four scenarios:
#   1) SVD with 2 components, KMeans with 2 clusters
#   2) SVD with 3 components, KMeans with 2 clusters
#   3) SVD with 2 components, KMeans with 3 clusters
#   4) SVD with 3 components, KMeans with 3 clusters
#
# For each scenario, we:
# - build a pipeline: [TruncatedSVD -> KMeans]
# - fit on the TF-IDF matrix
# - predict cluster labels for each document
# - print a table with document, scenario, and assigned cluster

In [10]:
# Each entry is (n_components for TruncatedSVD, n_clusters for KMeans).
# The cluster count varies in the outer loop and the SVD component count in
# the inner one, which yields the order (2, 2), (3, 2), (2, 3), (3, 3).
scenarios = [
    (n_components, n_clusters)
    for n_clusters in (2, 3)
    for n_components in (2, 3)
]


In [12]:
results = list()  # collects one result DataFrame per scenario

In [18]:

print("=== SVD + KMeans CLUSTERING RESULTS ===")

# Start from an empty list on every run of this cell. Without this reset,
# re-executing the cell keeps appending to whatever earlier runs stored,
# leaving duplicate scenario frames in `results`.
results = []

for n_comp, n_clusters in scenarios:
    # Seed both estimators: TruncatedSVD's default solver is randomized, so
    # seeding KMeans alone does not make the run repeatable. n_init is pinned
    # because its default value differs between scikit-learn releases.
    svd = TruncatedSVD(n_components=n_comp, random_state=42)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

    # Pipeline: reduce the TF-IDF features with SVD, then cluster with KMeans
    pipe = make_pipeline(svd, kmeans)

    # Fit on the TF-IDF features and obtain one cluster label per document
    # in a single pass (avoids running the SVD transform a second time).
    labels = pipe.fit_predict(tfidf_csr)

    # Build result DataFrame for this scenario; the scalar "Scenario" string
    # is broadcast to every row.
    df_result = pd.DataFrame({
        "Title": titles,
        "Document": docs,
        "Scenario": f"SVD={n_comp}, KMeans={n_clusters}",
        "Cluster": labels
    })

    # Group rows by cluster label. A stable sort keeps documents that share
    # a cluster in their original Doc1..Doc5 order.
    df_sorted = df_result.sort_values("Cluster", kind="stable")

    print(df_sorted)
    print("-" * 60)

    # Keep the unsorted frame for potential further analysis
    results.append(df_result)



=== SVD + KMeans CLUSTERING RESULTS ===
  Title                                     Document         Scenario  Cluster
0  Doc1                 This session talks about CSR  SVD=2, KMeans=2        0
1  Doc2                           SVD works with CSR  SVD=2, KMeans=2        0
4  Doc5                              TFIDF needs CSR  SVD=2, KMeans=2        0
2  Doc3  Kmeans clustering can be used in a pipeline  SVD=2, KMeans=2        1
3  Doc4    pipeline can have any clustering approach  SVD=2, KMeans=2        1
------------------------------------------------------------
  Title                                     Document         Scenario  Cluster
1  Doc2                           SVD works with CSR  SVD=3, KMeans=2        0
4  Doc5                              TFIDF needs CSR  SVD=3, KMeans=2        0
2  Doc3  Kmeans clustering can be used in a pipeline  SVD=3, KMeans=2        1
0  Doc1                 This session talks about CSR  SVD=3, KMeans=2        1
3  Doc4    pipeline can have a

