DEMO TO Joyanthan Nanduri

Parsing

In [1]:
import os
import csv
import re

def parse_metadata_and_body(file_path):

    with open(file_path, 'r', encoding='latin1') as f:
        content = f.read()
    
    meta = {}
    lines = content.split("\n")
    body_start = 0
    for i, line in enumerate(lines):
        if ": " in line:
            key, value = line.split(": ", 1)
            meta[key.strip()] = value.strip()
        else:
            body_start = i + 1
            break

    body = "\n".join(lines[body_start:]).strip()
    return meta, body

def create_csv_from_newsgroups(data_dir, output_csv):
    
    fieldnames = ["id","label", "from", "subject",  "lines", "organization", "body"]
    
    with open(output_csv, mode='w', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        
        for category in os.listdir(data_dir):
            category_dir = os.path.join(data_dir, category)
            if not os.path.isdir(category_dir):
                continue
            
            for file_name in os.listdir(category_dir):
                file_path = os.path.join(category_dir, file_name)
                metadata, body = parse_metadata_and_body(file_path)
                
                writer.writerow({
                    "id": file_name,
                    "label": category,
                    "from": metadata.get("From", ""),
                    "subject": metadata.get("Subject", ""),
                    "lines": metadata.get("Lines", ""),
                    "organization": metadata.get("Organization", ""),
                    "body": body
                })

train_data_dir = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news-bydate/20news-bydate-train"
train_output_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_train.csv"

create_csv_from_newsgroups(train_data_dir, train_output_csv)

test_data_dir = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news-bydate/20news-bydate-test"
test_output_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_test.csv"

create_csv_from_newsgroups(test_data_dir, test_output_csv)

Train_Test_Split

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_train.csv"
test_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_test.csv"

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

combined_data = pd.concat([train_data, test_data], ignore_index=True)

combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"data number : {len(combined_data)}")


data number : 18846


In [3]:
combined_data.head()

Unnamed: 0,id,label,from,subject,lines,organization,body
0,76198,misc.forsale,Peter Todd Chan <pc1o+@andrew.cmu.edu>,*REDUCED* Sony CD Players 4 Sale,22.0,"Fifth yr. senior, Electrical and Computer Engi...",ITEM: Sony ES-CDPX229\nCONDITION: mint\nAGE: 1...
1,76341,talk.politics.mideast,warren@nysernet.org (Warren Burstein),"Re: To be exact, 2.5 million Muslims were exte...",34.0,"NYSERNet, Inc.",ac = In <9304202017@zuma.UUCP> sera@zuma.UUCP ...
2,60428,comp.sys.ibm.pc.hardware,richk@grebyn.com (Richard Krehbiel),Re: IDE vs SCSI,12.0,"Grebyn Timesharing, Inc.",In article <1qm5c9$6on@hcx1.ssd.csd.harris.com...
3,179009,talk.politics.misc,rodger-scoggin@ksc.nasa.gov (Rodger C. Scoggin),Re: The earth also pollutes & some scientists ...,32.0,,In article <C5uDn9.Gr@ncratl.AtlantaGA.NCR.COM...
4,178545,talk.politics.misc,smith@phoneme.harvard.edu (Steven Smith),The Manitoban Candidate,,"Harvard Robotics Lab, Harvard University",Lines: 18\n\nbross@sandbanks.cosc.brocku.ca (B...


Normalization, Tfidf vectorize

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

data_20 = combined_data.dropna(subset = ['body']).reset_index(drop = True)
train_documents = data_20['body'].reset_index(drop=True)
train_labels = data_20['label'].reset_index(drop=True)

vectorizer = TfidfVectorizer(
    max_features=10000, 
    lowercase=True, 
    stop_words="english", 
    max_df=0.8, 
    min_df=5
)
data_20_vecotorize = vectorizer.fit_transform(train_documents)

In [5]:
print(data_20_vecotorize.shape)
print(train_labels.shape)

(18817, 10000)
(18817,)


In [1]:
import numpy as np
from tqdm import tqdm
from scipy.sparse import csr_matrix, issparse
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, issparse



def compute_kmeans_objective(data, labels, centroids):
 
    labels = labels.astype(int)
    sq_distances = np.sum((data - centroids[labels]) ** 2, axis=1)
    return np.sum(sq_distances)

def compute_purity_gini(pred_labels, true_labels):

    from collections import Counter
    total = len(true_labels)
    cluster_counts = {}
    
    for p, t in zip(pred_labels, true_labels):
        if p not in cluster_counts:
            cluster_counts[p] = []
        cluster_counts[p].append(t)
    
    purity_sum, gini_sum = 0.0, 0.0
    for cluster, items in cluster_counts.items():
        count = len(items)
        label_counts = Counter(items)
        max_count = max(label_counts.values())
        purity_sum += max_count
        
        # Gini index for the cluster
        gini = 1.0 - sum((c / count) ** 2 for c in label_counts.values())
        gini_sum += gini * count
    
    purity = purity_sum / total
    gini_index = gini_sum / total
    return purity, gini_index

def kman_clustering_batch(data, train_labels, k, batch_size, max_iter=100, random_state=42):
    np.random.seed(random_state)
    
    n_samples, n_features = data.shape
    
    indices = np.random.choice(n_samples, k, replace=False)
    centroids = data[indices].toarray().astype(float)  # Convert to dense

    # Initialize cumulative sum and count for each centroid
    centroid_sums = np.zeros((k, n_features))
    centroid_counts = np.zeros(k, dtype=int)
    
    for iteration in tqdm(range(max_iter), desc="Clustering Progress"):
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_data = data[start:end].toarray()  # Convert to dense

            if batch_data.shape[0] == 0:
                continue
            
            # Compute distances between batch_data and centroids
            distances = np.linalg.norm(batch_data[:, np.newaxis] - centroids, axis=2)
            labels_batch = np.argmin(distances, axis=1)  # shape: (batch_size,)
            
            # Update centroid sums and counts
            for i in range(k):
                points_in_cluster = batch_data[labels_batch == i]
                n_points = points_in_cluster.shape[0]
                if n_points > 0:
                    centroid_sums[i] += points_in_cluster.sum(axis=0)
                    centroid_counts[i] += n_points
        
        # Update centroids
        for i in range(k):
            if centroid_counts[i] > 0:
                centroids[i] = centroid_sums[i] / centroid_counts[i]
            else:
                # Reinitialize centroid to a random data point to handle empty cluster
                random_idx = np.random.choice(n_samples)
                centroids[i] = data[random_idx].toarray().astype(float)
                print(f"Reinitialized centroid {i} to a random data point.")
        
        # Reset sums and counts for next iteration
        centroid_sums[:] = 0
        centroid_counts[:] = 0
    
    
    labels_all = np.empty(n_samples, dtype=int)
    for start in tqdm(range(0, n_samples, batch_size), desc="Final Labeling"):
        end = min(start + batch_size, n_samples)
        batch_data = data[start:end].toarray() 
        distances = np.linalg.norm(batch_data[:, np.newaxis] - centroids, axis=2)
        labels_batch = np.argmin(distances, axis=1)
        labels_all[start:end] = labels_batch
    
    return labels_all, centroids, train_labels


In [None]:
for d, lbl in [(data_20_vecotorize, train_labels)]:
    for k in [10, 20, 40]:
        print(f"\n K={k}")
        pred_labels, centroids, train_labels_fin = kman_clustering_batch(
            d, lbl, k, batch_size=1000, max_iter=10, random_state=42
        )
        # Compute metrics
        d_dense = d.toarray()
        obj = compute_kmeans_objective(d_dense, pred_labels, centroids)
        purity, gini = compute_purity_gini(pred_labels, train_labels_fin)
        print(f"Objective={obj:.2f}, Purity={purity:.4f}, Gini={gini:.4f}")

print("Original Labels: ", train_labels[:10])  
print("Unshuffled Labels: ", train_labels_fin[:10])  


 K=10


Clustering Progress: 100%|██████████| 10/10 [00:53<00:00,  5.38s/it]
Final Labeling: 100%|██████████| 19/19 [00:04<00:00,  4.18it/s]


Objective=18181.92, Purity=0.3219, Gini=0.7652

 K=20


Clustering Progress: 100%|██████████| 10/10 [03:54<00:00, 23.41s/it]
Final Labeling: 100%|██████████| 19/19 [00:22<00:00,  1.18s/it]


Objective=17988.05, Purity=0.3905, Gini=0.7084

 K=40


Clustering Progress: 100%|██████████| 10/10 [10:56<00:00, 65.65s/it]
Final Labeling: 100%|██████████| 19/19 [01:26<00:00,  4.55s/it]


Objective=17727.22, Purity=0.4483, Gini=0.6511
Original Labels:  0                misc.forsale
1       talk.politics.mideast
2    comp.sys.ibm.pc.hardware
3          talk.politics.misc
4          talk.politics.misc
5             rec.motorcycles
6                     sci.med
7                   sci.crypt
8          talk.politics.guns
9              comp.windows.x
Name: label, dtype: object
Unshuffled Labels:  0                misc.forsale
1       talk.politics.mideast
2    comp.sys.ibm.pc.hardware
3          talk.politics.misc
4          talk.politics.misc
5             rec.motorcycles
6                     sci.med
7                   sci.crypt
8          talk.politics.guns
9              comp.windows.x
Name: label, dtype: object


Purity Increase when K increase, Gini impurity decrease when K increase