Parsing

In [3]:
import os
import csv
import re

def parse_metadata_and_body(file_path):

    with open(file_path, 'r', encoding='latin1') as f:
        content = f.read()
    
    meta = {}
    lines = content.split("\n")
    body_start = 0
    for i, line in enumerate(lines):
        if ": " in line:
            key, value = line.split(": ", 1)
            meta[key.strip()] = value.strip()
        else:
            body_start = i + 1
            break

    body = "\n".join(lines[body_start:]).strip()
    return meta, body

def create_csv_from_newsgroups(data_dir, output_csv):
    
    fieldnames = ["id","label", "from", "subject",  "lines", "organization", "body"]
    
    with open(output_csv, mode='w', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        
        for category in os.listdir(data_dir):
            category_dir = os.path.join(data_dir, category)
            if not os.path.isdir(category_dir):
                continue
            
            for file_name in os.listdir(category_dir):
                file_path = os.path.join(category_dir, file_name)
                metadata, body = parse_metadata_and_body(file_path)
                
                writer.writerow({
                    "id": file_name,
                    "label": category,
                    "from": metadata.get("From", ""),
                    "subject": metadata.get("Subject", ""),
                    "lines": metadata.get("Lines", ""),
                    "organization": metadata.get("Organization", ""),
                    "body": body
                })

train_data_dir = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news-bydate/20news-bydate-train"
train_output_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_train.csv"

create_csv_from_newsgroups(train_data_dir, train_output_csv)

test_data_dir = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news-bydate/20news-bydate-test"
test_output_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_test.csv"

create_csv_from_newsgroups(test_data_dir, test_output_csv)

Train_Test_Split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_train.csv"
test_csv = "/Users/hunjunsin/Desktop/Jun/Unsupervised/hw1/20news_test.csv"

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

combined_data = pd.concat([train_data, test_data], ignore_index=True)

combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)


train_data, temp_data = train_test_split(combined_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Train : {len(train_data)}")
print(f"Validation : {len(val_data)}")
print(f"Test : {len(test_data)}")

train_data.to_csv("/Users/hunjunsin/Desktop/Jun/Unsupervised/20news_train_split.csv", index=False)
val_data.to_csv("/Users/hunjunsin/Desktop/Jun/Unsupervised/20news_val_split.csv", index=False)
test_data.to_csv("/Users/hunjunsin/Desktop/Jun/Unsupervised/20news_test_split.csv", index=False)


Train : 15076
Validation : 1885
Test : 1885


In [5]:
train_data.head()

Unnamed: 0,id,label,from,subject,lines,organization,body
12056,61084,sci.space,tffreeba@indyvax.iupui.edu,Re: PLANETS STILL: IMAGES ORBIT BY ETHER TWIST,3.0,,They must be shipping that good Eau Clair acid...
10359,38744,comp.graphics,tmc@spartan.ac.BrockU.CA (Tim Ciceran),Re: MPEG Location,21.0,"Brock University, St. Catharines Ontario",Alan Jackson (ajackson@cch.coventry.ac.uk) wro...
1266,54035,rec.sport.hockey,John Michael Santore <jsbh+@andrew.cmu.edu>,Re: Hockey guest spots...,28.0,"Sophomore, Mathematics, Carnegie Mellon, Pitts...",>Hi guys....\n> I'm looking to answe a ...
4602,53559,sci.electronics,johnh@macadam.mpce.mq.edu.au (John Haddy),Re: what to do with old 256k SIMMs?,14.0,Macquarie University,"In article <120466@netnews.upenn.edu>, jhaines..."
7341,15842,sci.crypt,n9045178@henson.cc.wwu.edu (Sean Dean),Re: Does Rush read his E-mail?,23.0,Western Washington University,rick@ee.uwm.edu (Rick Miller) writes:\n\n>rsil...


Normalization, Tfidf vectorize

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_20 = train_data.dropna(subset = ['body']).reset_index(drop = True)
train_documents = train_20['body'].reset_index(drop=True)

vectorizer = TfidfVectorizer(
    max_features=10000, 
    lowercase=True, 
    stop_words="english", 
    max_df=0.8, 
    min_df=5
)
train_tfidf_matrix = vectorizer.fit_transform(train_documents)

In [7]:
val_20 = val_data.dropna(subset = ['body']).reset_index(drop = True)
test_20 = test_data.dropna(subset = ['body']).reset_index(drop = True)

val_documents = val_20['body'].reset_index(drop=True)
test_documents = test_20['body'].reset_index(drop=True)

val_tfidf_matrix = vectorizer.transform(val_documents)
test_tfidf_matrix = vectorizer.transform(test_documents)

In [8]:
print(train_tfidf_matrix.shape)
print(val_tfidf_matrix.shape)
print(test_tfidf_matrix.shape)

(15053, 10000)
(1879, 10000)
(1885, 10000)


Euclidean Distance

In [44]:
from tqdm import tqdm
import numpy as np

def euc_dis(x, y):
    x_dense = x.toarray() 
    y_dense = y.toarray() 
    return np.sqrt(np.sum((x_dense[:, np.newaxis, :] - y_dense[np.newaxis, :, :])**2, axis=-1))

def batch_euclidean_dis(train_data, other_data, batch_size):
    n_train, n_features = train_data.shape
    n_other = other_data.shape[0]
    
    distance_matrix = np.zeros((n_train, n_other))
    
    for i in tqdm(range(0, n_train, batch_size), desc="Train Batches"):
        for j in range(0, n_other, batch_size):
            batch_train = train_data[i:i + batch_size]
            batch_other = other_data[j:j + batch_size]
            
            distances = euc_dis(batch_train, batch_other)
            distance_matrix[i:i + batch_size, j:j + batch_size] = distances
    
    return distance_matrix

train_val_distances = batch_euclidean_dis(train_tfidf_matrix, val_tfidf_matrix, batch_size=100)
train_test_distances = batch_euclidean_dis(train_tfidf_matrix, test_tfidf_matrix, batch_size=100)

print("Train-Validation Distance Matrix Shape:", train_val_distances.shape)
print("Train-Test Distance Matrix Shape:", train_test_distances.shape)

Processing Train Batches: 100%|██████████| 151/151 [07:13<00:00,  2.87s/it]
Processing Train Batches: 100%|██████████| 151/151 [06:56<00:00,  2.76s/it]

Train-Validation Distance Matrix Shape: (15053, 1879)
Train-Test Distance Matrix Shape: (15053, 1885)





KNN Prediction

In [None]:
def knn_predict(train_labels, distance_matrix, k):
    n_queries = distance_matrix.shape[1]  
    predictions = []

    for i in tqdm(range(n_queries)):
        
        k_nearest_indices = np.argsort(distance_matrix[:, i])[:k]
        k_nearest_labels = train_labels[k_nearest_indices]
        
        unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
        predicted_label = unique_labels[np.argmax(counts)]
        predictions.append(predicted_label)
    
    return np.array(predictions)

def find_best_k(train_labels, val_labels, train_val_distances, k_values):
    best_k = None
    best_accuracy = 0

    for k in k_values:
        val_predictions = knn_predict(train_labels, train_val_distances, k)
        
        accuracy = np.mean(val_predictions == val_labels)
        print(f"k={k}, Validation Accuracy={accuracy:.4f}")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
    
    return best_k, best_accuracy

def compute_test_accuracy(train_labels, test_labels, train_test_distances, best_k):
    test_predictions = knn_predict(train_labels, train_test_distances, best_k)
    accuracy = np.mean(test_predictions == test_labels)
    return accuracy

In [None]:
k_values = [1, 3, 5, 7, 9]
best_k, best_val_accuracy = find_best_k(train_20['label'], val_20['label'], train_val_distances, k_values)
print(f"Best k: {best_k}, Best Validation Accuracy: {best_val_accuracy:.4f}")

test_accuracy = compute_test_accuracy(train_20['label'], test_20['label'], train_test_distances, best_k)
print(f"Test Accuracy with k={best_k}: {test_accuracy:.4f}")

k=1, Validation Accuracy=0.5061
k=3, Validation Accuracy=0.3566
k=5, Validation Accuracy=0.3204
k=7, Validation Accuracy=0.3960
k=9, Validation Accuracy=0.7664
Best k: 9, Best Validation Accuracy: 0.7664
Test Accuracy with k=9: 0.7523


In [9]:
train_tfidf_matrix = train_tfidf_matrix.toarray()
val_tfidf_matrix = val_tfidf_matrix.toarray()
test_tfidf_matrix = test_tfidf_matrix.toarray()

Cosine similarity

In [10]:
from tqdm import tqdm
import numpy as np

def cosine_sim(x, y):
    
    norm_x = np.linalg.norm(x, axis=-1, keepdims=True) 
    norm_y = np.linalg.norm(y, axis=-1, keepdims=True) 

    dot_product = np.dot(x, y.T)  

    cosine_sim = dot_product / (norm_x * norm_y.T + 1e-8)  
    return cosine_sim

def batch_cosine_sim(train_data, other_data, batch_size=100):

    n_train = train_data.shape[0]
    n_other = other_data.shape[0]
    
    similarity_mat = np.zeros((n_train, n_other))  

    for i in tqdm(range(0, n_train, batch_size), desc="Train Batches"):
        for j in range(0, n_other, batch_size):
            batch_train = train_data[i:i + batch_size]
            batch_other = other_data[j:j + batch_size]
            
            similarities = cosine_sim(batch_train, batch_other)
            similarity_mat[i:i + batch_size, j:j + batch_size] = similarities
    
    return similarity_mat

train_val_similarities = batch_cosine_sim(train_tfidf_matrix, val_tfidf_matrix, batch_size=100)
train_test_similarities = batch_cosine_sim(train_tfidf_matrix, test_tfidf_matrix, batch_size=100)

print("Train-Validation Similarity Matrix Shape:", train_val_similarities.shape)
print("Train-Test Similarity Matrix Shape:", train_test_similarities.shape)

Processing Train Batches: 100%|██████████| 151/151 [00:09<00:00, 15.34it/s]
Processing Train Batches: 100%|██████████| 151/151 [00:09<00:00, 16.14it/s]

Train-Validation Similarity Matrix Shape: (15053, 1879)
Train-Test Similarity Matrix Shape: (15053, 1885)



