In [58]:
from sklearn.datasets import fetch_20newsgroups

In [59]:
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']

In [60]:
groups = fetch_20newsgroups(subset='all', categories=categories)

In [61]:
labels = groups.target
label_names = groups.target_names

def is_letter_only(word):
    return word.isalpha()


In [62]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

all_names = set(name.lower() for name in names.words())  # convert names to lowercase
lemmatizer = WordNetLemmatizer()

# Cleaning loop
data_cleaned = []
for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word) for word in doc.split()
        if is_letter_only(word) and word not in all_names)
    
    data_cleaned.append(doc_cleaned)

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)
data = count_vector.fit_transform(data_cleaned)

In [64]:
#K-means algo
from sklearn.cluster import KMeans
k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(data)

In [65]:
#Cluster size check
clusters = kmeans.labels_
from collections import Counter
print(Counter(clusters))


Counter({2: 3371, 0: 10, 3: 3, 1: 3})


In [66]:
#Wrong separation, implementation of TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
tfdif = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)

In [68]:
data = tfdif.fit_transform(data_cleaned)
kmeans.fit(data)

In [69]:
clusters = kmeans.labels_
print(Counter(clusters))

Counter({1: 1352, 0: 729, 2: 725, 3: 581})


In [72]:
#Top 10 results
import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

terms = tfdif.get_feature_names_out()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print()

cluster_0: 729 samples
alt.atheism: 428 samples
talk.religion.misc: 279 samples
sci.space: 16 samples
comp.graphics: 6 samples

cluster_1: 1352 samples
alt.atheism: 366 samples
sci.space: 356 samples
talk.religion.misc: 348 samples
comp.graphics: 282 samples

cluster_2: 725 samples
comp.graphics: 684 samples
sci.space: 36 samples
alt.atheism: 4 samples
talk.religion.misc: 1 samples

cluster_3: 581 samples
sci.space: 579 samples
alt.atheism: 1 samples
comp.graphics: 1 samples



# üìä Understanding NMF and LAD in Machine Learning

---

## üü° Non-negative Matrix Factorization (NMF)

**Non-negative Matrix Factorization (NMF)** is a dimensionality reduction technique used to uncover latent structures in non-negative data.

It factorizes a non-negative matrix **V** into two smaller non-negative matrices **W** and **H** such that:

> V ‚âà W √ó H

This decomposition helps extract interpretable features where the original data is expressed as an additive combination of underlying components.

### üîç Use Cases:
- **Topic modeling** in text mining
- **Image compression** and reconstruction
- **Recommender systems** (e.g., collaborative filtering)

### üåü Key Traits:
- Produces **sparse and interpretable features**
- Works only on **non-negative data**
- Captures **parts-based** representations (e.g., words in topics)

---

## üîµ Least Absolute Deviations (LAD)

**Least Absolute Deviations (LAD)** is a regression method that minimizes the **sum of absolute errors** rather than squared errors.

Also known as **L1 regression** or **median regression**, LAD is valued for its **robustness to outliers** in the dataset.

### üîç Use Cases:
- **Robust linear regression** under noisy or outlier-heavy data
- **Financial and econometric modeling**
- **Quantile regression** (LAD corresponds to the 50th percentile)

### üåü Key Traits:
- Minimizes **absolute differences**, not squared ones
- More robust to **outliers** than ordinary least squares (OLS)
- Produces a model that‚Äôs more stable under extreme values

---

## ‚öñÔ∏è NMF vs. LAD ‚Äì Side-by-Side Comparison

| Feature               | NMF (Non-negative Matrix Factorization) | LAD (Least Absolute Deviations)    |
|-----------------------|------------------------------------------|-------------------------------------|
| Purpose               | Dimensionality reduction                 | Regression modeling                 |
| Input Requirements    | Non-negative matrix                      | Numerical data (can include negatives) |
| Optimization Target   | Factorization: V ‚âà W √ó H                 | Minimize sum of absolute errors     |
| Robust to Outliers    | No                                       | Yes                                 |
| Common Use Cases      | Topic modeling, Recommenders, Images     | Robust regression, Quantile analysis |
| Output                | Latent features (W, H)                   | Regression coefficients             |
| Interpretation        | Parts-based, interpretable components    | Median-based trend line             |

---

üß† **In Summary**:
- Use **NMF** when you want to extract hidden patterns or reduce dimensionality in **non-negative** datasets like document-word matrices.
- Use **LAD** when you're building a **regression model** and need resistance to outliers or skewed data.



In [73]:
from sklearn.decomposition import NMF
t = 20
nmf = NMF(n_components=t, random_state=42)

In [74]:
data = count_vector.fit_transform(data_cleaned)
nmf.fit(data)

In [75]:
terms = count_vector.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    print('Topic {}'.format(topic_idx))
    print(' '.join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0
available quality program free color version gif file image jpeg
Topic 1
said make gospel fit day psalm people prophecy ha wa
Topic 2
said verse wa father mormon shall unto mcconkie lord god
Topic 3
atmosphere surface kilometer ha wa earth planet moon spacecraft solar
Topic 4
communication new venture service market ha commercial space satellite launch
Topic 5
include available analysis user software ha processing data tool image
Topic 6
package format message server object image mail file send graphic
Topic 7
just people doe atheism religion believe belief religious god atheist
Topic 8
file graphic grass program ha package ftp available image data
Topic 9
science international satellite mission national telescope center shuttle nasa space
Topic 10
general speed material unified larson book universe theory physicist physical
Topic 11
day new year book say time people article did wa
Topic 12
used occurs true form ha ad premise conclusion argument fallacy
Topic 13
want ha doe mak

In [81]:
#LDA
from sklearn.decomposition import LatentDirichletAllocation
t = 5
lda = LatentDirichletAllocation(n_components=t, learning_method='batch', random_state=42)

In [82]:
data = count_vector.fit_transform(data_cleaned)
lda.fit(data)

In [83]:
lda.components_

array([[1.83006293, 0.20000009, 0.20000009, ..., 2.58246241, 2.16678423,
        0.20000025],
       [1.56949557, 0.20000012, 0.20000012, ..., 0.20428194, 0.23321109,
        3.1994672 ],
       [0.20010613, 0.2000001 , 0.2000001 , ..., 6.78960306, 0.20000191,
        0.20004481],
       [0.20033489, 0.20218391, 0.20218391, ..., 0.22364999, 0.20000082,
        0.20000011],
       [0.20000048, 2.19781578, 2.19781578, ..., 0.2000026 , 0.20000196,
        0.20048763]])

In [84]:
terms = count_vector.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print('Topic {}'.format(topic_idx))
    print(' '.join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0
spacecraft solar earth orbit mission launch ha satellite wa space
Topic 1
looking software article like ha wa know graphic computer university
Topic 2
people university shuttle ha think like just article space wa
Topic 3
make know doe just ha think article people god wa
Topic 4
version use color format available graphic program jpeg file image
