In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

[fetch_20newsgroups data](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html)

In [None]:
# Three broad themes, two newsgroups each: computers, sports and religion.
categories = [
    # computers
    'comp.graphics', 'comp.os.ms-windows.misc',
    # sports
    'rec.sport.baseball', 'rec.sport.hockey',
    # religion
    'alt.atheism', 'soc.religion.christian',
]

In [None]:
# Load the training split restricted to the selected newsgroups; the `remove`
# argument asks for headers, footers and quoted replies to be stripped.
dataset = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

In [None]:
# Display the object returned by fetch_20newsgroups.
dataset

In [None]:
# One row per document, with the raw text in a single "corpus" column.
df = pd.DataFrame({"corpus": dataset.data})

In [None]:
# Preview the first rows of the raw corpus.
df.head()

In [None]:
# Show the values of the second row (position 1) as a raw array.
df.iloc[1].values

In [None]:
# (number of documents, number of columns)
df.shape

### Preprocessing

In [None]:
# Libraries for working with text
import re
import string
import nltk
from nltk.corpus import stopwords

In [None]:
from nltk.corpus import stopwords
# Peek at the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

In [None]:
# Size of NLTK's English stop-word list.
len(stopwords.words('english'))

In [None]:
# The punctuation characters defined by the standard `string` module.
string.punctuation

In [None]:
# Built once per definition of this cell. The previous implementation re-read
# the stop-word list for every single token and searched it as a list, which
# dominated the running time on a corpus of this size.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=ENGLISH_STOP_WORDS):
    """Normalize a raw document into a lowercase, space-separated token string.

    Steps, in order:
      1. Replace every run of characters other than ASCII letters with a
         single space (this drops digits, punctuation and other symbols).
      2. Tokenize the result with ``nltk.word_tokenize``.
      3. Discard tokens whose lowercase form is in ``stop_words``.
      4. Join the surviving tokens with single spaces and lowercase the text.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Lowercase words to discard. Defaults to NLTK's English stop-word
        list, stored as a frozenset for constant-time membership tests.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    tokens = nltk.word_tokenize(letters_only)
    # No separate punctuation check is needed: after the substitution above
    # the text consists solely of letters and spaces.
    kept = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(kept).lower().strip()

In [None]:
# Clean every document; the function is passed directly, no wrapper needed.
df['cleaned_corpus'] = df['corpus'].apply(preprocess_text)

In [None]:
# Display the frame, now holding both the raw and the cleaned text.
df

In [None]:
# Show the values of the first row (raw and cleaned text) as a raw array.
df.iloc[0].values

## TF-IDF

<img src = 'tfidf.png'>

**TF-IDF (Term Frequency-Inverse Document Frequency)** je statisticka mera koja govori koliko je svaka rec bitna u korpusu tako sto svakoj reci dodeljuje numericku tezinu. 

In [None]:
# TF-IDF vectorizer. Per the scikit-learn documentation, `sublinear_tf=True`
# applies logarithmic term-frequency scaling and `max_df=0.95` ignores terms
# that occur in more than 95% of the documents.
tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.95)

In [None]:
# Fit the vectorizer on the cleaned texts and transform them into the TF-IDF matrix.
X = tf_idf_vectorizer.fit_transform(df['cleaned_corpus'])

In [None]:
X.shape
# Every word gets its own weight within a text, i.e. each text is assigned a
# vector whose length equals the number of words in the vocabulary.

In [None]:
# Flatten the NLTK tokens of every cleaned document into one list.
vocabulary = [
    token
    for document in df['cleaned_corpus']
    for token in nltk.word_tokenize(document)
]

In [None]:
# Number of distinct tokens; slightly more than X.shape[1] because the
# vectorizer filtered some terms out via max_df.
# NOTE(review): TfidfVectorizer's default token pattern also skips
# single-character tokens, which likely accounts for part of the gap — confirm.
len(np.unique(vocabulary))

In [None]:
# X is stored as a sparse matrix.
X

In [None]:
# Dense view of the same matrix (materializes every entry, zeros included).
X.toarray()

### K-means

In [None]:
# Candidate numbers of clusters to try: 1 through 7.
Ks = range(1, 8) 

In [None]:
# One unfitted KMeans model per candidate K, all sharing the same fixed seed.
models = [KMeans(n_clusters=i, random_state=23) for i in Ks]

In [None]:
# Fit each model on X and record its score on the same data. Per the
# scikit-learn documentation, KMeans.score is the negated K-means objective,
# so values closer to zero indicate a tighter clustering.
scores = [model.fit(X).score(X) for model in models]

In [None]:
# Elbow plot of the K-means score against the number of clusters. Markers make
# the individual K values visible; title and axis labels let the figure stand
# on its own when the notebook is skimmed.
plt.plot(Ks, scores, marker='o')
plt.title('Elbow metoda: K-means skor za razlicite vrednosti K')
plt.xlabel('Broj klastera K')
plt.ylabel('Skor (negativna inercija)')
plt.show()

In [None]:
# Final model with four clusters. The seed is fixed (same value as the models
# used for the elbow plot) so that cluster labels, the keyword lists and the
# scatter plot below are reproducible across kernel restarts.
kmeans = KMeans(n_clusters=4, random_state=23)
kmeans.fit(X)

In [None]:
# Cluster label assigned to each document by the fitted model.
clusters = kmeans.labels_ 
clusters

## PCA 

**PCA (Principal Component Analysis)** je tehnika redukcije dimenzije tako da se sto bolje ocuva informacija koju podaci nose. Podaci su predstavljeni glavnim komponentama u novom koordinatnom sistemu dobijenom ortogonalnom transformacijom. Prva glavna komponenta je pravac u prostoru na kome projekcije tacaka podataka imaju najvecu varijansu. U zavisnosti od strukture podataka, mozemo redukovati dimenziju na samo nekoliko glavnih komponenti.

<img src='pca.jpeg' width=600>

In [None]:
# PCA with two principal components
pca = PCA(n_components=2, random_state=42)

In [None]:
# Project the densified TF-IDF matrix onto the two principal components;
# each row of the transposed result is one coordinate axis.
X_pca = pca.fit_transform(X.toarray())
x0, x1 = X_pca.T

In [None]:
# Attach the cluster label and the two PCA coordinates to every document.
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1

In [None]:
# Preview the frame with the new cluster and coordinate columns.
df.head()

In [None]:
# (number of documents, number of principal components)
X_pca.shape

In [None]:
def get_top_keywords(n_terms, tfidf_matrix=None, labels=None, vectorizer=None):
    """Print the highest-weighted TF-IDF terms of every cluster.

    For each cluster the TF-IDF rows of its documents are averaged, and the
    ``n_terms`` terms with the largest mean weight are printed on one
    comma-separated line, in ascending order of weight (strongest term last).

    Parameters
    ----------
    n_terms : int
        Number of terms to print per cluster. Values below 1 print no terms;
        values above the vocabulary size print the whole vocabulary.
    tfidf_matrix : sparse or dense matrix, optional
        Document-term weights, one row per document. Defaults to the
        notebook-level ``X``.
    labels : array-like of int, optional
        Cluster label of every document. Defaults to the notebook-level
        ``clusters``.
    vectorizer : optional
        Object exposing ``get_feature_names_out()``. Defaults to the
        notebook-level ``tf_idf_vectorizer``.

    Returns
    -------
    None
        The result is printed.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if tfidf_matrix is None:
        tfidf_matrix = X
    if labels is None:
        labels = clusters
    if vectorizer is None:
        vectorizer = tf_idf_vectorizer

    labels = np.asarray(labels)
    terms = vectorizer.get_feature_names_out()
    # Clamp explicitly: a plain `[-n_terms:]` slice would select *every* term
    # when n_terms is 0.
    n_keep = max(0, min(int(n_terms), len(terms)))

    for label in np.unique(labels):  # np.unique returns the labels sorted
        # Average only this cluster's rows, so the full matrix is never
        # converted to a dense array.
        cluster_rows = tfidf_matrix[labels == label]
        mean_weights = np.asarray(cluster_rows.mean(axis=0)).ravel()
        ranking = np.argsort(mean_weights)
        top = ranking[ranking.size - n_keep:]
        print('\nCluster {}'.format(label))
        print(','.join(terms[t] for t in top))

In [None]:
# The 10 most important words in each cluster.
get_top_keywords(10)

In [None]:
# Scatter the documents in the two-dimensional PCA plane, coloured by cluster.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title('PCA rezultat K-means klasterovanja TF-IDF reprezentacije teksta')
ax.set_xlabel('x0')
ax.set_ylabel('x1')
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis", ax=ax)
plt.show()