In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import nltk
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set()
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

In [2]:
# Tokens to discard during cleaning: NLTK's English stop words followed by every
# individual character of string.printable (so single-character tokens are dropped).
stop_words = [*stopwords.words('english'), *string.printable]

# Single lemmatizer instance shared by the text-cleaning step.
lemmatizer = WordNetLemmatizer()

In [3]:
# Newsgroup names passed to fetch_20newsgroups to restrict the corpus.
categories = [
    'misc.forsale',
    'sci.electronics',
    'talk.religion.misc',
]

In [4]:
# Load the training split for the selected newsgroups (downloading it if it is
# not already available); the fixed random_state keeps the shuffle repeatable.
news_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    download_if_missing=True,
)

# One row per post: the raw text alongside its target label.
news_data_df = pd.DataFrame({
    'text': news_data.data,
    'category': news_data.target,
})

In [5]:
# Set copy of the stop-word list: membership is tested once per token, and a set
# lookup is O(1) where the list lookup was O(len(stop_words)).
stop_word_set = set(stop_words)


def clean_text(raw_text):
    """Turn one post into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Replace every run of non-word, non-whitespace characters (and
         underscores) with a single space.
      2. Tokenize the result with NLTK's ``word_tokenize``.
      3. Lowercase each token and drop those found in ``stop_word_set``.
      4. Lemmatize the surviving tokens and join them with single spaces.

    Parameters
    ----------
    raw_text : object
        The post body; it is passed through ``str()`` before processing.

    Returns
    -------
    str
        The cleaned text (empty string when no token survives).
    """
    without_punctuation = re.sub(r'([^\s\w]|_)+', ' ', str(raw_text))
    lowered_tokens = (token.lower() for token in word_tokenize(without_punctuation))
    return ' '.join(lemmatizer.lemmatize(token)
                    for token in lowered_tokens
                    if token not in stop_word_set)


news_data_df['cleaned_text'] = news_data_df['text'].apply(clean_text)

In [6]:
# Fit TF-IDF on the cleaned posts; the result is a sparse (documents x terms) matrix.
tfidf_model = TfidfVectorizer()
tfidf_matrix = tfidf_model.fit_transform(news_data_df['cleaned_text'])

# vocabulary_ maps each term to its column index, so ordering the terms by that
# index labels every column with its own term without relying on alphabetical
# order coinciding with the vectorizer's internal ordering.
vocabulary = tfidf_model.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)

# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df.head()

Unnamed: 0,00,000,0000vec,0001,0004,000k,001,0010,001428,0028,...,zucchini,zuck,zuiko,zumbo,zumdahl,zurvanism,zxmkr08,³ation,ºnd,ýé
0,0.171723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.08358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Cluster the TF-IDF vectors into 4 groups. The fixed random_state makes the
# centroid initialisation, and therefore the cluster numbering used by later
# cells, reproducible across kernel restarts.
kmeans = KMeans(n_clusters=4, random_state=42)

# fit_predict fits the model and returns the cluster index of every training row,
# equivalent to fit() followed by predict() on the same data.
y_kmeans = kmeans.fit_predict(tfidf_df)
news_data_df['obtained_clusters'] = y_kmeans

In [8]:
# Readable labels derived from the fitted objects rather than hard-coded, so they
# stay correct if the category list or the number of clusters changes.
# target_names[i] is the newsgroup name for integer target i.
category_names = dict(enumerate(news_data.target_names))
# Cluster indices are 0-based; display them as cluster_1 .. cluster_k.
cluster_names = {idx: 'cluster_{}'.format(idx + 1) for idx in range(kmeans.n_clusters)}

# Contingency table: true newsgroup (rows) versus assigned cluster (columns).
pd.crosstab(news_data_df['category'].replace(category_names),
            news_data_df['obtained_clusters'].replace(cluster_names))

obtained_clusters,cluster_1,cluster_2,cluster_3,cluster_4
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
misc.forsale,26,30,527,2
sci.electronics,12,456,123,0
talk.religion.misc,0,16,31,330


In [None]:
# Elbow method: for each candidate k, record the mean Euclidean distance from
# every document to its nearest cluster centre (the "distortion").
distortions = []
K = range(1, 6)
for k in K:
    # Fixed seed so the distortion curve is reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(tfidf_df)
    # The fitted centres live in `cluster_centers_`; the previous spelling
    # `clusters_centers_` does not exist and raised AttributeError.
    distance_to_centres = cdist(tfidf_df, kmeanModel.cluster_centers_, 'euclidean')
    # Row-wise minimum = distance to the closest centre; its mean over all rows
    # equals the former sum(...) / n_rows.
    distortions.append(np.min(distance_to_centres, axis=1).mean())