In [22]:
import pandas as pd

csv_in  = "/home/carl/spark/examples/carl_Spark/data/mllib/twtr15053001.csv"
twts_df01 = pd.read_csv(csv_in, sep=',', encoding='utf-8')


In [23]:
twts_df01.count()

id            2520
created_at    2520
user_id       2520
user_name     2520
tweet_text    2520
url           2520
dtype: int64

In [24]:
twtstxt_ls02_utf8 = twts_df01['tweet_text']
twtstxt_ls02_utf8[7000:7010]

Series([], Name: tweet_text, dtype: object)

In [25]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=20000, min_df=2, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(twtstxt_ls02_utf8)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" %  X.shape)
print(X.shape)

done in 0.267964s
n_samples: 2520, n_features: 1960
(2520, 1960)


In [26]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=7, init='k-means++', max_iter=100, n_init=1, verbose=1)
print("Clustering sparse data with %s " % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=7, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1) 
Initialization complete
Iteration  0, inertia 3913.361
Iteration  1, inertia 2258.759
Iteration  2, inertia 2225.844
Iteration  3, inertia 2103.791
Iteration  4, inertia 2051.639
Iteration  5, inertia 2012.006
Iteration  6, inertia 2007.627
Converged at iteration 6
done in 0.162s


In [27]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(7):
    print("Cluster%d:" % i)
    for ind in order_centroids[i, :20]:
        print('%s' % terms[ind])
    print()

Top terms per cluster:
Cluster0:
makes
emtmac0kgq
competition
sense
maybe
chart
receiver
depth
easier
async
kerwinfranks
coroutines
rzkbwm6ebf
really
thanks
spark
rt
python
fortune
fun
()
Cluster1:
baybryj
vply6hu0yo
whee
released
worked
6b1
thanks
gvanrossum
rt
python
forexnews
format
fortune
fp2txhfj85
forums
ford
foot
forward
founder
foxnews
()
Cluster2:
bad
life
fp2txhfj85
bright
これを聴き始めたら雨あがった
と同時に木村八段登場
sing
equal
w87s32ra7s
promises
does
look
youtube
day
monty
python
foxnews
forward
forums
fortune
()
Cluster3:
check
deserved
lj3dkeeoph
model
ycvgwdxqnq
vivgcme5wi
visit
falbala
eventbrite
techsquare
ydt8py5ldu
labs
django
bootcamp
gt
iama_programmer
amjoxggt5y
timestampable
mixin
simple
()
Cluster4:
python
spark
rt
plug
clinton
hillary
amp
rare
programming
jobs
tractor
bulb
old
farm
auto
hit
glass
antique
piggy
miss
()
Cluster5:
php
la
javascript
python
technews
programmers
wonxp3xdwr
happyprogrammersday
js
code
java
en
programming
lyft
pink
paint
paid
famous
stick
sf
()
Cluster6

In [28]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS

MDS()

MDS(dissimilarity='euclidean', eps=0.001, max_iter=300, metric=True,
  n_components=2, n_init=4, n_jobs=1, random_state=None, verbose=0)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(X)
print(dist)

[[  0.00000000e+00   9.87334457e-01   9.92303918e-01 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  9.87334457e-01   0.00000000e+00   9.87704035e-01 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  9.92303918e-01   9.87704035e-01   0.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
   -2.22044605e-16   7.81682858e-01]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
   -2.22044605e-16   7.81682858e-01]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   7.81682858e-01
    7.81682858e-01  -2.22044605e-16]]


In [30]:
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]
# print(xs)
# print(ys)

In [31]:
print(xs)
print(ys)

[-0.02236058  0.48471498  0.74806128 ...,  0.26753381  0.14221634
  0.15251596]
[ 0.66449042  0.45452639 -0.21821199 ..., -0.08043852  0.12945218  0.07012   ]


In [43]:

cluster_colors = { 0: '#1b9e77', 1:'#d95f02', 2:'#7570b3', 3:'#e7298a', 4:'#66a61e', 5:'#9990b3', 6:'e8888a' }
cluster_names = { 0: 'lady gaga', 1:'python', 2:'iphone', 3:'china', 4:'youtube', 5:'spark', 6:'justinbieber' }

clusters = km.labels_.tolist()
print(len(clusters))
print(len(twtstxt_ls02_utf8))

df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, txt=twtstxt_ls02_utf8))
ix_start = 2000
ix_stop = 2050
df01 = df[ix_start:ix_stop]

print(df01[['label', 'txt']])
print(len(df01))
print()

2520
2520
      label                                                txt
2000      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2001      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2002      4  Single quotes vs. double quotes in Python #pyt...
2003      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2004      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2005      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2006      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2007      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2008      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2009      1  RT @gvanrossum: Whee!! Python 3.6b1 is now REL...
2010      4  Programming Python by Mark Lutz https://t.co/5...
2011      1  Whee!! Python 3.6b1 is now RELEASED: https://t...
2012      2  これを聴き始めたら雨あがった、と同時に木村八段登場!! Always Look on the...
2013      2  Bad day does not equal bad life. Promises http...
2014      5  Again: #HappyProgrammersDay ⌨💻🤓\

In [None]:

groups = df.groupby('label')
groups01 = df01.groupby('label')

fig, ax = plt.subplots(figsize=(17, 10))
ax.margins(0.05)


for name, group in groups01:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
    ax.tick_params(axis='y', which='both', left='off', top='off', labelleft='off')
    
ax.legend(numpoints=1)

for i in range(ix_start, ix_stop):
    ax.text(df01.ix[i]['x'], df01.ix[i]['y'], df01.ix[i]['txt'], size=10)

plt.show()