In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.cluster import AffinityPropagation
from defs import get_string_distances
import multiprocess as mp
from functools import partial

[INFO] loading EAST text detector...


In [2]:
# Load words.csv and use its `id` column as the row index.
df = pd.read_csv('words.csv', low_memory=False).set_index('id')

In [3]:
# Preview the loaded table (pandas truncates the display of large frames).
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10706426,NOMOS,Cant,,,,,,,,,...,,,,,,,,,,
10863511,LONGINES,Mace,ieee,Swiss,roei,,,,,,...,,,,,,,,,,
11033684,ROLEX,PERPETUAL,ATIVE,CHR,,,,,,,...,,,,,,,,,,
12058471,TIMEGA,VILLE,Pema,,,,,,,,...,,,,,,,,,,
12173580,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20299845,LONGINGES,,,,,,,,,,...,,,,,,,,,,
20299849,BREITLI,fels,,,,,,,,,...,,,,,,,,,,
20299854,LONGINES,,,,,,,,,,...,,,,,,,,,,
20299872,JUN,Wii,PATH,,,,,,,,...,,,,,,,,,,


In [4]:
def get_list_of_words(row, max_words=21):
    """Collect the non-missing entries of a row as upper-cased strings.

    Parameters
    ----------
    row : pandas.Series or sequence
        One record; only its leading ``max_words`` positions are inspected.
    max_words : int or None, default 21
        Number of leading positions to read. ``None`` reads the whole row.

    Returns
    -------
    list of str
        Upper-cased values in their original order, with missing entries
        (NaN / None / pd.NA) skipped.
    """
    # Slice by position: integer keys on a string-labelled Series only work through
    # pandas' deprecated positional fallback, and slicing tolerates short rows.
    values = row.iloc[:max_words] if hasattr(row, "iloc") else row[:max_words]
    # str() guards against non-string cells (e.g. numbers parsed by read_csv),
    # which have no .upper().
    return [str(w).upper() for w in values if pd.notna(w)]

In [5]:
%%time

# Apply get_list_of_words to every row and keep the result in a new `words` column.
word_lists = df.apply(get_list_of_words, axis=1)
df['words'] = word_lists

Wall time: 1.74 s


In [6]:
%%time

# Flatten the per-row word lists into one flat list, preserving order.
# A single-pass comprehension avoids re-copying the accumulated list on every row,
# which is what `words = words + row` did (quadratic in the total word count).
words = [word for row_words in df['words'] for word in row_words]

Wall time: 16 s


In [7]:
# Count the occurrences of every word and keep only those seen more than 4 times.
word_counts = pd.DataFrame(words).value_counts()
words = word_counts[word_counts > 4]

In [8]:
# DataFrame.value_counts() yields a MultiIndex; level 0 holds the words themselves.
frequent_words_index = words.index.get_level_values(0)
words = frequent_words_index.astype(str).values
print(len(words))
print(words)

5612
['SWISS' 'AUTOMATIC' 'ROLEX' ... 'SOMME' 'PRIME' 'ROVE']


In [9]:
# NOTE(review): `words` already comes out of `.values` in the previous cell, so this
# conversion looks like a no-op safeguard — confirm before removing.
words = np.asarray(words)

In [10]:
%%time

# Bind the full vocabulary once; each worker call then only receives one query word.
distance_to_vocabulary = partial(get_string_distances, words=words)
with mp.Pool(6) as pool:
    x = pool.map(distance_to_vocabulary, words)

# One result row per word: relabel the columns as strings and index the rows by word.
df_distances = pd.DataFrame(x)
df_distances['word'] = words
df_distances.columns = df_distances.columns.map(str)
df_distances = df_distances.set_index('word')

Wall time: 45.2 s


In [11]:
# Peek at the first rows of the pairwise matrix (one row and one column per word).
df_distances.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,5602,5603,5604,5605,5606,5607,5608,5609,5610,5611
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SWISS,0,-8,-5,-8,-5,-5,-5,-9,-11,-5,...,-4,-7,-10,-5,-4,-3,-4,-4,-4,-5
AUTOMATIC,-8,0,-8,-5,-9,-7,-8,-7,-7,-6,...,-9,-6,-8,-8,-8,-9,-6,-7,-8,-8
ROLEX,-5,-8,0,-8,-3,-4,-4,-8,-8,-3,...,-3,-6,-9,-5,-3,-5,-4,-3,-3,-2
CERTIFIED,-8,-5,-8,0,-7,-6,-8,-7,-7,-8,...,-7,-8,-8,-9,-9,-9,-6,-8,-6,-8
EEE,-5,-9,-3,-7,0,-2,-2,-10,-9,-3,...,-2,-7,-11,-3,-4,-3,-4,-3,-3,-2


In [12]:
# NOTE(review): both names are already imported in the first cell, so this cell is
# redundant; imports are best kept together at the top of the notebook.
from sklearn.cluster import AffinityPropagation
import numpy as np

In [13]:
%%time

# Cluster the words directly from the precomputed pairwise matrix; random_state is
# fixed so repeated runs give the same clustering.
affprop = AffinityPropagation(
    affinity="precomputed",
    damping=0.5,
    random_state=7,
    verbose=True,
)
affprop.fit(df_distances)

# Print every cluster as "exemplar: sorted unique members".
exemplar_positions = affprop.cluster_centers_indices_
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[exemplar_positions[cluster_id]]
    members = np.unique(words[affprop.labels_ == cluster_id])
    print(" - *%s:* %s" % (exemplar, ", ".join(members)))

Converged after 110 iterations.
 - *SWISS:* AQUIS, EWISS, FWISS, ISWISS, LAWISS, LSWISS, MILUS, OWISS, PARIS, PSWISS, SHISS, SMISS, SSWSS, SUISSE, SWESS, SWICE, SWIDS, SWIS, SWISE, SWISG, SWISLP, SWISS, SWNSS, SWSS, TSWISS
 - *AUTOMATIC:* ANTOMATIC, ATOMATIC, AULOMALIC, AUTAMATIC, AUTOIAATIC, AUTOMATIC, AUTOMATIG, AUTOMATIK, AUTOMATION, BRUTOMATIC, BUTOMATIC, KUTOMATIC, LUTOMATIC, MUTOMATIC, NUTOMATIC, PUTOMATIC, RUTOMATIC, SUTOMATIC, WUTOMATIC
 - *ROLEX:* IOLEX, MOLEX, OLEX, OLEYX, POLEX, ROIEX, ROLED, ROLEN, ROLEW, ROLEX, ROLEY, ROLEYX, ROTEX, RPOLEX, SOLEX
 - *CERTIFIED:* CEOTIFIED, CERTIFICATE, CERTIFIE, CERTIFIED, CERTIFIEL, CERTIFIEN, ERTIFIED, LYCERTIFIED, PEDTIEIEN, PERTIFIED, SERTIFIED, YCERTIFIED
 - *EEE:* ABE, AOE, AWE, AYE, BAE, BBE, BIE, BLE, BOE, BPE, BSE, BYE, CBE, CIE, CLE, CPE, CRE, CTE, CUE, DEE, DHE, DOE, DRE, DUE, DWE, EEA, EEE, EEN, EERIE, EES, EEX, EOE, ESE, ETE, EVE, EWE, EXE, FEE, FUE, GEE, GFE, GUE, GWE, HEE, HFE, HOE, HTE, HUE, HVE, HYE, IBE, IDE, IHE, IJE, IK

In [16]:
# Sanity check: there should be one cluster label per word, i.e. len(words).
len(affprop.labels_)

5612

In [22]:
# Lookup table: each word (index) mapped to the id of the cluster it was assigned to.
clusters = pd.DataFrame({'word': words, 'cluster': affprop.labels_}).set_index('word')
clusters

Unnamed: 0_level_0,cluster
word,Unnamed: 1_level_1
SWISS,0
AUTOMATIC,1
ROLEX,2
CERTIFIED,3
EEE,4
...,...
JSS,37
ETUAI,13
SOMME,290
PRIME,58


In [23]:
# Show the last rows of df, which now carry the derived `words` column.
df.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,147,148,149,150,151,152,153,154,155,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20299845,LONGINGES,,,,,,,,,,...,,,,,,,,,,[LONGINGES]
20299849,BREITLI,fels,,,,,,,,,...,,,,,,,,,,"[BREITLI, FELS]"
20299854,LONGINES,,,,,,,,,,...,,,,,,,,,,[LONGINES]
20299872,JUN,Wii,PATH,,,,,,,,...,,,,,,,,,,"[JUN, WII, PATH]"
d9113598,Lvales,,,,,,,,,,...,,,,,,,,,,[LVALES]


In [61]:
n_clusters = len(np.unique(affprop.labels_))
# Snapshot the word -> cluster-id mapping as a plain dict. The function then no longer
# depends on the global name `clusters` (which a later cell rebinds), and the lookup
# avoids both a per-word `.loc` call and the deprecated positional `[0]` access.
_word_to_cluster = clusters['cluster'].to_dict()


# NOTE(review): a later cell redefines get_word_clusters with a variant that returns a
# NumPy array; whichever definition ran last is the one in effect.
def get_word_clusters(words):
    """Return a 0/1 indicator list marking which clusters the given words fall into.

    Parameters
    ----------
    words : iterable of str
        Words belonging to one record.

    Returns
    -------
    list of int
        List of length ``n_clusters``; position ``c`` is 1 when at least one of the
        words belongs to cluster ``c``. Words missing from the lookup are ignored.
    """
    row = [0] * n_clusters
    for word in words:
        # Unknown words are expected (rare words were filtered out earlier), so they
        # are skipped explicitly instead of being hidden behind a bare `except`.
        cluster_id = _word_to_cluster.get(word)
        if cluster_id is not None:
            row[cluster_id] = 1
    return row

In [89]:
n_clusters = len(np.unique(affprop.labels_))
# Snapshot the word -> cluster-id mapping as a plain dict. The function then no longer
# depends on the global name `clusters` (which a later cell rebinds), and the lookup
# avoids both a per-word `.loc` call and the deprecated positional `[0]` access.
_word_to_cluster = clusters['cluster'].to_dict()


def get_word_clusters(words):
    """Return a 0/1 indicator vector marking which clusters the given words fall into.

    Parameters
    ----------
    words : iterable of str
        Words belonging to one record.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_clusters``; position ``c`` is 1 when at least one
        of the words belongs to cluster ``c``. Words missing from the lookup are ignored.
    """
    row = np.zeros(n_clusters, dtype=int)
    for word in words:
        # Unknown words are expected (rare words were filtered out earlier), so they
        # are skipped explicitly instead of being hidden behind a bare `except`.
        cluster_id = _word_to_cluster.get(word)
        if cluster_id is not None:
            row[cluster_id] = 1
    return row

In [107]:
%%time

# NOTE(review): this rebinds `clusters`. Up to here the name held the word -> cluster
# lookup frame; after this line it is a Series of per-record indicator vectors indexed
# like `df`. Anything that still needs the lookup table must not rely on the name
# `clusters` from here on (re-running cells out of order is the main hazard).
clusters = df['words'].apply(get_word_clusters)

Wall time: 8.12 s


In [109]:
# Stack the per-record indicator vectors into one 2-D matrix (one row per record),
# keeping the record ids from `clusters` as the index.
# np.vstack replaces np.row_stack, which is a deprecated alias of the same function.
df_clusters = pd.DataFrame(np.vstack(clusters.to_list()), index=clusters.index)

In [110]:
# Preview the indicator matrix: one row per record id, one column per cluster.
df_clusters

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,379,380,381,382,383,384,385,386,387,388
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10706426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10863511,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11033684,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12058471,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12173580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20299845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20299849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20299854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20299872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
# Write the indicator matrix to disk (the index is written too, pandas' default).
df_clusters.to_csv('word_clusters.csv')