# Fonction de lecture de cluster en pkl

In [1]:
import pickle

from collections import defaultdict
from pprint import pprint as pp
from itertools import zip_longest

from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.metrics import mean_squared_error, silhouette_score
import numpy as np

In [2]:
def read_vector_file(path_to_file):
    """Read a text file of keyed vectors into two parallel lists.

    Each non-blank line holds a key followed by the vector components,
    all separated by single spaces.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        A ``(keys, vectors)`` tuple where ``keys[i]`` is the first field of
        the i-th non-blank line and ``vectors[i]`` is the list of floats
        parsed from the remaining fields.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    keys = []
    vectors = []
    with open(path_to_file) as file:
        for line in file:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally corrupts a final line that has no newline.
            line = line.rstrip("\n")
            if not line:
                # Blank lines carry no key and no vector; skip them.
                continue
            web, *components = line.split(" ")
            keys.append(web)
            vectors.append([float(s) for s in components])
    return (keys, vectors)

In [3]:
def do_someting(websites, vectors, classed_file, verbose=False):
    """Keep the sites that appear in a category file and attach a class to each.

    The category file holds one ``category<TAB>site`` pair per line. The
    class of a site is the fourth ``/``-separated component of its category
    path. Only the first occurrence of a site in the file is used, and
    sites whose class is ``'Régional'`` are dropped.

    Args:
        websites: List of site keys, parallel to ``vectors``.
        vectors: List of vectors, ``vectors[i]`` belonging to ``websites[i]``.
        classed_file: Path of the tab-separated category file.
        verbose: When true, pretty-print the number of kept sites per class.

    Returns:
        A ``(site2ind_class, ind2site, good_vecs)`` tuple:
        ``site2ind_class`` maps a kept site to ``(index, class)``,
        ``ind2site`` lists the kept sites by index, and ``good_vecs`` lists
        their vectors in the same order.

    Raises:
        ValueError: If a non-blank line does not hold exactly one tab.
        IndexError: If a category path has fewer than four components.
    """
    # Map each site to its first position once, instead of calling
    # websites.index() for every kept site (quadratic on large inputs).
    # setdefault keeps the first occurrence, exactly like list.index.
    position = {}
    for idx, site in enumerate(websites):
        position.setdefault(site, idx)

    site2ind_class = {}
    ind2site = []
    good_vecs = []
    count = defaultdict(int)
    with open(classed_file) as file:
        for line in file:
            # Strip only the terminator so a last line without a newline
            # keeps the final character of its URL.
            line = line.rstrip("\n")
            if not line:
                continue
            cat, site = line.split("\t")
            if site not in position:
                continue
            cat = cat.split("/")[3]
            if site in site2ind_class or cat == 'Régional':
                continue
            # The next free index is the number of sites kept so far.
            site2ind_class[site] = (len(ind2site), cat)
            count[cat] += 1
            ind2site.append(site)
            good_vecs.append(vectors[position[site]])
    if verbose:
        # `pp` is the pprint function itself, not the pprint module.
        pp(count)
    return site2ind_class, ind2site, good_vecs

In [4]:
def label_to_clusters(labels):
    """Group element positions by cluster label.

    Args:
        labels: Iterable whose i-th item is the cluster label of element i.

    Returns:
        A ``defaultdict(set)`` mapping each label to the set of positions
        that carry it.
    """
    members_by_label = defaultdict(set)
    for position, label in enumerate(labels):
        members = members_by_label[label]
        members.add(position)
    return members_by_label

### Méthodes de mesure

In [5]:
def generate_centroids(vectors, labels, clusters):
    """Compute the centroid (component-wise mean) of every cluster.

    Args:
        vectors: Indexable collection of numeric vectors.
        labels: Not used by the computation; kept so existing callers that
            pass it positionally keep working.
        clusters: Mapping from cluster id to an iterable of indices into
            ``vectors``.

    Returns:
        A dict mapping each cluster id to the mean of its member vectors,
        as a numpy array.

    Raises:
        ZeroDivisionError: If a cluster has no members.
    """
    centroids = {}
    for cluster_id, members in clusters.items():
        member_vecs = [np.array(vectors[idx]) for idx in members]
        # The built-in sum adds the arrays element-wise.
        centroids[cluster_id] = sum(member_vecs) / len(member_vecs)
    return centroids

def mse(vectors, labels, clusters):
    """Score each cluster by how far its members are from its centroid.

    For every cluster, the cosine similarity between each member vector and
    the cluster centroid is computed, and the mean squared error between
    those similarities and 1 is stored.

    Args:
        vectors: Indexable collection of numeric vectors.
        labels: Forwarded unchanged to ``generate_centroids``.
        clusters: Mapping from cluster id to an iterable of indices into
            ``vectors``.

    Returns:
        A ``defaultdict(int)`` mapping each cluster id to its error.
    """
    centroids = generate_centroids(vectors, labels, clusters)
    print("Centroids done")
    errors = defaultdict(int)
    for cluster_id, members in clusters.items():
        member_vecs = [np.array(vectors[idx]) for idx in members]
        size = len(member_vecs)
        # cos_sim returns one row per member and a single column (the
        # centroid); flatten it to one similarity per member.
        similarities = cos_sim(member_vecs, [centroids[cluster_id]]).reshape(size)
        errors[cluster_id] = mean_squared_error(similarities, [1] * size)
    return errors

In [6]:
def label_similarity(clusters, ind2site, site2ind_class):
    """Find the dominant class label of every cluster.

    Args:
        clusters: Mapping from cluster id to an iterable of element indices.
        ind2site: Sequence giving the site of each element index.
        site2ind_class: Mapping from site to a pair whose second item is the
            class label of that site.

    Returns:
        A dict mapping each cluster id to ``(label, proportion)``, where
        ``label`` is the most frequent class in the cluster and
        ``proportion`` is the share of the cluster's elements carrying it.
    """
    dominant = {}
    for cluster_id, members in clusters.items():
        tally = {}
        for element in members:
            label = site2ind_class[ind2site[element]][1]
            tally[label] = tally.get(label, 0) + 1
        # Ascending stable sort: the winner is the last entry, so among
        # equally frequent labels the most recently first-seen one is kept.
        ranked = sorted(tally.items(), key=lambda pair: pair[1])
        best_label, best_count = ranked[-1]
        total = sum(occurrences for _, occurrences in ranked)
        dominant[cluster_id] = (best_label, best_count / total)
    return dominant

### Zone de test et d'exécution

In [62]:
# Driver cell: load the vectors and the precomputed cluster labels, score
# every cluster, then print each cluster whose size is not exactly 1.
#### VAR ZONE ####
# Input paths: the vector file, the tab-separated category file, and the
# pickled cluster labels.
file_path = "fr.up.seeds.txt.shuf.10000" 
classed_file = "dmozFull.fr"
labels_path = "clus.pkl"
## END VAR ZONE ##

#### EXEC ZONE ####
websites, vectors = read_vector_file(file_path)
site2ind_class, ind2site, good_vecs = do_someting(websites, vectors, classed_file)
# NOTE(review): pickle.load can execute arbitrary code; only load label files
# that come from a trusted source.
with open(labels_path, 'rb') as stream:
    labels = pickle.load(stream)
# labels is indexed like good_vecs: both are passed together to
# silhouette_score and mse below.
clusters = label_to_clusters(labels)

lab_sim = label_similarity(clusters, ind2site, site2ind_class)
print("Start")

# silh is computed here but not used anywhere else in this cell.
silh = silhouette_score(good_vecs, labels)
print("Silh done")
mse_dic = mse(good_vecs, labels, clusters)
print("MSE done")
## END EXEC ZONE ##
# Gather, per cluster id, its member sites, its MSE and its dominant label.
final = {}
for ind, clus in clusters.items() :
    final[ind] = {"cluster" : {ind2site[i] for i in clus}, "MSE" : mse_dic[ind],
                  "Top_label" : lab_sim[ind][0], "Label_proportion" : lab_sim[ind][1]}
# k counts the clusters that get printed.
k = 0
for i in final :
    # len(...) - 1 is falsy only when the cluster holds exactly one site, so
    # singleton clusters are the ones skipped.
    if len(final[i]['cluster'])-1:
        k+=1
        print(i, final[i]['MSE'])
        for site in final[i]['cluster'] :
            print(site)
print(k)

Start
Silh done
Centroids done
MSE done
25 0.07239475760166512
http://www.esculape.com/fmc/meniere.html
http://psea.free.fr/gore/gore.htm
http://www.blue.fr/largo/
http://gene.guillot.free.fr/index.html
http://formation.conseil.free.fr/1907/
http://www.chronophage.com/
http://www.esculape.com/hepatogastro/hepatitec1999.htm
http://verne.jules.free.fr/
http://www.toile.org/psi/index.html
http://www.poesie.net/breton.htm
http://terredefoot.free.fr/
http://boul1.chez.com/jodyville.htm
http://www.redpsy.com/infopsy/anxiete.html
http://www.dvdanime.net/articleview.php?id=84
http://pages.videotron.com/vasgrav/meteo/
http://garstud.free.fr/
http://www.redpsy.com/guide/index.html
http://netelys.free.fr/
http://joleguen.free.fr/
http://users.skynet.be/aero-space/
http://trucsmaths.free.fr/nombre_d_or.htm
http://hitchcock.alienor.fr/
http://pythagore.team.free.fr/
http://www.enfer.com/
http://leconsdechecspourdebutants.com/
http://www.anti-rev.org/textes/Levinas34a/body.html
http://www.aero65.fr/

In [63]:
# Rank the clusters that have more than 5 sites and an MSE above 0.01.
# The (MSE, cluster) tuples sort in ascending order with the MSE compared
# first, so top_score[0] has the lowest MSE and top_score[-1] the highest.
# NOTE(review): on an exact MSE tie the second items (sets) get compared,
# which is a subset test rather than a total order - confirm ties cannot matter.
top_score = sorted([(value['MSE'],value['cluster']) for value in final.values()
                    if len(value['cluster'])>5 and value['MSE'] > 0.01  ])
print(len(top_score))
# Highest-MSE cluster: its score, then its sites.
# Indexing raises IndexError when no cluster passes the filter.
print(top_score[-1][0])
for site in top_score[-1][1] :
    print(site)
# Lowest-MSE cluster: its score, then its sites.
print(top_score[0][0])
for site in top_score[0][1] :
    print(site)

62
0.14410131588341704
http://www.yogalite.fr/
http://www.yogapourmieuxetre.fr/
http://www.guerisseur-traulle.com/
http://www.yogaetcetera.org/
http://www.chu-rouen.fr/page/epicondylite
http://ecolumen.free.fr/
http://www.taodiet.fr/
http://tai-chi-gong.org/
http://www.le-monde-du-guerisseur.com/
http://www.cplf.fr/
http://www.ecole-ahimsa.com/
http://www.beaute-massage-absoluzen.com/
http://massage.ayurvedique.free.fr/
http://kyste-de-tarlov.forumpro.fr/
http://www.chiropratiqueraymond.com/
http://www.massotherapierepentigny.com/
http://www.robertforcier.com/
http://www.chirurgie-du-pied.net/
http://www.kinessonne.com/
http://www.triniyoga.fr/
http://www.kinesiologie.fr/
http://zen-nice.org/
http://www.sozenacupuncture.fr/
http://www.imagup.com/
http://www.cote-basque-plongee.fr/
http://www.yogavar.fr/
http://larbredeleveil.org/zendos/activites/
http://www.yoga-perpignan.fr/
http://www.denshinji.fr/
http://www.chiropratiquegiroux.com/
http://lavoiepatanjali.canalblog.com/
http://www.j

In [54]:
# Inspect cluster 140 in detail: list its sites, print each member's cosine
# similarity to the centroid, print the full pairwise similarity matrix with
# row sums, and finish with the cluster's MSE against a perfect similarity of 1.
final[140]
# Pair every site of the cluster with its vector; site2ind_class[site][0] is
# the site's index into good_vecs.
list_vec = [(np.asarray(good_vecs[site2ind_class[site][0]]), site) for site in final[140]["cluster"]]
for site in [t[1] for t in list_vec] :
    print(site)
list_vec = [t[0] for t in list_vec]
# Divide by the actual number of members: a hard-coded 30 is only correct
# for a cluster that happens to contain exactly 30 sites.
centroid = sum(list_vec)/len(list_vec)
print([ float("{:.2f}".format(num)) for num in np.concatenate(cos_sim(list_vec, [centroid]))])
# One row per member: its similarity to every member, followed by the row sum.
for line in (cos_sim(list_vec, list_vec)) : 
    s = 0
    for col in line : 
        print("{:.1f}".format(col), end = " ")
        s +=col
    print(s)
    print("")
print()
# The target vector must have one entry per member, so size it from the data.
print(mean_squared_error(np.concatenate(cos_sim(list_vec, [centroid])), [1]*len(list_vec)))


http://www.securite.org/
http://www.auneo.com/
http://sebsauvage.net/comprendre/ssl/
http://www.50dh.net/
https://fr.funio.com/
http://www.niloo.fr/
http://www.atoosys.com/
http://www.clever.fr/
http://www.ql3d.fr/
https://www.securiteinfo.com/
http://www.adresseip.com/
http://www.dn-computing.com/Quick3270_fr.htm
http://www.hebergementweb.org/
http://www.e-mengine.com/
http://www.ipeos.com/
http://www.bitdefender.fr/
http://www.sonilog.com/
http://www.paratrooper-museum.org/
http://www.leadinfo.fr/
http://www.legrain.fr/
http://www.squallnetwork.net/
http://www.ecis.net/
http://www.adircof.asso.fr/
http://telio.free.fr/
http://www.itrust.fr/
http://www.eurowh.com/
http://www.dinhosting.fr/
http://fr.acronis.com/
http://www.dynamixhost.com/
http://www.hebergement-web-quebec.com/
[0.75, 0.73, 0.71, 0.42, 0.76, 0.6, 0.55, 0.59, 0.63, 0.72, 0.59, 0.59, 0.44, 0.66, 0.6, 0.65, 0.72, 0.6, 0.39, 0.61, 0.58, 0.79, 0.43, 0.59, 0.59, 0.62, 0.69, 0.69, 0.6, 0.53]
1.0 0.4 0.6 0.2 0.4 0.4 0.4 0.5 0