# Fonction de lecture de cluster en pkl

In [1]:
import pickle

from collections import defaultdict
from pprint import pprint as pp
from itertools import zip_longest

from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.metrics import mean_squared_error, silhouette_score
import numpy as np

In [2]:
def read_vector_file(path_to_file):
    """Read a space-separated vector file into two parallel lists.

    Every non-blank line is expected to hold a key (first field) followed by
    the float components of its vector, separated by single spaces.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        A ``(keys, vectors)`` tuple. ``keys`` is the list of first fields and
        ``vectors`` the list of float lists, both in file order.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    keys = []
    vectors = []
    with open(path_to_file) as file:
        for line in file:
            # Strip only the line terminator: slicing off the last character
            # unconditionally would corrupt a final line that has no newline.
            fields = [field for field in line.rstrip("\n").split(" ") if field]
            if not fields:
                # Blank line: nothing to record.
                continue
            keys.append(fields[0])
            vectors.append([float(component) for component in fields[1:]])
    # Vectors are returned exactly as read; no normalization is applied here.
    return (keys, vectors)

In [3]:
def do_someting(websites, vectors, classed_file, verbose=False):
    """Keep the sites that appear in a category file and collect their vectors.

    ``classed_file`` is read line by line; each non-blank line must be
    ``<category path>\\t<site>``. A site is kept the first time it is seen with
    a category whose fourth ``/``-separated component is not ``'Régional'``,
    provided the site is present in ``websites``.

    Args:
        websites: List of site keys, parallel to ``vectors``.
        vectors: List of vectors; ``vectors[k]`` belongs to ``websites[k]``.
        classed_file: Path of the tab-separated category file.
        verbose: When true, pretty-print the number of kept sites per category.

    Returns:
        A ``(site2ind_class, ind2site, good_vecs)`` tuple where
        ``site2ind_class`` maps a kept site to ``(index, category)``,
        ``ind2site`` lists kept sites by index and ``good_vecs`` lists the
        matching vectors in the same order.

    Raises:
        ValueError: If a non-blank line does not hold exactly two
            tab-separated fields.
        IndexError: If the category path of a candidate site has fewer than
            four ``/``-separated components.
    """
    # Position of the first occurrence of every site. This gives the same
    # answer as websites.index(site) without a linear scan per matching line.
    first_index = {}
    for position, site in enumerate(websites):
        first_index.setdefault(site, position)

    site2ind_class = {}
    ind2site = []
    good_vecs = []
    count = defaultdict(int)
    with open(classed_file) as file:
        for line in file:
            # Remove only the terminator so a final line without a newline
            # keeps the last character of its site.
            line = line.rstrip("\n")
            if not line:
                continue
            cat, site = line.split("\t")
            if site not in first_index or site in site2ind_class:
                continue
            cat = cat.split("/")[3]
            if cat == 'Régional':
                continue
            # The next free index is always the number of sites kept so far.
            site2ind_class[site] = (len(ind2site), cat)
            count[cat] += 1
            ind2site.append(site)
            good_vecs.append(vectors[first_index[site]])
    if verbose:
        # `pp` is the pprint function itself (from pprint import pprint as pp).
        pp(count)
    return site2ind_class, ind2site, good_vecs

In [4]:
def label_to_clusters(labels):
    """Group element positions by their cluster label.

    Args:
        labels: Iterable of cluster labels; the position of a label in the
            iterable is the index of the element it belongs to.

    Returns:
        A ``defaultdict(set)`` mapping each label to the set of positions
        that carry it.
    """
    members_by_label = defaultdict(set)
    for position, label in enumerate(labels):
        members = members_by_label[label]
        members.add(position)
    return members_by_label

### Méthodes de mesure

In [5]:
def generate_centroids(vectors, labels, clusters) :
    """Compute the mean vector of every cluster.

    Args:
        vectors: Sequence of vectors, indexable by the element indices stored
            in ``clusters``.
        labels: Unused; kept so the signature matches ``mse``.
        clusters: Mapping from cluster id to an iterable of element indices.

    Returns:
        A dict mapping each cluster id to the arithmetic mean of its members'
        vectors (as a numpy array).
    """
    centroids = {}
    for cluster_id, members in clusters.items():
        member_arrays = [np.array(vectors[site]) for site in members]
        centroids[cluster_id] = sum(member_arrays) / len(member_arrays)
    return centroids

def mse(vectors, labels, clusters) :
    """Score each cluster by how far its members are from its centroid.

    For every cluster, the cosine similarity between each member vector and
    the cluster centroid is computed, and the mean squared error between
    those similarities and 1 is stored.

    Args:
        vectors: Sequence of vectors, indexable by the element indices stored
            in ``clusters``.
        labels: Forwarded to ``generate_centroids``.
        clusters: Mapping from cluster id to an iterable of element indices.

    Returns:
        A ``defaultdict(int)`` mapping each cluster id to its error value.
    """
    centroids = generate_centroids(vectors, labels, clusters)
    print("Centroids done")
    error_dic = defaultdict(int)
    for cluster_id, members in clusters.items():
        member_arrays = [np.array(vectors[member]) for member in members]
        size = len(member_arrays)
        # One similarity per member, flattened from an (size, 1) matrix.
        similarities = cos_sim(member_arrays, [centroids[cluster_id]]).reshape(size)
        # A member aligned with the centroid has similarity 1, hence the target.
        target = [1] * size
        error_dic[cluster_id] = mean_squared_error(similarities, target)
    return error_dic

In [6]:
def label_similarity(clusters, ind2site, site2ind_class):
    """Find the most frequent class label of every cluster and its share.

    Args:
        clusters: Mapping from cluster id to an iterable of element indices.
        ind2site: Sequence mapping an element index to its site.
        site2ind_class: Mapping from site to a pair whose second item is the
            site's class label.

    Returns:
        A dict mapping each cluster id to ``(label, proportion)``, where
        ``label`` is the most frequent class in the cluster and
        ``proportion`` the fraction of members carrying it.
    """
    lab_sim = {}
    for cluster_id, members in clusters.items():
        tally = {}
        for elem in members:
            label = site2ind_class[ind2site[elem]][1]
            tally[label] = tally.get(label, 0) + 1
        # Ascending stable sort: the winner is the last entry.
        ranked = sorted(tally.items(), key=lambda pair: pair[1])
        dominant_label, dominant_count = ranked[-1]
        total = sum(occurrences for _, occurrences in ranked)
        lab_sim[cluster_id] = (dominant_label, dominant_count / total)
    return lab_sim

### Zone de test et d'exécution

In [7]:
#### VAR ZONE ####
# Inputs: the vector file, the tab-separated category file and the pickled
# cluster labels.
file_path = "fr.up.seeds.txt.shuf.10000"
classed_file = "dmozFull.fr"
labels_path = "clus.pkl"
## END VAR ZONE ##

#### EXEC ZONE ####
websites, vectors = read_vector_file(file_path)
site2ind_class, ind2site, good_vecs = do_someting(websites, vectors, classed_file)
# NOTE(review): pickle.load can execute arbitrary code; only load label files
# that come from a trusted source.
with open(labels_path, 'rb') as stream:
    labels = pickle.load(stream)
clusters = label_to_clusters(labels)

lab_sim = label_similarity(clusters, ind2site, site2ind_class)
print("Start")

# The silhouette score is kept in `silh`; it is not printed in this cell.
silh = silhouette_score(good_vecs, labels)
print("Silh done")
mse_dic = mse(good_vecs, labels, clusters)
print("MSE done")
## END EXEC ZONE ##

# One summary record per cluster: member sites, error and dominant label.
final = {}
for ind, clus in clusters.items():
    top_label, label_share = lab_sim[ind]
    final[ind] = {
        "cluster": {ind2site[i] for i in clus},
        "MSE": mse_dic[ind],
        "Top_label": top_label,
        "Label_proportion": label_share,
    }

# Print every cluster whose size is not exactly one, then how many there were.
k = 0
for i, summary in final.items():
    members = summary['cluster']
    if len(members) == 1:
        continue
    k += 1
    print(i, summary['MSE'])
    for site in members:
        print(site)
print(k)

Start
Silh done
Centroids done
MSE done
120 0.07172371128886282
http://www.meteo-londres.eu/
http://www.laregion-risquesnaturels.fr/
http://www.meteo-26.com/indexs.htm
http://pluiesextremes.meteo.fr/antilles/1980-Allen.html
http://services.meteofrance.com/
33 0.11315854383812542
http://feeds.feedburner.com/univ-nantes/thematiques/recherche?format=xml
http://mapage.noos.fr/stockjobber/
http://www.izahay.net/
http://www.yoyodesign.org/doc/w3c/css1/index.html
http://www.univ-tln.fr/spip.php?page=backend
http://www.endorfin.fr/
http://one-man-peau.blog4ever.com/
http://feeds.feedburner.com/poleuniversitaireyonnais?format=xml
http://mignardises.canalblog.com/
http://www.grephh.fr/
http://www.eslpb-natation.fr/
http://lisagpeintre.blogspot.com/
http://www.pensee-chretienne.org/
http://www.roger-orfevre.com/
http://amazonian-museum-network.org/rss
http://www.onlineformapro.com/
http://feeds2.feedburner.com/univ-nantes/composantes/medecine?format=xml
http://meteojcd.free.fr/
http://www.actu.u-

In [8]:
# Rank clusters with more than five members and an error above 0.01 by
# ascending error, then show the worst-scoring and best-scoring ones.
candidates = []
for value in final.values():
    if len(value['cluster']) > 5 and value['MSE'] > 0.01:
        candidates.append((value['MSE'], value['cluster']))
top_score = sorted(candidates)
print(len(top_score))

# Highest error first.
highest_error, highest_cluster = top_score[-1]
print(highest_error)
for site in highest_cluster:
    print(site)

# Then the lowest error.
lowest_error, lowest_cluster = top_score[0]
print(lowest_error)
for site in lowest_cluster:
    print(site)

91
0.19301827577479966
http://www.grcao.umontreal.ca/
http://www.crous-rouen.fr/
http://www.genotoul.fr/
http://www.grenoble-inp.fr/
http://www.sciences.univ-nantes.fr/
http://www.esi.umontreal.ca/~grofnum/
http://www.insa-toulouse.fr/fr/formation/ingenieur/specialites/gba.html
http://www.univ-rennes1.fr/
http://www.bio.uqam.ca/
http://www.montpellier.inra.fr/
http://www.univ-paris13.fr/cerap
http://www.istem.eu/
http://benoit-bely.chez-alice.fr/CV/cv.html
http://www.univ-reims.fr/
http://www.physique-ingenierie.unistra.fr/
http://www.societechimiquedefrance.fr/
http://www.iut.univ-paris5.fr/
http://www.unil.ch/fbm
http://www.insa-rouen.fr/
http://www.communaute-univ-grenoble-alpes.fr/
http://www-igm.univ-mlv.fr/
http://www.chm.ulaval.ca/
http://www.bibs.u-psud.fr/
http://www.fundp.ac.be/sciences/chimie
http://med.unistra.fr/
http://www.activation.fr/
http://csidoc.insa-lyon.fr/portail-publier-6.php
https://www.chimie-paristech.fr/
http://superconductors.free.fr/
http://www.dr7.cnrs.fr