In [4]:
import pandas as pd
import numpy as np
import progressbar

# Génération

In [15]:
def generate(n, Q, alpha = None, p_epsilon = 0, p_lambda = 1):

    alpha = [1/Q]*Q if alpha is None else alpha

    # Defines class
    Z = np.zeros(n, dtype=np.uint)
    i = 0
    for q, n_q in enumerate(np.random.multinomial(n, alpha)):
        for j in range(n_q):
            Z[i] = q
            i += 1
        
    # Probability matrix of connection between classes
    pi = np.zeros((Q,Q)) + p_epsilon + np.diag([p_lambda - p_epsilon]*Q)

    # Matrix of connections
    X = np.zeros((n,n), dtype = bool)
    for i in range(n):
        q_i = Z[i]
        for j in range(i):
            q_j = Z[j]
            bound = bool(np.random.binomial(1, pi[q_i][q_j]))
            X[i][j] = bound
            X[j][i] = bound
    
    return X, Z

In [16]:
n, Q = 100, 2
X, Z = generate(n, Q)
X

array([[False,  True,  True, ..., False, False, False],
       [ True, False,  True, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       ..., 
       [False, False, False, ..., False,  True,  True],
       [False, False, False, ...,  True, False,  True],
       [False, False, False, ...,  True,  True, False]], dtype=bool)

# EM

In [19]:
def norm(tau_old, tau_new):
    s = 0
    for i in range(n):
        for q in range(Q):
               s += abs(tau_old[i][q] - tau_new[i][q])
    return s

# Not good -> need a K-means
def error(tau, Z):
    n_error = 0
    for i in range(len(Z)):
        if Z[i] == tau[i].argmax():
            n_error += 1
    return str(round(n_error/len(Z)*100,2)) + "% d'erreur"

In [20]:
from math import exp
from scipy.special import digamma
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=Q).fit(X)
tau_new = np.zeros((n, Q))
for i, q in enumerate(kmeans.labels_):
    tau_new[i][q] = 1

for iterr in range(3):
    N = np.zeros(Q)
    for q in range(Q):
        s = 0.5
        for i in range(n):
            s += tau_new[i][q]
        N[q] = s
        
        
        
        
    eta = np.zeros((Q, n))
    for q in range(Q):
        for l in range(Q):
            s = 0.5 #eta0
            if (q != l):
                for i in range(n):
                    for j in range(n):
                        if i == j:
                            continue
                        else:
                            s += X[i][j]*tau_new[i][q]*tau_new[j][l]
            else:
                for j in range(n):
                    for i in range(j):
                        s += X[i][j]*tau_new[i][q]*tau_new[j][l]
            if (s < 0):
                raise ValueError(s)
            eta[q][l] = s
            
            
            
    zeta = np.zeros((Q, n))
    for q in range(Q):
        for l in range(Q):
            s = 0.5 #eta0
            if (q != l):
                for i in range(n):
                    for j in range(n):
                        if i == j:
                            continue
                        else:
                            s += (1-X[i][j])*tau_new[i][q]*tau_new[j][l]
            else:
                for j in range(n):
                    for i in range(j):
                        s += (1-X[i][j])*tau_new[i][q]*tau_new[j][l]
            if (s < 0):
                raise ValueError(s)
            zeta[q][l] = s



    tau_old = tau_new
    tau_new = np.zeros((n, Q))
    for i in range(n):
        for q in range(Q):
            p = exp(digamma(N[q])-digamma(sum(N)))
            for j in range(n):
                if (i==j): continue
                for l in range(Q):
                    a = digamma(zeta[q][l])\
                         - digamma(eta[q][l] + zeta[q][l])\
                         + X[i][j]*(digamma(eta[q][l]) - digamma(zeta[q][l]))
                    a *= tau_new[j][l]
                    p *= exp(a)
            tau_new[i][q] = p
        tau_new[i] *= 1/sum(tau_new[i])
    
    print(error(tau_new, Z))
    print(norm(tau_new, tau_old))
    print()

100.0% d'erreur
198.727506897

0.0% d'erreur
197.126682082

100.0% d'erreur
196.777770144



# Build real network

In [None]:
def load(name):
    return pd.read_pickle("data/" + name + ".pickle")

In [None]:
emails = load("emails")
emails.head()

In [None]:
ms = load("messages")
ms.head()

In [None]:
rs = load("recipients")
rs.head()

In [None]:
rs = rs.loc[rs.m_id.isin(ms.index.values)]
X_bounds = rs.join(ms, on = "m_id", rsuffix="_sender")[["e_id_sender", "e_id"]].drop_duplicates().rename(columns = {"e_id_sender" : "sender_id", "e_id" : "recipient_id"})

In [None]:
X_bounds.head()

In [None]:
n = max(ms.e_id.max(), rs.e_id.max()) + 1
X = np.zeros((n,n), dtype=bool)
X.shape

bar = progressbar.ProgressBar(max_value=X_bounds.shape[0])
for index, row in bar(X_bounds.iterrows()):
    a,b = row["sender_id"], row["recipient_id"]
    X[a][b] = True
    X[b][a] = True