In [1]:
import pandas as pd
import numpy as np
import progressbar

# Génération

In [2]:
def generate(n, Q, alpha = None, p_epsilon = 0, p_lambda = 1):

    alpha = [1/Q]*Q if alpha is None else alpha
    if len(alpha) != Q: raise ValueError("Alpha has not the same length as Q")

    # Defines class
    Z = np.zeros(n, dtype=np.uint)
    i = 0
    for q, n_q in enumerate(np.random.multinomial(n, alpha)):
        for j in range(n_q):
            Z[i] = q
            i += 1
        
    # Probability matrix of connection between classes
    pi = np.zeros((Q,Q)) + p_epsilon + np.diag([p_lambda - p_epsilon]*Q)

    # Matrix of connections
    X = np.zeros((n,n), dtype = bool)
    for i in range(n):
        q_i = Z[i]
        for j in range(i):
            q_j = Z[j]
            bound = bool(np.random.binomial(1, pi[q_i][q_j]))
            X[i][j] = bound
            X[j][i] = bound
    
    return X, Z

In [4]:
n, Q = 100, 3
X, Z = generate(n, Q, alpha = [2/10, 6/10, 2/10], p_epsilon = 0.3, p_lambda = 0.7)
X

array([[False,  True,  True, ...,  True, False,  True],
       [ True, False,  True, ..., False,  True,  True],
       [ True,  True, False, ...,  True, False, False],
       ..., 
       [ True, False,  True, ..., False,  True,  True],
       [False,  True, False, ...,  True, False,  True],
       [ True,  True, False, ...,  True,  True, False]], dtype=bool)

# EM

In [5]:
def norm(tau_old, tau_new):
    s = 0
    for i in range(n):
        for q in range(Q):
               s += abs(tau_old[i][q] - tau_new[i][q])
    return s

# Not good -> need a K-means
def error(tau, Z):
    n_error = 0
    for i in range(len(Z)):
        if Z[i] != tau[i].argmax():
            n_error += 1
    return str(round(n_error/len(Z)*100,2)) + "% d'erreur"

In [6]:
from math import exp
from scipy.special import digamma
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=Q).fit(X)
tau_new = np.zeros((n, Q))
for i, q in enumerate(kmeans.labels_):
    tau_new[i][q] = 1


print("Initialisation...")
N = np.zeros(Q)
eta = np.zeros((Q, n))
zeta = np.zeros((Q, n))
bar = progressbar.ProgressBar(max_value=Q*Q*n*n)
for q in range(Q):
    # N
    s_N = 0.5
    for i in range(n):
        s_N += tau_new[i][q]
    N[q] = s_N

    # Eta
    for l in range(Q):
        s_eta = 0.5 #eta0
        s_zeta = 0.5 #zeta0
        for j in range(n):
            for i in range(n):
                bar.update(i + j*n + l*n*n + q*Q*n*n)
                if (q == l and j <= i): break
                if i == j: continue

                s_eta += X[i][j]*tau_new[i][q]*tau_new[j][l]
                s_zeta += (1-X[i][j])*tau_new[i][q]*tau_new[j][l]

        eta[q][l] = s_eta
        zeta[q][l] = s_zeta

        if (s_eta < 0):
            raise ValueError(s_eta)           
        if (s_zeta < 0):
            raise ValueError(s_zeta)
            
            
print("Maximisation...")
for iterr in range(30):
    bar = progressbar.ProgressBar(max_value=n*Q*n)
    
    # Maximisation
    tau_old = tau_new
    tau_new = np.zeros((n, Q))
    for i in range(n):
        for q in range(Q):
            p = exp(digamma(N[q])-digamma(sum(N)))
            for j in range(n):
                bar.update(j + q*n +i*Q*n)
                if (i==j): continue
                for l in range(Q):
                    a = digamma(zeta[q][l])\
                         - digamma(eta[q][l] + zeta[q][l])\
                         + X[i][j]*(digamma(eta[q][l]) - digamma(zeta[q][l]))
                    a *= tau_new[j][l]
                    p *= exp(a)
            tau_new[i][q] = p
        tau_new[i] *= 1/sum(tau_new[i])
    
    delta = norm(tau_new, tau_old)
    print(delta)
    if (delta < 10E-6):
        break
print("Done")

 16% (14811 of 90000) |###                | Elapsed Time: 0:00:00 ETA:  0:00:00

Initialisation...


 12% (3798 of 30000) |##                  | Elapsed Time: 0:00:00 ETA:  0:00:01

Maximisation...


 15% (4557 of 30000) |###                 | Elapsed Time: 0:00:00 ETA:  0:00:01

184.635637448


 96% (28861 of 30000) |################## | Elapsed Time: 0:00:01 ETA:  0:00:00

0.0
Done


In [7]:
res = []
for i in range(len(Z)):
    res.append([Z[i], tau_new[i].argmax()])
res = pd.DataFrame(res, columns = ["x", "y"])
res.groupby(["x", "y"]).size().sort_index()

x  y
0  0     2
   2    13
1  0    48
   1     3
   2     6
2  1    14
   2    14
dtype: int64

# Build real network

In [None]:
def load(name):
    return pd.read_pickle("data/" + name + ".pickle")

In [None]:
emails = load("emails")
emails.head()

In [None]:
ms = load("messages")
ms.head()

In [None]:
rs = load("recipients")
rs.head()

In [None]:
rs = rs.loc[rs.m_id.isin(ms.index.values)]
X_bounds = rs.join(ms, on = "m_id", rsuffix="_sender")[["e_id_sender", "e_id"]].drop_duplicates().rename(columns = {"e_id_sender" : "sender_id", "e_id" : "recipient_id"})

In [None]:
X_bounds.head()

In [None]:
n = max(ms.e_id.max(), rs.e_id.max()) + 1
X = np.zeros((n,n), dtype=bool)
X.shape

bar = progressbar.ProgressBar(max_value=X_bounds.shape[0])
for index, row in bar(X_bounds.iterrows()):
    a,b = row["sender_id"], row["recipient_id"]
    X[a][b] = True
    X[b][a] = True