In [1]:
%matplotlib inline
import os
import igraph as ig
import pandas as pd
from scipy import stats
import numpy as np
import sqlite3
import matplotlib.pyplot as plt

## Analysis of Genetic Algorithms for Community Detection in Complex Networks

In [2]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("ClusterResults.db")
df = pd.read_sql_query("SELECT * from ClusterResults where params_file not like '%large%'", con)

# Verify that result of SQL query is stored in the dataframe
con.close()
df.head()

Unnamed: 0,algo,seed,file,average_scores,best_scores,final_score,params_file,membership,nmi,rand,vi
0,gals,13369360549290,D:/alien-pineapple/benchmark_gen/gml_files/ben...,0.44009612959866123\n0.5461974760784992\n0.605...,0.5421854674251742\n0.6269896726720868\n0.6793...,0.8174163816764717,impl_GALS/default.properties,"1,2,3,1,2,4,5,6,2,6,7,8,9,10,11,5,12,6,13,14,1...",0.0,0.0,0.0
1,gals,13369360549290,D:/alien-pineapple/benchmark_gen/gml_files/ben...,0.44102171729926565\n0.5457098792826734\n0.609...,0.5605900188715776\n0.6435192651567331\n0.6994...,0.8183975072639841,impl_GALS/default.properties,"1,2,3,1,2,4,5,6,2,6,7,8,9,10,11,5,12,6,13,14,1...",0.0,0.0,0.0
2,gals,13369360549290,D:/alien-pineapple/benchmark_gen/gml_files/ben...,0.4402410463825556\n0.5448017280194865\n0.6035...,0.5558795911697045\n0.6299240468040519\n0.6866...,0.817829138322348,impl_GALS/default.properties,"1,2,3,1,2,4,5,6,2,6,7,8,9,10,11,5,12,6,13,14,1...",0.0,0.0,0.0
3,gals,13369360549290,D:/alien-pineapple/benchmark_gen/gml_files/ben...,0.4390068762765545\n0.545385871564504\n0.60710...,0.5386790824759569\n0.6246539036437487\n0.6933...,0.8119665565380684,impl_GALS/default.properties,"1,2,3,1,2,4,5,6,2,6,7,8,9,10,11,5,12,6,13,14,1...",0.0,0.0,0.0
4,gals,13369360549290,D:/alien-pineapple/benchmark_gen/gml_files/ben...,0.43838356561266134\n0.5446626617100198\n0.605...,0.5284876212325604\n0.6271894762815291\n0.7049...,0.8169301644245596,impl_GALS/default.properties,"1,2,3,1,2,4,5,6,2,6,7,8,9,10,11,5,12,6,13,14,1...",0.0,0.0,0.0


In [3]:
df['membership'] = df['membership'].apply(lambda s: map(int, s.split(',')))
df['best_scores'] = df['best_scores'].apply(lambda s: map(float, s.split('\n')))
df['average_scores'] = df['average_scores'].apply(lambda s: map(float, s.split('\n')))

In [4]:
df['filename'] = df['file'].apply(lambda s: os.path.basename(s))

In [5]:
def getTrueComs(file):
    true_file = file + ".coms"
    if os.path.exists(true_file):
        data = pd.read_csv(true_file, sep='\t', header=None)
        return list(data[1].as_matrix())
    else:
        return []
   

In [6]:
df['true_membership'] = df['file'].apply(lambda s: getTrueComs(s))

## Comparison Metrics

@article{rand1971,
  added-at = {2006-03-21T11:09:44.000+0100},
  author = {Rand, W.M.},
  biburl = {https://www.bibsonomy.org/bibtex/2fd52548cb4bcd8e83dd27e4b55eff1f3/hotho},
  interhash = {1afaf0170bc705a9e49b625f67679ee2},
  intrahash = {fd52548cb4bcd8e83dd27e4b55eff1f3},
  journal = {Journal of the American Statistical Association },
  keywords = {cluster clustering criteria evaluation index rand},
  number = 336,
  pages = {846-850},
  timestamp = {2007-09-18T14:44:34.000+0200},
  title = {Objective criteria for the evaluation of clustering methods},
  volume = 66,
  year = 1971
}



In [7]:
df['nmi'] = df[['membership', 'true_membership']].apply(lambda s: 
                                                        None if s[1] == [] else 
                                                        ig.compare_communities(s[0], s[1], method="nmi"), axis=1)

df['vi'] = df[['membership', 'true_membership']].apply(lambda s: 
                                                        None if s[1] == [] else 
                                                        ig.compare_communities(s[0], s[1], method="vi"), axis=1)

df['rand'] = df[['membership', 'true_membership']].apply(lambda s: 
                                                        None if s[1] == [] else 
                                                        ig.compare_communities(s[0], s[1], method="rand"), axis=1)

df['adjusted_rand'] = df[['membership', 'true_membership']].apply(lambda s: 
                                                        None if s[1] == [] else 
                                                        ig.compare_communities(s[0], s[1], method="adjusted_rand"), axis=1)

df['split_join'] = df[['membership', 'true_membership']].apply(lambda s: 
                                                        None if s[1] == [] else 
                                                        ig.compare_communities(s[0], s[1], method="split-join"), axis=1)

In [8]:
def identifyConverge(vals):
    """find the earliest occurance of the maximum value in a list
    """
    best = vals[-1]
    if vals[0] == best:
        return 0
    for i in range(len(vals)):
        if vals[i] == best:
            return i
        

In [9]:
df['converge_gen'] = df['best_scores'].apply(identifyConverge)
df['gens'] = df['best_scores'].apply(len)

In [10]:
def extract_size(name):
    file = str(name.split('/')[-1])
    size = None
    if 'lfr' in file:
        size = file.split('_')[1]
    elif 'girvan' in file:
        size = 128
    
    return size
    
    
def extract_mu(name):
    file = str(name.split('/')[-1])
    mu = None
    if 'mu' in file:
        mu = file.split('_')[2]
        mu = mu[2:]
    return mu


def extract_kout(name):
    file = str(name.split('/')[-1])
    mu = None
    if 'kout' in file:
        mu = file.split('_')[2]
    return mu


df['short_file'] = df['file'].apply(lambda s: str(s.split('/')[-1]))
df['size'] = df['file'].apply(lambda s: extract_size(s))
df['mu'] = df['file'].apply(lambda s: extract_mu(s))
df['kout'] = df['file'].apply(lambda s: extract_kout(s))

In [11]:
gals = df.loc[lambda dd: dd['algo'] == 'gals']
gacd = df.loc[lambda dd: dd['algo'] == 'gacd']
tbga = df.loc[lambda dd: dd['algo'] == 'tasgin']
ganet = df.loc[lambda dd: dd['algo'] == 'ganet']

In [12]:
def highlight_max(s):
    """
    highlight the maximum in a Series yellow.
    """
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

In [13]:
gals_valtable = gals.groupby(['filename','params_file'], sort=False)['nmi'].mean().reset_index().pivot('filename', 'params_file', 'nmi')
gals_valtable.style.apply(highlight_max, axis=1)

params_file,impl_GALS/default.properties
filename,Unnamed: 1_level_1
dolphins.gml,
football.gml,
girvan_kout_10_0.gml,0.0886493
girvan_kout_11_0.gml,0.0661671
girvan_kout_12_0.gml,0.0482544
girvan_kout_13_0.gml,0.0371656
girvan_kout_14_0.gml,0.0431408
girvan_kout_15_0.gml,0.0284754
girvan_kout_1_0.gml,1.0
girvan_kout_2_0.gml,1.0


In [15]:
with open('gals.tex','w') as tf:
    tf.write(gals_valtable.to_latex())

In [14]:
ganet_valtable = ganet.groupby(['filename','params_file'], sort=False)['nmi'].mean().reset_index().pivot('filename', 'params_file', 'nmi')

ganet_valtable.style.apply(highlight_max, axis=1)

params_file,impl_GANET/balanced_c_m.properties,impl_GANET/default.properties,impl_GANET/high_elite.properties,impl_GANET/high_r.properties,impl_GANET/higher_r.properties,impl_GANET/low_cross_high_m.properties
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dolphins.gml,,,,,,
football.gml,,,,,,
girvan_kout_10_0.gml,0.0,0.0,0.0,0.0101195,0.0396989,0.0
girvan_kout_11_0.gml,0.0,0.0,0.0,0.00695269,0.0302283,0.0
girvan_kout_12_0.gml,0.0,0.0,0.0,0.0106595,0.0265229,0.0
girvan_kout_13_0.gml,0.0,0.0,0.0,0.0049732,0.0270508,0.0
girvan_kout_14_0.gml,0.0,0.0,0.0,0.00806893,0.0155061,0.0
girvan_kout_15_0.gml,0.0,0.0,0.0,0.00664283,0.0163956,0.0
girvan_kout_1_0.gml,0.997483,1.0,1.0,1.0,1.0,0.966106
girvan_kout_2_0.gml,0.92269,1.0,1.0,1.0,1.0,0.796509


In [17]:
with open('ganet.tex','w') as tf:
    tf.write(ganet_valtable.to_latex())

In [15]:
gacd_valtable = gacd.groupby(['filename','params_file'], sort=False)['nmi'].max().reset_index().pivot('filename', 'params_file', 'nmi')
gacd_valtable.style.apply(highlight_max, axis=1)

params_file,impl_GACD/default.properties,impl_GACD/low_mut_high_cross.properties
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
dolphins.gml,,
football.gml,,
girvan_kout_10_0.gml,0.112101,0.159798
girvan_kout_11_0.gml,0.0971016,0.0998821
girvan_kout_12_0.gml,0.0841867,0.0969749
girvan_kout_13_0.gml,0.0687894,0.0707389
girvan_kout_14_0.gml,0.0511708,0.0592357
girvan_kout_15_0.gml,0.0453382,0.0490339
girvan_kout_1_0.gml,1.0,1.0
girvan_kout_2_0.gml,1.0,1.0


In [19]:
with open('gacd.tex','w') as tf:
    tf.write(ganet_valtable.to_latex())

In [16]:
tbga_valtable = tbga.groupby(['filename','params_file'], sort=False)['nmi'].mean().reset_index().pivot('filename', 'params_file', 'nmi')
tbga_valtable.style.apply(highlight_max, axis=1)

params_file,impl_TasginGA/default.properties,impl_TasginGA/high_elite.properties,impl_TasginGA/high_init.properties,impl_TasginGA/high_mut.properties
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dolphins.gml,,,,
football.gml,,,,
girvan_kout_10_0.gml,0.0580131,0.0539764,0.0570108,0.0625157
girvan_kout_11_0.gml,0.0465857,0.0435233,0.0367911,0.041119
girvan_kout_12_0.gml,0.0372377,0.0399442,0.0322929,0.0442313
girvan_kout_13_0.gml,0.0344538,0.0314403,0.0287068,0.0354185
girvan_kout_14_0.gml,0.0317857,0.0319333,0.026173,0.029513
girvan_kout_15_0.gml,0.0228158,0.0305732,0.0237681,0.0212579
girvan_kout_1_0.gml,1.0,1.0,1.0,1.0
girvan_kout_2_0.gml,1.0,0.997523,1.0,1.0


In [21]:
with open('tasgin.tex','w') as tf:
    tf.write(tbga_valtable.to_latex())

In [22]:
plot_data = df[['file', 'true_membership']]
plot_data['true_membership'] = plot_data['true_membership'].apply(lambda s: ' '.join(str(e) for e in s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
plot_data = plot_data.drop_duplicates()

In [24]:
for x, y in zip(plot_data['file'], plot_data['true_membership']):
    if "lfr" not in x and "dolphins" not in x:
        print(x)
        y = [int(i) for i in y.split(" ") if i != '']
        print(len(y))
        g = ig.Graph.Read(x, format='gml')
        coms = ig.VertexClustering(g, membership=y)
        plt = ig.plot(coms, mark_groups = True, vertex_size=2)
        plt.save(x.split('/')[-1]+".png")
        print("saved")

D:/alien-pineapple/benchmark_gen/gml_files/real_networks/football.gml
0


IOError: [Errno 2] No such file or directory: 'D:/alien-pineapple/benchmark_gen/gml_files/real_networks/football.gml'

In [None]:
def plot_graph(filename):
    g = ig.Graph.Read(filename, format='gml')
    membership = getTrueComs(filename)
    coms = ig.VertexClustering(g, membership=membership)
    plt = ig.plot(coms, mark_groups = True, vertex_size=0, bbox=(20000,20000))
    imagename = filename.split("\\")[-1]
    imagename = imagename.split(".")[0]
    print(imagename)
    plt.save(imagename+".png")

In [None]:
plot_graph('D:\\alien-pineapple\\benchmark_gen\\gml_files\\benchmarks\\lfr2\\n1000b\\lfr_1000_mu3_b_0.gml')

In [None]:
plot_graph('D:\\alien-pineapple\\benchmark_gen\\gml_files\\benchmarks\\lfr2\\n1000b\\lfr_1000_mu4_b_0.gml')

In [None]:
plot_graph('D:\\alien-pineapple\\benchmark_gen\\gml_files\\benchmarks\\lfr2\\n1000b\\lfr_1000_mu5_b_0.gml')

In [None]:
plot_graph('D:\\alien-pineapple\\benchmark_gen\\gml_files\\benchmarks\\lfr2\\n1000b\\lfr_1000_mu6_b_0.gml')

In [None]:
def convergence_curve(dataframe):
    files = dataframe['file'].unique()
    params = dataframe['params_file'].unique()

    for file in files:
        for param in params:
            batch = dataframe[(dataframe['file']==file) & (dataframe['params_file']==param)]
            best_mean = np.array(np.mean(np.matrix(batch['best_scores'].tolist()), axis=0)).flatten()
            avg_mean = np.array(np.mean(np.matrix(batch['average_scores'].tolist()), axis=0)).flatten()
            x=[i for i in range(len(best_mean))] 
            y=best_mean.tolist()
            plt.plot(x, y, label="best")
            plt.xlabel('Generation')
            plt.ylabel('Modularity')
            plt.title('Convergence of GANETff')
            plt.plot(avg_mean.tolist(), label="avg")
            plt.legend()
            plt.show()

In [188]:
convergence_curve(ganet)

AttributeError: 'Plot' object has no attribute 'plot'

In [17]:
def extract_size(name):
    file = str(name.split('/')[-1])
    size = None
    if 'lfr' in file:
        size = file.split('_')[1]
    elif 'girvan' in file:
        size = 128
    
    return size
    
    
def extract_mu(name):
    file = str(name.split('/')[-1])
    mu = None
    if 'mu' in file:
        mu = file.split('_')[2]
        mu = mu[2:]
    return mu


def extract_kout(name):
    file = str(name.split('/')[-1])
    mu = None
    if 'kout' in file:
        mu = file.split('_')[2]
    return mu


df['short_file'] = df['file'].apply(lambda s: str(s.split('/')[-1]))
df['size'] = df['file'].apply(lambda s: extract_size(s))
df['mu'] = df['file'].apply(lambda s: extract_mu(s))
df['kout'] = df['file'].apply(lambda s: extract_kout(s))

In [19]:
def params_kruskal(dataframe):
    files = dataframe['file'].unique()
    params = dataframe['params_file'].unique()
    for file in files:
        results = []
        for param in params:
            batch = dataframe[(dataframe['file']==file) & (dataframe['params_file']==param)]
            results.append(batch['nmi'].tolist())
        res = None
        try:
            res = stats.kruskal(*results)
        except(ValueError):
            res = "equal populations"
        print(res, 
              file.split('/')[-1], 
              res.pvalue < res.statistic if type(res) is not str else None)

In [20]:
params_kruskal(gals)

('equal populations', u'lfr_1000_mu1_s_0.gml', None)
('equal populations', u'lfr_1000_mu2_s_0.gml', None)
('equal populations', u'lfr_1000_mu3_s_0.gml', None)
('equal populations', u'lfr_1000_mu4_s_0.gml', None)
('equal populations', u'lfr_1000_mu5_s_0.gml', None)
('equal populations', u'lfr_1000_mu6_s_0.gml', None)
('equal populations', u'lfr_1000_mu1_b_0.gml', None)
('equal populations', u'lfr_1000_mu2_b_0.gml', None)
('equal populations', u'lfr_1000_mu3_b_0.gml', None)
('equal populations', u'lfr_1000_mu4_b_0.gml', None)
('equal populations', u'lfr_1000_mu5_b_0.gml', None)
('equal populations', u'lfr_1000_mu6_b_0.gml', None)
('equal populations', u'lfr_5000_mu1_s_0.gml', None)
('equal populations', u'lfr_5000_mu2_s_0.gml', None)
('equal populations', u'lfr_5000_mu3_s_0.gml', None)
('equal populations', u'lfr_5000_mu4_s_0.gml', None)
('equal populations', u'lfr_5000_mu5_s_0.gml', None)
('equal populations', u'lfr_5000_mu6_s_0.gml', None)
('equal populations', u'lfr_5000_mu1_b_0.gml',