In [1]:
#imports
import pandas as pd
import numpy as np
import networkx as nx
import scipy as sp
from sklearn.cluster import KMeans
import shutil

In [2]:
#load files
astroph_df = pd.read_csv("communities/CA-AstroPh.txt", skiprows=1,header=None, sep= " ")
condmat_df = pd.read_csv("communities/CA-CondMat.txt", skiprows=1,header=None, sep= " ")
grqc_df = pd.read_csv("communities/CA-GrQc.txt", skiprows=1,header=None, sep= " ")
hepph_df = pd.read_csv("communities/CA-HepPh.txt", skiprows=1,header=None, sep= " ")
hepth_df = pd.read_csv("communities/CA-HepTh.txt", skiprows=1,header=None, sep= " ")

In [29]:
astroph_values=astroph_df.values
condmat_values=condmat_df.values
grqc_values=grqc_df.values
hepph_values=hepph_df.values
hepth_values=hepth_df.values


In [31]:
# ca-AstroPh 17903 197031 50
# ca-CondMat 21363 91342 100
# ca-GrQc 4158 13428 2
# ca-HepPh 11204 117649 25
# ca-HepTh 8638 24827 20

2

In [11]:
def normalized_spectral_clustering(vertices, k_eigval):
    
    G=nx.Graph()
    g = G.add_edges_from(vertices)
    
    #calculate laplacian
    laplacian = nx.normalized_laplacian_matrix(G)
    #normalize laplicacian
    laplacian_normalized = sp.sparse.identity(laplacian.shape[0]).toarray() - laplacian
    laplacian_normalized = sp.sparse.csr_matrix(laplacian_normalized)
    
    #calcluate k eigenvectors and form matrix U
    eig_val, U = sp.sparse.linalg.eigs(laplacian_normalized, k=k_eigval)
    
    #normalize U rows to have 1 norm
    u_norm = sp.linalg.norm(U, axis=1, ord=1)
    u_normalized = U / u_norm[:,np.newaxis]
    
    #cluster the U points to k clusters
    kmeans = KMeans(n_clusters=k_eigval, random_state=0).fit(u_normalized)
    
    vertex_id = np.arange(laplacian.shape[0])
    cluster_id = kmeans.labels_
    concat_arr = np.column_stack((vertex_id, cluster_id))
    
    #concat_df = pd.DataFrame({'vertex_id':concat_arr[:,1],'cluster_id':concat_arr[:,0]})
    return concat_arr
    
    
def write_to_csv(clustering, name, header):
    concat_arr_df = pd.DataFrame(clustering)
    concat_arr_df.to_csv(name, sep=' ', index=False, header=False)
    
    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)
            
    line_prepender(name, header)   
    
    
def format_txt_file(name_old, name_new, header):
    
    shutil.copyfile(name_old, name_new)
    
    with open(name_new) as f:
        lines = f.readlines()

        lines # ['This is the first line.\n', 'This is the second line.\n']

        lines[0] = header

        lines # ["This is the line that's replaced.\n", 'This is the second line.\n']

        with open(name_new, "w") as f:
            f.writelines(lines)
        
    
    
    



In [71]:
write_to_csv(re, 'CA-AstroPh.csv', '# ca-AstroPh 17903 197031 50')

In [72]:
%run evaluate.py ca-AstroPh.txt CA-AstroPh.csv

reading graphfile ca-AstroPh.txt
objective = 15962.666666666666


In [None]:
# ca-AstroPh 17903 197031 50
# ca-CondMat 21363 91342 100
# ca-GrQc 4158 13428 2
# ca-HepPh 11204 117649 25
# ca-HepTh 8638 24827 20

#astroph_values
#condmat_values
#grqc_values
#hepph_values
#hepth_values

In [25]:
#CA-AstroPh
ask2 = normalized_spectral_clustering(astroph_values, 2)
ask10 = normalized_spectral_clustering(astroph_values, 10)
ask20 = normalized_spectral_clustering(astroph_values, 20)
ask50 = normalized_spectral_clustering(astroph_values, 50)
ask100 = normalized_spectral_clustering(astroph_values, 100)



In [26]:
#CA-AstroPh write clustering to csv
write_to_csv(ask2, 'CA-AstroPh-k2.csv', '# ca-AstroPh 17903 197031 2')
write_to_csv(ask10, 'CA-AstroPh-k10.csv', '# ca-AstroPh 17903 197031 10')
write_to_csv(ask20, 'CA-AstroPh-k20.csv', '# ca-AstroPh 17903 197031 20')
write_to_csv(ask50, 'CA-AstroPh-k50.csv', '# ca-AstroPh 17903 197031 50')
write_to_csv(ask100, 'CA-AstroPh-k100.csv', '# ca-AstroPh 17903 197031 100')


In [27]:
#CA-AstroPh reformat txt file for evaluation script
format_txt_file('CA-AstroPh.txt', 'CA-AstroPh-k2.txt', '# ca-AstroPh 17903 197031 2\n')
format_txt_file('CA-AstroPh.txt', 'CA-AstroPh-k10.txt', '# ca-AstroPh 17903 197031 10\n')
format_txt_file('CA-AstroPh.txt', 'CA-AstroPh-k20.txt', '# ca-AstroPh 17903 197031 20\n')
format_txt_file('CA-AstroPh.txt', 'CA-AstroPh-k50.txt', '# ca-AstroPh 17903 197031 50\n')
format_txt_file('CA-AstroPh.txt', 'CA-AstroPh-k100.txt', '# ca-AstroPh 17903 197031 100\n')




In [28]:
#CA-AstroPh run evaluation script 
%run evaluate.py CA-AstroPh-k2.txt CA-AstroPh-k2.csv
%run evaluate.py CA-AstroPh-k10.txt CA-AstroPh-k10.csv
%run evaluate.py CA-AstroPh-k20.txt CA-AstroPh-k20.csv
%run evaluate.py CA-AstroPh-k50.txt CA-AstroPh-k50.csv
%run evaluate.py CA-AstroPh-k100.txt CA-AstroPh-k100.csv



reading graphfile CA-AstroPh-k2.txt
objective = 26.708333333333332
reading graphfile CA-AstroPh-k10.txt
objective = 1449.3333333333333
reading graphfile CA-AstroPh-k20.txt
objective = 4350.35294117647
reading graphfile CA-AstroPh-k50.txt
objective = 15962.666666666666
reading graphfile CA-AstroPh-k100.txt
objective = 36530.0


In [31]:
cok2 = normalized_spectral_clustering(condmat_values, 2)
cok10 = normalized_spectral_clustering(condmat_values, 10)
cok20 = normalized_spectral_clustering(condmat_values, 20)
cok50 = normalized_spectral_clustering(condmat_values, 50)
co100 = normalized_spectral_clustering(condmat_values, 100)

In [32]:
write_to_csv(cok2, 'CA-CondMat-k2.csv', '# ca-CondMat 21363 91342 2')
write_to_csv(cok10, 'CA-CondMat-k10.csv', '# ca-CondMat 21363 91342 10')
write_to_csv(cok20, 'CA-CondMat-k20.csv', '# ca-CondMat 21363 91342 20')
write_to_csv(cok50, 'CA-CondMat-k50.csv', '# ca-CondMat 21363 91342 50')
write_to_csv(co100, 'CA-CondMat-k100.csv', '# ca-CondMat 21363 91342 100')

In [33]:
format_txt_file('ca-CondMat.txt', 'ca-CondMat-k2.txt', '# ca-CondMat 21363 91342 2\n')
format_txt_file('ca-CondMat.txt', 'ca-CondMat-k10.txt', '# ca-CondMat 21363 91342 10\n')
format_txt_file('ca-CondMat.txt', 'ca-CondMat-k20.txt', '# ca-CondMat 21363 91342 20\n')
format_txt_file('ca-CondMat.txt', 'ca-CondMat-k50.txt', '# ca-CondMat 21363 91342 50\n')
format_txt_file('ca-CondMat.txt', 'ca-CondMat-k100.txt', '# ca-CondMat 21363 91342 100\n')

In [34]:
%run evaluate.py ca-CondMat-k2.txt CA-CondMat-k2.csv
%run evaluate.py ca-CondMat-k10.txt CA-CondMat-k10.csv
%run evaluate.py ca-CondMat-k20.txt CA-CondMat-k20.csv
%run evaluate.py ca-CondMat-k50.txt CA-CondMat-k50.csv
%run evaluate.py ca-CondMat-k100.txt CA-CondMat-k100.csv

reading graphfile ca-CondMat-k2.txt
objective = 9.140625
reading graphfile ca-CondMat-k10.txt
objective = 1379.157894736842
reading graphfile ca-CondMat-k20.txt
objective = 2867.1111111111113
reading graphfile ca-CondMat-k50.txt
objective = 7071.363636363636
reading graphfile ca-CondMat-k100.txt
objective = 12091.714285714286


In [36]:
qrk2 = normalized_spectral_clustering(grqc_values, 2)
qrk10 = normalized_spectral_clustering(grqc_values, 10)
qrk20 = normalized_spectral_clustering(grqc_values, 20)


In [37]:
write_to_csv(qrk2, 'CA-GrQc-k2.csv', '# ca-GrQc 4158 13428 2')
write_to_csv(qrk10, 'CA-GrQc-k10.csv', '# ca-GrQc 4158 13428 10')
write_to_csv(qrk20, 'CA-GrQc-k20.csv', '# ca-GrQc 4158 13428 20')


In [40]:
format_txt_file('ca-GrQc.txt', 'ca-GrQc-k2.txt', '# ca-GrQc 4158 13428 2\n')
format_txt_file('ca-GrQc.txt', 'ca-GrQc-k10.txt', '# ca-GrQc 4158 13428 10\n')
format_txt_file('ca-GrQc.txt', 'ca-GrQc-k20.txt', '# ca-GrQc 4158 13428 20\n')


In [42]:
%run evaluate.py ca-GrQc-k2.txt ca-GrQc-k2.csv
%run evaluate.py ca-GrQc-k10.txt ca-GrQc-k10.csv
%run evaluate.py ca-GrQc-k20.txt ca-GrQc-k20.csv


reading graphfile ca-GrQc-k2.txt
objective = 5.26
reading graphfile ca-GrQc-k10.txt
objective = 596.625
reading graphfile ca-GrQc-k20.txt
objective = 720.6875


In [43]:
hek25 = normalized_spectral_clustering(hepph_values, 25)

In [44]:
write_to_csv(hek25, 'CA-HepPh-k25.csv', '# ca-HepPh 11204 117649 25')

In [45]:
format_txt_file('ca-HepPh.txt', 'ca-HepPh-k25.txt', '# ca-HepPh 11204 117649 25\n')

In [46]:
%run evaluate.py ca-HepPh-k25.txt CA-HepPh-k25.csv

reading graphfile ca-HepPh-k25.txt
objective = 7791.538461538462


In [47]:
hepthk25 = normalized_spectral_clustering(hepth_values, 20)

In [48]:
write_to_csv(hepthk25, 'CA-HepTh-k20.csv', '# ca-HepTh 8638 24827 20')

In [49]:
format_txt_file('ca-HepTh.txt', 'ca-HepTh-k20.txt', '# ca-HepTh 8638 24827 20\n')

In [50]:
%run evaluate.py ca-HepTh-k20.txt CA-HepTh-k20.csv

reading graphfile ca-HepTh-k20.txt
objective = 934.7647058823529
