#  Matching Catalogs by Members ID

---
**License**

 Matching Catalogs by Members ID

 Tue Aug 24 10:30:00 2021\
 Copyright  2021\
 Cinthia Nunes de Lima <cinthia.n.lima@uel.br> \ Sandro Dias Pinto Vitenti <vitenti@uel.br>

---
---

 Ascaso_Extended_cDC2_RM_Test\
 Copyright (C) 2021 Cinthia Nunes de Lima <cinthia.n.lima@uel.br>, Sandro Dias Pinto Vitenti <vitenti@uel.br>

 numcosmo is free software: you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
 Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 numcosmo is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 See the GNU General Public License for more details.

 You should have received a copy of the GNU General Public License along
 with this program.  If not, see <http://www.gnu.org/licenses/>.
 
---

## Packages

In [1]:
import math
import sys

import os
import numpy as np
import matplotlib.pyplot as plt

from astropy.io import fits
from astropy.table import Table, join
from astropy.coordinates import SkyCoord
import astropy.units as u

import pandas as pd

sys.path.insert(0, "/global/homes/c/cinlima/gcrcatalogs-new/gcr-catalogs")
sys.path.insert(0, "/global/homes/c/cinlima/")
sys.path.insert(0, "/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/")


import GCRCatalogs
# Select the GCRCatalogs site configuration; use "nersc" when running at NERSC.
GCRCatalogs.set_root_dir_by_site("nersc")

print('GCRCatalogs =', GCRCatalogs.__version__, '|' ,'GCR =', GCRCatalogs.GCR.__version__)


from numcosmo_py import Nc, Ncm

from match_id import MatchID as MID

import time

# GCRCatalogs.get_available_catalog_names()
print('Numpy =', np.__version__)

GCRCatalogs = 1.9.0 | GCR = 0.9.2
Numpy = 1.26.4


In [2]:
import networkx as nx
import random

In [3]:
import clevar
from clevar import ClCatalog, MemCatalog
from clevar.match import MembershipMatch, get_matched_pairs
from clevar.match import output_matched_catalog
from clevar.match_metrics.scaling import ClCatalogFuncs as s_cf

## Catalogs

In [4]:
# Read the cluster, member and truth tables from disk.
cluster_data = Table.read(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cluster_data.fits'
)
member_data = Table.read(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/member_data.fits'
)
truth_data = Table.read(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/truth_data.fits'
)

# Per-member weight: product of the four membership columns, stored as "pmem".
pmem = (
    member_data["p_member"]
    * member_data["pfree_member"]
    * member_data["theta_i_member"]
    * member_data["theta_r_member"]
)
member_data.add_column(pmem, name="pmem")

# Attach the cluster row to every member (inner join on the cluster id).
cluster_member_data = join(
    member_data[
        'id_member',
        'cluster_id_member',
        'ra_member',
        'dec_member',
        'redshift_true_member',
        'pmem',
    ],
    cluster_data,
    keys_left='cluster_id_member',
    keys_right='cluster_id',
    join_type='inner',
)

truthdf = truth_data.to_pandas()

# Rows flagged as central, and the ids of the halos that own one.
hascentral = truthdf[truthdf['is_central'] == True]
halo_central_ids = list(hascentral['halo_id'].unique())

# The central's coordinates become the halo_ra / halo_dec / halo_redshift columns.
central_position = hascentral[['ra', 'dec', 'redshift', 'halo_id']].rename(
    columns={'ra': 'halo_ra', 'dec': 'halo_dec', 'redshift': 'halo_redshift'}
)

# Keep only rows of halos that have a central and attach that central's position.
halos_dataframe = pd.merge(
    truthdf[truthdf['halo_id'].isin(halo_central_ids)],
    central_position,
    how='inner',
    on=['halo_id'],
)

truth_data = Table.from_pandas(halos_dataframe)

In [5]:
# CLEVAR: load the previously saved catalogs, members and matching results.

# Halos/Clusters
# CosmoDC2 + RedMaPPer: 'cosmoDC2_v1.1.4_redmapper_v0.8.1'
RM_cat = ClCatalog.read(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cDC2_RM_match_cross_membership/RM_cat.fits',
    name='RM_cat',
    full=True,
)  # tags={'mass':'richness'}

# CosmoDC2: 'cosmoDC2_v1.1.4_image'
halo_cat = ClCatalog.read(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cDC2_RM_match_cross_membership/halo_cat.fits',
    name='halo_cat',
    full=True,
)  # tags={'mass':'mass_fof'}

# Members
RM_cat.read_members(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cDC2_RM_match_cross_membership/RM_cat_members.fits',
    full=True,
)
halo_cat.read_members(
    '/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cDC2_RM_match_cross_membership/halo_cat_members.fits',
    full=True,
)

# Match catalogs: reload the membership-matching results stored in out_dir.
mt = MembershipMatch()

mt.load_matches(
    RM_cat,
    halo_cat,
    out_dir='/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/CatalogsMatching/cDC2_RM_match_cross_membership/',
)


RM_cat
    * ClEvar used in matching: 0.14.1
 * Total objects:    45,081
 * multiple (self):  43,106
 * multiple (other): 43,106
 * unique (self):    42,914
 * unique (other):   42,917
 * cross:            42,914

halo_cat
    * ClEvar used in matching: 0.14.1
 * Total objects:    249,557
 * multiple (self):  74,474
 * multiple (other): 74,474
 * unique (self):    42,917
 * unique (other):   42,914
 * cross:            42,914


# Match

### functions

In [6]:
def get_members_number(catalog, object_tag):
    """Count the rows (members) belonging to each object of a catalog.

    Parameters
    ----------
    catalog : pandas.DataFrame
        Member table with one row per member.
    object_tag : str
        Name of the column identifying the object each member belongs to.

    Returns
    -------
    tuple
        ``(counts, catalog_with_counts)``: the ``value_counts`` Series of
        ``object_tag`` and the catalog inner-merged with those counts on
        ``object_tag``.
    """
    counts_per_object = catalog[object_tag].value_counts()
    counts_frame = counts_per_object.to_frame()
    catalog_with_counts = counts_frame.merge(catalog, how='inner', on=[object_tag])

    return counts_per_object, catalog_with_counts

def get_shared_members_fraction_catalog(matched_catalog):
    """Compute shared-member statistics for every (cat1_id, cat2_id) pair.

    Parameters
    ----------
    matched_catalog : pandas.DataFrame
        Member-level table with columns ``cat1_id``, ``cat2_id``, ``pmem``,
        ``nmem_cat1`` and ``nmem_cat2``.

    Returns
    -------
    tuple of pandas.Series
        ``(shared_count, frac_catalog1, frac_catalog2)``, each aligned with the
        rows of ``matched_catalog``:

        * ``shared_count``: number of rows in the pair's group;
        * ``frac_catalog1``: sum of ``pmem`` over the pair divided by ``nmem_cat1``;
        * ``frac_catalog2``: ``shared_count`` divided by ``nmem_cat2``.
    """
    by_pair = matched_catalog.groupby(['cat1_id', 'cat2_id'])

    shared_count = by_pair['cat2_id'].transform('count')
    pmem_per_pair = by_pair['pmem'].transform('sum')

    # The cat1 fraction is weighted by pmem; the cat2 fraction is a plain count ratio.
    frac_catalog1 = pmem_per_pair / matched_catalog['nmem_cat1']
    frac_catalog2 = shared_count / matched_catalog['nmem_cat2']

    return shared_count, frac_catalog1, frac_catalog2


def load_catalogs(catalog1, catalog2, id_columns1, id_columns2):
    """Convert two member tables to pandas and attach per-object member totals.

    Parameters
    ----------
    catalog1, catalog2 : objects exposing ``to_pandas()``
        Member tables of the first and second catalog.
    id_columns1, id_columns2 : dict
        Column renaming maps applied after conversion; the renamed tables must
        provide ``cat1_id``/``pmem`` and ``cat2_id`` respectively.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cat1, cat2_nmem)``. ``cat1`` gains ``nmem_cat1``, the sum of ``pmem``
        over each ``cat1_id``. ``cat2_nmem`` is the second table merged with
        its per-``cat2_id`` row count, stored as ``nmem_cat2``.
    """
    cat1 = catalog1.to_pandas().rename(columns=id_columns1)
    cat2 = catalog2.to_pandas().rename(columns=id_columns2)

    # Second catalog: plain number of members per object.
    _, cat2_with_counts = get_members_number(cat2, 'cat2_id')
    cat2_nmem = cat2_with_counts.rename(columns={'count': 'nmem_cat2'})

    # First catalog: members are weighted by pmem rather than counted.
    cat1['nmem_cat1'] = cat1.groupby(['cat1_id'])['pmem'].transform('sum')

    return cat1, cat2_nmem

def match_by_id(catalog1, catalog2):
    """Match two member tables on their shared ``id`` column.

    Parameters
    ----------
    catalog1, catalog2 : pandas.DataFrame
        Member tables as returned by ``load_catalogs``.

    Returns
    -------
    pandas.DataFrame
        Inner merge on ``id`` with four extra columns: ``shared_num``,
        ``shared_frac_cat1``, ``shared_frac_cat2`` and ``cross_frac``, where
        ``cross_frac = shared_frac_cat1 * (shared_frac_cat1 + shared_frac_cat2) / 2``.
    """
    matched_catalog = pd.merge(catalog1, catalog2, how='inner', on=['id'])

    shared_num, frac_cat1, frac_cat2 = get_shared_members_fraction_catalog(matched_catalog)
    matched_catalog['shared_num'] = shared_num
    matched_catalog['shared_frac_cat1'] = frac_cat1
    matched_catalog['shared_frac_cat2'] = frac_cat2

    # Work on plain arrays so the combination is purely positional.
    frac1_values = np.array(matched_catalog['shared_frac_cat1'])
    frac2_values = np.array(matched_catalog['shared_frac_cat2'])
    matched_catalog['cross_frac'] = frac1_values * (frac1_values + frac2_values) / 2

    return matched_catalog

def get_max_fraction_candidates(object_data, canditate_object_tag, shared_fraction_option = 'shared_frac_cat1'):
    """Return the candidates that reach the highest shared fraction.

    Parameters
    ----------
    object_data : pandas.DataFrame
        Rows describing the candidates of a single object.
    canditate_object_tag : str
        Column holding the candidate identifiers.
    shared_fraction_option : str, optional
        Column whose maximum selects the candidates.

    Returns
    -------
    numpy.ndarray
        Unique candidate identifiers whose fraction equals the maximum
        (more than one when there is a tie).
    """
    fractions = object_data[shared_fraction_option]
    is_best = fractions == fractions.max()

    return object_data.loc[is_best, canditate_object_tag].unique()




In [7]:
def matching(catalog1, catalog2, option= 'left'):
    """Match two member catalogs and keep the best candidate per object.

    The catalogs are matched member by member with ``match_by_id``. The
    (cat1_id, cat2_id) pairs are then ranked by decreasing ``cross_frac``
    within each object of the chosen side.

    Parameters
    ----------
    catalog1, catalog2 : pandas.DataFrame
        Member tables as returned by ``load_catalogs``.
    option : {'left', 'right'}, optional
        ``'left'`` returns one row per ``cat1_id`` and ``'right'`` one row per
        ``cat2_id``.

    Returns
    -------
    pandas.DataFrame
        One row per object of the chosen side: its highest-``cross_frac``
        pair plus a ``matches_left`` / ``matches_right`` column listing all
        candidates of the other catalog, ordered by decreasing ``cross_frac``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'left'`` nor ``'right'``. Cross matching is
        not implemented in this function.
    """
    if option == 'left':
        own_tag, other_tag = 'cat1_id', 'cat2_id'
    elif option == 'right':
        own_tag, other_tag = 'cat2_id', 'cat1_id'
    else:
        raise ValueError(
            f"Unsupported option {option!r}: expected 'left' or 'right'."
        )

    matched_catalog = match_by_id(catalog1, catalog2)

    match_info = matched_catalog[['id', 'cat1_id', 'nmem_cat1', 'cat2_id', 'shared_num', 'shared_frac_cat1', 'shared_frac_cat2', 'cross_frac']].copy()

    # One row per pair, best candidate first within each object.
    ranked_pairs = match_info.sort_values(
        by=[own_tag, 'cross_frac'], ascending=[True, False]
    ).drop_duplicates(subset=[own_tag, other_tag], keep='first')

    # Ordered candidate list per object, named after the requested side.
    candidates = (
        ranked_pairs.groupby(own_tag)[other_tag]
        .agg(list)
        .reset_index()
        .rename(columns={other_tag: f'matches_{option}'})
    )

    # Rows are already ranked, so the first row per object is its best match.
    return ranked_pairs.merge(candidates, on=own_tag, how='left').drop_duplicates(
        subset=[own_tag], keep='first'
    )

### test

In [8]:
# Map the catalog-specific column names onto the generic cat1_* / cat2_* names.
cat1_columns = {
    'id_member': 'id',
    'cluster_id': 'cat1_id',
    'cluster_ra': 'cat1_ra',
    'cluster_dec': 'cat1_dec',
    'cluster_redshift': 'cat1_redshift',
}
cat2_columns = {
    'galaxy_id': 'id',
    'halo_id': 'cat2_id',
    'halo_ra': 'cat2_ra',
    'halo_dec': 'cat2_dec',
    'halo_redshift': 'cat2_redshift',
}

catalogs = load_catalogs(cluster_member_data, truth_data, cat1_columns, cat2_columns)

# load_catalogs returns (cluster members, halo members with counts).
cat_clusters, cat_halos = catalogs[0], catalogs[1]

In [9]:
# Member-level match of cluster members against halo members on the shared galaxy id.
matched_catalog = match_by_id(cat_clusters, cat_halos)
# matched_catalog

In [10]:
# One row per (cat1_id, cat2_id) pair, keeping only the pair-level columns.
match_info = (
    matched_catalog[
        [
            'cat1_id',
            'cat2_id',
            'is_central',
            'richness',
            'halo_mass',
            'nmem_cat1',
            'nmem_cat2',
            'shared_num',
            'shared_frac_cat1',
            'shared_frac_cat2',
            'cross_frac',
        ]
    ]
    .copy()
    .drop_duplicates(subset=['cat1_id', 'cat2_id'])
)
match_info

Unnamed: 0,cat1_id,cat2_id,is_central,richness,halo_mass,nmem_cat1,nmem_cat2,shared_num,shared_frac_cat1,shared_frac_cat2,cross_frac
0,1,2050900064279,False,256.669159,1.896877e+15,256.662476,603,275,0.942527,0.456053,0.659099
191,1,2608400064279,False,256.669159,3.202339e+13,256.662476,25,1,0.000822,0.040000,0.000017
276,2,203600066421,False,232.673080,6.446309e+14,232.586365,284,95,0.376728,0.334507,0.133971
277,2,46800066421,False,232.673080,6.446309e+14,232.586365,526,155,0.590698,0.294677,0.261495
290,2,52000079421,False,232.673080,2.284268e+13,232.586365,21,1,0.000344,0.047619,0.000008
...,...,...,...,...,...,...,...,...,...,...,...
542701,102670,2077800044300,False,6.060438,5.261578e+13,6.061342,46,8,0.818482,0.173913,0.406129
542706,102670,910700044300,False,6.060438,2.043040e+13,6.061342,17,1,0.137078,0.058824,0.013427
542710,102685,1044700191253,False,5.282434,1.194697e+13,5.276836,22,1,0.003281,0.045455,0.000080
542711,102685,1211900191272,False,5.282434,2.307428e+13,5.276836,20,1,0.007575,0.050000,0.000218


### Cross 

In [11]:
# Flatten the pair table into plain (cat1_id, cat2_id, cross_frac) tuples.
match_tuples = list(match_info[['cat1_id', 'cat2_id', 'cross_frac']].itertuples(index=False, name=None))
# match_tuples

In [None]:
# Toy input for quick experiments:
# pairs = [(1,1,0.6), (2,1,0.4), (2,2,0.2),(3,2,0.01), (2,3,0.5)]

pairs = match_tuples

# Bipartite graph: cat1 objects are ("A", id) nodes, cat2 objects are ("B", id)
# nodes, and every candidate pair is an edge weighted by its cross_frac.
G = nx.Graph()
for a,b,w in pairs:
    G.add_edge(("A",a), ("B",b), weight=w)

random.seed(42)

# --- parameters --------------------------------------------------------------
# NOTE(review): these values are not used by the matching below; they are kept
# so the notebook state stays unchanged.
n_components = 5
nodes_per_side = 10     # per component
edge_prob = 0.3         # sparsity inside each component
# ------------------------------------------------------------------------------

total_weight = 0.0
global_matching = set()

# Solve the maximum-weight matching independently in each connected component
# and accumulate the selected pairs.
for i, nodes in enumerate(nx.connected_components(G)):
    subG = G.subgraph(nodes)
    m = nx.max_weight_matching(subG, maxcardinality=False)
    w = sum(subG[node_1][node_2]['weight'] for node_1, node_2 in m)
    print("\nNodes in component:")
    for a in subG:
        if a[0] == 'A':
            for b in subG[a]:
                if b[0] == 'B':
                    print(a, b, subG[a][b]["weight"])
    total_weight += w
    global_matching |= m
    print(f"Component {i}: {len(subG)} nodes, {len(m)} pairs, weight = {w:.3f}")
    # The loop variables are deliberately not named `u`/`v`: a module-level
    # loop over `u` would overwrite the `astropy.units as u` alias.
    for node_1, node_2 in m:
        # Matched pairs come in arbitrary order; print the "A" node first.
        if node_1[0] == "A":
            a, b = node_1, node_2
        else:
            a, b = node_2, node_1
        print(a, b)


print(f"\nTotal matching weight = {total_weight:.3f}")
print(f"Total pairs matched   = {len(global_matching)}")




Nodes in component:
('A', 1) ('B', 2050900064279) 0.6590994542677545
('A', 1) ('B', 2608400064279) 1.6780873865661028e-05
Component 0: 3 nodes, 1 pairs, weight = 0.659
('A', 1) ('B', 2050900064279)

Nodes in component:
('A', 2) ('B', 203600066421) 0.13397109276262326
('A', 2) ('B', 46800066421) 0.2614948743164729
('A', 2) ('B', 52000079421) 8.245040524185033e-06
('A', 2) ('B', 216100066411) 0.00025407253793325034
Component 1: 5 nodes, 1 pairs, weight = 0.261
('A', 2) ('B', 46800066421)

Nodes in component:
('A', 3) ('B', 9600137401) 0.0544913412285935
('A', 3) ('B', 101900137401) 0.1091677378944588
('A', 3) ('B', 173200123401) 0.0011487088851954042
('A', 3) ('B', 192400137401) 0.10376146503963254
('A', 3) ('B', 124000137401) 2.1733811361135563e-07
('A', 3) ('B', 210600123401) 0.0023401314279218074
('A', 3) ('B', 334000137401) 0.00021255498334439574
('A', 35740) ('B', 9600137401) 0.0002178853109613124
('A', 35740) ('B', 214200137382) 0.6734618834033501
('A', 34020) ('B', 101900137401) 

In [None]:
# global_matching

# Each matched pair is a tuple of two (side, id) nodes. dict() accepts such a
# sequence of (key, value) pairs, so every pair becomes one {"A": ..., "B": ...} row.
rows = [dict(row_data) for row_data in global_matching]

# Build the DataFrame
df = pd.DataFrame(rows)

In [None]:
# Rename the graph side labels back to the catalog id column names.
side_to_column = {'A': 'cat1_id', 'B': 'cat2_id'}
df_ab = df.rename(columns=side_to_column)

In [None]:
# Keep, from the pair-level matched catalog, only the pairs selected by the graph matching.
match = pd.merge(matched_catalog.drop_duplicates(subset=['cat1_id', 'cat2_id']), df_ab, how='inner', on=['cat1_id', 'cat2_id'])

In [None]:
# m.columns

# Display the selected pairs.
match

In [None]:
# Log halo mass versus log richness for the selected pairs. The merged table
# is `match`; `m` holds the networkx matching set and has no such columns.
plt.scatter(np.log(match['halo_mass']), np.log(match['richness']))

In [None]:
# Inspect the entry with id '1597' (compared as a string) in RM_cat_bij.
# NOTE(review): RM_cat_bij is only assigned in later cells — confirm execution order.
RM_cat_bij[RM_cat_bij['id'] == '1597']

In [None]:
# All candidate pairs of cat1 object 1597.
match_info[match_info['cat1_id'] == 1597]

In [None]:
# All candidate pairs of cat2 object 1570200057279.
match_info[match_info['cat2_id'] == 1570200057279]

### Draft (scratch work)

#### Left and Right

In [None]:
# Rank the candidates of every cat1 object by decreasing cross_frac (one row per pair).
match_info_left = match_info.sort_values(by = ['cat1_id', 'cross_frac'], ascending= [True, False]).drop_duplicates(subset=['cat1_id', 'cat2_id'], keep='first')

# Per cat1 object, the ordered list of candidate cat2 ids.
matches_left = (match_info_left.groupby('cat1_id')['cat2_id'].agg(list).reset_index())
matches_left = matches_left.rename(columns={'cat2_id': 'matches_left'})

# Keep the top-ranked row of each cat1 object and attach its candidate list.
left = match_info_left.merge(matches_left, on='cat1_id', how='left').drop_duplicates(subset = ['cat1_id'], keep = 'first')

In [None]:
# Preview the first three rows of the left table.
left.head(3)

In [None]:
# Rank the candidates of every cat2 object by decreasing cross_frac (one row per pair).
match_info_sorted_right = match_info.sort_values(by = ['cat2_id', 'cross_frac'], 
                                                 ascending= [True, False]).drop_duplicates(subset=['cat2_id', 'cat1_id'], keep='first')

# Per cat2 object, the ordered list of candidate cat1 ids.
matches_right = (match_info_sorted_right.groupby('cat2_id')['cat1_id'].agg(list).reset_index())
matches_right = matches_right.rename(columns={'cat1_id': 'matches_right'})

# Keep the top-ranked row of each cat2 object and attach its candidate list.
right = match_info_sorted_right.merge(matches_right, on='cat2_id', how='left').drop_duplicates(subset = ['cat2_id'], keep = 'first')

In [None]:
# Preview the first three rows of the right table.
right.head(3)

In [None]:
# Pairs ClEvaR selects with the 'cross' matching type.
RM_cat_bij, halo_cat_bij = get_matched_pairs(RM_cat, halo_cat, 'cross')

# Rows of the graph-matching result whose cat1_id is absent from ClEvaR's list.
# `match` is the merged DataFrame; `m` is the networkx matching set.
# NOTE(review): another cell compares RM_cat_bij['id'] with a string; confirm
# the id dtypes agree, otherwise np.isin never finds a match.
diff = np.isin(match['cat1_id'], RM_cat_bij['id'], invert=True)

match[diff]

In [None]:
# Pairs ClEvaR selects with the 'cross' matching type.
RM_cat_bij, halo_cat_bij = get_matched_pairs(RM_cat, halo_cat, 'cross')

# Rows of the graph-matching result whose cat2_id is absent from ClEvaR's list.
# `match` is the merged DataFrame; `m` is the networkx matching set.
# NOTE(review): another cell compares halo_cat_bij['id'] with a string; confirm
# the id dtypes agree, otherwise np.isin never finds a match.
diff = np.isin(match['cat2_id'], halo_cat_bij['id'], invert=True)

match[diff]

In [None]:
# Pair selected by the graph matching for cat1 object 1597 (`match` is the merged DataFrame).
match[match['cat1_id'] ==  1597]

In [None]:
# # max_weight_matching returns a set of node pairs
# m = nx.algorithms.matching.max_weight_matching(G, maxcardinality=False)

# matching = []
# total = 0.0
# for u,v in m:
#     # u,v are nodes like ("A","a1") and ("B","b1")
#     if u[0]=="A":
#         a,u_b = u[1], v[1]
#     else:
#         a,u_b = v[1], u[1]
#     w = G[u][v]['weight']
#     matching.append((a, u_b, w))
#     total += w

# print("matching:", matching)
# print("total weight:", total)


In [None]:
# View the right table ordered by cat1_id.
right.sort_values(by = ['cat1_id'])

In [None]:
# Attach to each right-side best match the candidate list of its cat1 object from the left table.
pd.merge(right.sort_values(by = ['cat1_id', 'cross_frac'], ascending= [True, False] ), left[['cat1_id', 'matches_left']], on = 'cat1_id', how= 'left')

In [None]:
# Display the left table.
left

In [None]:
# Right-side rows whose best match is cat1 object 33.
right[right['cat1_id'] ==  33]

In [None]:
# Right-side rows whose best match is cat1 object 48.
right[right['cat1_id'] ==  48]

In [None]:
# Left-side row of cat1 object 48.
left[left['cat1_id'] ==  48]

In [None]:
# Pairs ClEvaR selects with the 'cat2' matching type (overwrites the earlier 'cross' result).
RM_cat_bij, halo_cat_bij = get_matched_pairs(RM_cat, halo_cat, 'cat2')

In [None]:
# Best (cat1_id, cat2_id) pairs from the left table.
df1 = left[['cat1_id', 'cat2_id']]

In [None]:
# The same two columns built from ClEvaR's output: the cluster id and its 'mt_other' column.
# NOTE(review): presumably 'mt_other' holds the matched halo id — confirm against ClEvaR.
df2 = pd.DataFrame({
    'cat1_id': RM_cat_bij['id'],
    'cat2_id': RM_cat_bij['mt_other']
})


In [None]:
# comparacao ("comparison"): outer merge flagging, via `_merge`, which table each pair comes from.
comparacao = pd.merge(df1, df2, how='outer', indicator=True)
# diferencas ("differences"): pairs present in only one of the two tables.
diferencas = comparacao[comparacao['_merge'] != 'both']
diferencas = diferencas.drop(columns='_merge')

In [None]:
# Display the pairs on which the two methods disagree.
diferencas

In [None]:
# Right-side row of cat2 object 193300079365.
right[right['cat2_id'] == 193300079365]

In [None]:
# Left-side row of cat1 object 576.
left[left['cat1_id'] == 576]

In [None]:
# Right-side rows whose best match is cat1 object 47.
right[right['cat1_id'] == 47]

In [None]:
# Entry with id '1' (compared as a string) in RM_cat_bij.
RM_cat_bij[RM_cat_bij['id'] == '1']

In [None]:
# Entry with id '1158700069300' (compared as a string) in halo_cat_bij.
halo_cat_bij[halo_cat_bij['id'] == '1158700069300']