#  Matching Catalogs by Members ID

---
**License**

 Matching Catalogs by Members ID

 Tue Aug 24 10:30:00 2021\
 Copyright  2021\
 Cinthia Nunes de Lima <cinthia.n.lima@uel.br> \ Sandro Dias Pinto Vitenti <vitenti@uel.br>

---
---

 Ascaso_Extended_cDC2_RM_Test\
 Copyright (C) 2021 Cinthia Nunes de Lima <cinthia.n.lima@uel.br>, Sandro Dias Pinto Vitenti <vitenti@uel.br>

 numcosmo is free software: you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
 Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 numcosmo is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 See the GNU General Public License for more details.

 You should have received a copy of the GNU General Public License along
 with this program.  If not, see <http://www.gnu.org/licenses/>.
 
---

## Libraries & Configs

In [1]:
import math
import sys

import os
import numpy as np
import matplotlib.pyplot as plt

from astropy.io import fits
from astropy.table import Table
import pandas as pd

sys.path.insert(0, "/global/homes/c/cinlima/gcrcatalogs-new/gcr-catalogs")
sys.path.insert(0, "/global/homes/c/cinlima/")
sys.path.insert(0, "/global/homes/c/cinlima/NumCosmo/notebooks/richness_proxy/MachingCatalogs")


import GCRCatalogs
# Configure the catalog root directory by site name; "nersc" is used here.
# NOTE(review): presumably this must be changed when running outside NERSC -- confirm.
GCRCatalogs.set_root_dir_by_site("nersc")

# Print the GCRCatalogs and GCR versions in use.
print('GCRCatalogs =', GCRCatalogs.__version__, '|' ,'GCR =', GCRCatalogs.GCR.__version__)


from numcosmo_py import Nc, Ncm

from mtdc2 import mtrmdc2

import time

# GCRCatalogs.get_available_catalog_names()


GCRCatalogs = 1.9.0 | GCR = 0.9.2


## Catalogs

In [2]:
# Catalog names as passed to GCRCatalogs.load_catalog.
cosmodc2_name = 'cosmoDC2_v1.1.4_image'
redmapper_name = 'cosmoDC2_v1.1.4_redmapper_v0.8.1'

# cosmoDC2 catalog (source of halo_mass, halo_id, galaxy_id, ...).
cosmodc2 = GCRCatalogs.load_catalog(cosmodc2_name)

# cosmoDC2 + redMaPPer catalog (source of cluster and member quantities).
catalog_dc2_redmapper = GCRCatalogs.load_catalog(redmapper_name)

In [3]:
# Selection cuts on richness, halo mass and redshift.
min_richness = 5
min_halo_mass = 1e13  # Msun
redshift_max = 1.2

# cDC2 + RM data: split the available quantities into cluster-level and
# member-level ones (member quantities are those with 'member' in the name).
quantities = catalog_dc2_redmapper.list_all_quantities()
cluster_quantities = [q for q in quantities if 'member' not in q]
member_quantities = [q for q in quantities if 'member' in q]

# Only the cluster table is cut on richness; the member table is read in full.
query = GCRCatalogs.GCRQuery(f'(richness > {min_richness})')
cluster_data = Table(catalog_dc2_redmapper.get_quantities(cluster_quantities, [query]))  # cDC2 + RM cluster data
member_data = Table(catalog_dc2_redmapper.get_quantities(member_quantities))  # cDC2 + RM member data

# cDC2 data: galaxies in halos above the mass cut and below the redshift cut.
# The previous query also contained '(is_central < redshift_max)', which compared
# a boolean flag with the redshift limit and was therefore always true; it has
# been removed. 'is_central' is still loaded because central galaxies are
# selected from the matched catalog with `is_central == True`.
quantities_wanted = ['redshift', 'halo_mass', 'halo_id', 'galaxy_id', 'ra', 'dec', 'is_central']
query = GCRCatalogs.GCRQuery(f'(halo_mass > {min_halo_mass}) & (redshift < {redshift_max})')
truth_data = Table(cosmodc2.get_quantities(quantities_wanted, [query]))

In [1]:
# truth_data[truth_data['is_central'] == True]

# cluster_data

## Matching cDC2 and cDC2 + RM Catalogs

In [4]:
# Convert the astropy tables to pandas and give both galaxy-ID columns the
# common name 'id' so the two catalogs can be joined on it.
memberdf = member_data.to_pandas()
memberdf = memberdf.rename(columns={'id_member': 'id'})  # cDC2 + RM members: id_member -> id
truthdf = truth_data.to_pandas()
truthdf = truthdf.rename(columns={'galaxy_id': 'id'})  # cDC2 galaxies: galaxy_id -> id

# cDC2 + RM cluster dataframe restricted to 'richness', 'cluster_id' and 'redshift'.
cluster_columns = ['richness', 'cluster_id', 'redshift']
cluster_df = cluster_data.to_pandas()[cluster_columns]

In [None]:
# Display the cDC2 + RM member dataframe for inspection.
memberdf

In [None]:
# Display the cDC2 dataframe for inspection.
truthdf

In [None]:
# Display the cDC2 + RM cluster dataframe for inspection.
cluster_df

In [None]:
# Galaxy IDs present in both the cDC2 + RM member table and the cDC2 table.
# IDs are de-duplicated before the join: an ID that occurs k times in memberdf
# would otherwise occur k times here and k*k times once this table is merged
# back onto memberdf, inflating any per-halo member counts built from it.
Mt_df = pd.merge(
    memberdf[['id']].drop_duplicates(),
    truthdf[['id']].drop_duplicates(),
    how='inner',
    on='id',
)
Mt_df

In [None]:
# Keep the member rows whose galaxy ID appears in Mt_df (inner join on 'id').
mt_member = memberdf.merge(Mt_df, on='id', how='inner')
mt_member

In [None]:
# Member ID / cluster ID pairs, with 'cluster_id_member' renamed to 'cluster_id'
# so it shares its name with the cluster table's key.
member_cluster_columns = ['id', 'cluster_id_member']
idc = mt_member[member_cluster_columns]
idc = idc.rename(columns={'cluster_id_member': 'cluster_id'})
idc

In [None]:
# Attach the cluster columns to every (member, cluster) pair in idc
# (inner join on 'cluster_id').
mt_cluster = idc.merge(cluster_df, on='cluster_id', how='inner')
mt_cluster

In [None]:
# Matched catalog of clusters and halos: join the member/cluster rows with the
# cDC2 rows on the galaxy 'id'. Both inputs carry a 'redshift' column, so the
# result holds them under pandas' default '_x' / '_y' suffixes.
mt_catalog = mt_cluster.merge(truthdf, on='id', how='inner')

In [None]:
# IDs of the clusters that have at least one matched member flagged as central.
central_rows = mt_catalog['is_central'] == True
clusters_id = mt_catalog.loc[central_rows, 'cluster_id'].unique()

clusters_id

In [None]:
# For every row, count how many rows of mt_catalog share its
# (cluster_id, halo_id) pair and store that count in the 'freq' column.
halo_counts = mt_catalog.groupby(['cluster_id', 'halo_id'])['halo_id'].transform('count')
mt_catalog['freq'] = halo_counts

# Preview the first 15 rows.
mt_catalog.head(15)

In [None]:
# Rows of the matched catalog whose galaxy is flagged as central.
central_mask = mt_catalog['is_central'] == True
iscentral = mt_catalog.loc[central_mask]

iscentral

In [None]:
# Build the unique matching catalog: for every cluster in clusters_id keep the
# central rows whose halo-ID frequency ('freq') is the largest within that cluster.
# The grouping is computed once and the selected pieces are concatenated in a
# single call; re-grouping and growing the dataframe inside the loop copied all
# previously accumulated rows on every iteration.
central_groups = iscentral.groupby('cluster_id')
best_matches = []
for cl in clusters_id:
    gcut = central_groups.get_group(cl)
    best_matches.append(gcut[gcut['freq'] == gcut['freq'].max()])

# pd.concat raises on an empty list, so fall back to an empty dataframe when
# there are no clusters to match.
if best_matches:
    match_dataframe = pd.concat(best_matches, ignore_index=True)
else:
    match_dataframe = pd.DataFrame()


## Time measurement

In [None]:
# Time a single call to mtrmdc2 on the member, cluster and truth tables.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() follows the wall clock and can be
# shifted by system clock adjustments.
inicio = time.perf_counter()

mt_dataframe = mtrmdc2(member_data, cluster_data, truth_data)

fim = time.perf_counter()
tempo_total = fim - inicio

# ':.3f' prints seconds with three decimals; the previous ':.3' meant three
# significant digits and switched to exponent notation for long runs.
print(f"Tempo de execução: {tempo_total:.3f}s")