In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from easyquery import Query

In [None]:
# Location of the parquet file read into `hosts_and_sats`; left empty here and
# must be filled in before the notebook can run.
VSDMPL_DATA = "" # path to parquet

In [None]:
# Load the combined host/satellite catalogue.
hosts_and_sats = pd.read_parquet(VSDMPL_DATA)

# Tag every row with the number of rows sharing its `upid`.
rows_per_upid = hosts_and_sats.groupby('upid')['upid'].transform('count')
hosts_and_sats['num_satellites'] = rows_per_upid

# Spot-check one system by its `upid`.
hosts_and_sats.loc[hosts_and_sats['upid'].eq(12507653347)]

<h2>Add corrected positions</h2>

- if x - x_host > box_length/2, shift x by -box_length

- if x - x_host < -box_length/2, shift x by +box_length (the same rule is applied to y and z)

<h4>Add host x, y, z</h4>

In [None]:
# Rows with upid == -1 are taken as hosts; everything else as satellites.
hosts, sats = Query("upid == -1").split(hosts_and_sats)

# Attach each satellite's host row (host columns get a "_host" suffix),
# then order the satellites by the host they belong to.
sats = sats.merge(
    hosts,
    how="left",
    left_on="upid",
    right_on="id",
    suffixes=("", "_host"),
)
sats = sats.sort_values(by='upid')
sats

<h4>Add adjusted positions x_adj, y_adj, z_adj</h4>

In [None]:
# Box side length used for the wrap-around correction
# (presumably in the same units as x/y/z -- confirm against the catalogue).
box_len = 160
half_box = box_len / 2

# For each axis, move a satellite by one box length when it sits more than
# half a box away from its host, so that satellite-host offsets are the
# shortest ones. The shift uses `box_len` itself (not a repeated literal) so
# that the threshold and the shift can never get out of sync.
for axis in ('x', 'y', 'z'):
    offset = sats[axis] - sats[axis + '_host']
    adjusted = sats[axis].mask(offset > half_box, sats[axis] - box_len)
    adjusted = adjusted.mask(offset < -half_box, sats[axis] + box_len)
    # Rows with a missing host position compare False and keep their raw value.
    sats[axis + '_adj'] = adjusted

<h2>Make cuts to get SAGA-like data</h2>
<h4>Current cuts</h4>

- vmax_mpeak >= 52.77 (the threshold used in the code below; chosen from the vmax_mpeak vs. # satellites relation, using the SAGA average # satellites)

- 3 or more satellites

<h4>Unused cuts</h4>

- mvir >= 1e10

In [None]:
# Unused alternative: keep only the most massive objects.
# sats = df[df['mvir'] >= 1e10]

# vmax_mpeak stands in for luminosity: keep satellites at or above the threshold.
passes_vmax_cut = sats['vmax_mpeak'] >= 52.77
sats_cut = sats[passes_vmax_cut]

# Drop every system left with fewer than 3 satellites after the cut.
sats_cut = sats_cut.groupby(['upid']).filter(lambda members: len(members) >= 3)
sats_cut

In [None]:
# Mean number of surviving satellites per distinct `upid` in the cut sample.
n_systems = sats_cut['upid'].nunique(dropna=False)
average_num_sats = len(sats_cut) / n_systems
average_num_sats

In [None]:
# Scratch check: per-system satellite counts for a vmax_mpeak >= 50 threshold.
test_sats_cut = sats_cut.copy()
above_50 = test_sats_cut[test_sats_cut['vmax_mpeak'] >= 50]
test_sats_cut['test_column'] = above_50.groupby('upid')['upid'].transform('count')

# Counts for rows whose system has more than 2 members.
average = test_sats_cut.loc[test_sats_cut['test_column'] > 2, 'test_column']
average

<h2>Ellipticity calculations</h2>

In [None]:
# Satellite sample (after the cuts above) that feeds the ellipticity calculation.
data = sats_cut

<h4>Quadrupole moments</h4>

In [None]:
# Per-satellite offsets from the host in the x-y plane, using the adjusted
# positions.
ellipticity_df = data.copy()
dx = ellipticity_df['x_adj'] - ellipticity_df['x_host']
dy = ellipticity_df['y_adj'] - ellipticity_df['y_host']
ellipticity_df['x_diff_sq'] = dx**2
ellipticity_df['y_diff_sq'] = dy**2
ellipticity_df['x_diff_y_diff'] = dy * dx

# Quadrupole moments: per-system means of the offset products.
# The columns are selected with a *list*; selecting several columns with a
# bare tuple (`[...]['a', 'b']`) was deprecated in pandas 1.0 and fails on
# pandas 2.x.
ellipticity_df = (
    ellipticity_df
    .groupby('upid', as_index=False)[['x_diff_sq', 'y_diff_sq', 'x_diff_y_diff']]
    .mean()
    .rename(columns={'x_diff_sq': 'Q_xx', 'y_diff_sq': 'Q_yy', 'x_diff_y_diff': 'Q_xy'})
)
ellipticity_df


<h4>Ellipticity components</h4>

In [None]:
def _ellipticity_denominator(Q_xx, Q_yy, Q_xy):
    """Return Q_xx + Q_yy + 2*sqrt(Q_xx*Q_yy - Q_xy**2), shared by e_1 and e_2."""
    det = Q_xx * Q_yy - Q_xy**2
    # For second moments det >= 0 mathematically, but round-off can push it
    # slightly below zero for (near-)collinear systems. Clip at zero so those
    # systems get a finite ellipticity instead of NaN (or a complex number for
    # plain floats).
    return Q_xx + Q_yy + 2 * np.sqrt(np.maximum(det, 0))


def e_1 (Q_xx, Q_yy, Q_xy):
    """First ellipticity component, (Q_xx - Q_yy) / denominator.

    Accepts scalars or element-wise aligned arrays / pandas Series.
    """
    return (Q_xx - Q_yy) / _ellipticity_denominator(Q_xx, Q_yy, Q_xy)


def e_2 (Q_xx, Q_yy, Q_xy):
    """Second ellipticity component, 2*Q_xy / denominator.

    Accepts scalars or element-wise aligned arrays / pandas Series.
    """
    return 2 * Q_xy / _ellipticity_denominator(Q_xx, Q_yy, Q_xy)


def e_3 (e_1, e_2):
    """Magnitude of the ellipticity, sqrt(e_1**2 + e_2**2)."""
    return np.sqrt(e_1**2 + e_2**2)


# Evaluate the two ellipticity components and their magnitude for every system.
q_xx, q_yy, q_xy = (ellipticity_df[col] for col in ('Q_xx', 'Q_yy', 'Q_xy'))
ellipticity_df['e_1'] = e_1(q_xx, q_yy, q_xy)
ellipticity_df['e_2'] = e_2(q_xx, q_yy, q_xy)
ellipticity_df['e_3'] = e_3(ellipticity_df['e_1'], ellipticity_df['e_2'])

In [None]:
# Dark theme applied on top of seaborn's "darkgrid" style.
# Earlier alternative, kept for reference:
# sns.set(style="darkgrid", palette="viridis")
background = "212946"
foreground = "0.9"

custom_style = {
    # backgrounds
    "figure.facecolor": background,
    "axes.facecolor": background,
    "savefig.facecolor": background,
    # grid
    "grid.color": "2A3459",
    "grid.linestyle": "-",
    # text, labels and ticks
    "text.color": foreground,
    "axes.labelcolor": foreground,
    "xtick.color": foreground,
    "ytick.color": foreground,
    "font.family": "Bitstream Vera Sans",
    # lines
    "lines.solid_capstyle": "round",
}

sns.set_style("darkgrid", custom_style)

In [None]:
# Density-normalised distribution of the ellipticity magnitude, drawn as an unfilled polygon.
sns.histplot(
    data=ellipticity_df,
    x="e_3",
    bins=7,
    element='poly',
    stat="density",
    fill=False,
)

In [None]:
# Sanity check: do the most elongated systems (e_3 >= 0.9) simply have fewer satellites?
is_high_ellipticity = ellipticity_df['e_3'] >= 0.9
high_ellipticity_halo_ids = ellipticity_df.loc[is_high_ellipticity, 'upid'].unique()
high_ellipticity_data = data[data['upid'].isin(high_ellipticity_halo_ids)]

# Satellites per system within the high-ellipticity subset.
high_ellipticity_halos = high_ellipticity_data.groupby(['upid'])['id'].count()
average_count_high_ellipticity = high_ellipticity_halos.mean()
print("Average number of satellites per halo for the high-eccentricity data: {}".format(average_count_high_ellipticity))

# Same statistic over the full sample, for comparison.
all_halos = data.groupby(['upid'])['id'].count()
average_count_all = all_halos.mean()
print("Average number of satellites per halo for entire data set: {}".format(average_count_all))


<h4>Store ellipticity DataFrame to use in other notebooks</h4>

In [None]:
# Alias the ellipticity table under a simulation-specific name and persist it
# with IPython's %store magic, so another notebook can restore it with
# `%store -r ellipticity_df_VSMDPL`.
ellipticity_df_VSMDPL = ellipticity_df
%store ellipticity_df_VSMDPL

In [None]:
# Render the ellipticity table with the notebook's rich display.
display(ellipticity_df)

<h2>Highest luminosity analysis</h2>

In [None]:
# Per-`upid` minimum of Mr and of num_satellites (each column is reduced
# independently). Presumably Mr is a magnitude, so the minimum is the
# brightest member -- confirm.
# Columns are selected with a list: tuple-style selection on a groupby
# (`[...]['Mr', 'num_satellites']`) fails on pandas 2.x.
highest_luminosity_sats = (
    hosts_and_sats
    .groupby('upid', as_index=False)[['Mr', 'num_satellites']]
    .min()
)

In [None]:
# Left-join the per-system ellipticities onto the per-system minimum Mr;
# systems without an ellipticity get NaN in e_3.
highest_luminosity_ellipticity = highest_luminosity_sats.merge(
    ellipticity_df,
    how="left",
    on="upid",
)
highest_luminosity_ellipticity = highest_luminosity_ellipticity[['upid', 'Mr', 'e_3']]

In [None]:
# get rid of hosts which were filtered out by 
highest_luminosity_ellipticity_VSMDPL = highest_luminosity_ellipticity[~highest_luminosity_ellipticity['e_3'].isna()]
%store highest_luminosity_ellipticity_VSMDPL
highest_luminosity_ellipticity_VSMDPL

<h2>Concat highest luminosity w/ log(vmax_mpeak)</h2>

In [None]:
# Largest vmax_mpeak among the rows sharing each `upid`, as a flat two-column frame.
highest_vmax_sats = (
    hosts_and_sats
    .groupby('upid')['vmax_mpeak']
    .max()
    .reset_index()
)

In [None]:
# Left-join the per-system ellipticities onto the per-system maximum vmax_mpeak.
highest_vmax_ellipticity = highest_vmax_sats.merge(
    ellipticity_df,
    how="left",
    on="upid",
)
highest_vmax_ellipticity = highest_vmax_ellipticity[['upid', 'vmax_mpeak', 'e_3']]
highest_vmax_ellipticity

In [None]:
# Natural logarithm (np.log) of the per-system maximum vmax_mpeak.
highest_vmax_ellipticity['log_vmax_mpeak'] = np.log(highest_vmax_ellipticity['vmax_mpeak'])

In [None]:
highest_vmax_ellipticity_VSMDPL = highest_vmax_ellipticity[~highest_vmax_ellipticity['e_3'].isna()]
%store highest_vmax_ellipticity_VSMDPL
highest_vmax_ellipticity_VSMDPL

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter of log(vmax_mpeak) against 2D ellipticity, one point per system.
fig, ax1 = plt.subplots()
ax1.set(xlabel='ellipticity (2D)', ylabel='log(vmax_mpeak)')
sp1 = sns.scatterplot(
    x='e_3',
    y='log_vmax_mpeak',
    data=highest_vmax_ellipticity_VSMDPL,
    s=2,
    ax=ax1,
)

In [None]:
# Resize the scatter figure to 6x4 inches and write it out at 600 dpi.
fig = sp1.get_figure()
fig.set_size_inches(6, 4)
fig.savefig("SimVmax_mpeakVsEllipticity.png", dpi=600)

<h2>Average # satellites per vmax_mpeak</h2>

In [None]:
# One row per `upid`: the first non-null host vmax_mpeak and satellite count in each group.
host_columns = ['vmax_mpeak_host', 'num_satellites']
hosts_num_vmax = sats.groupby('upid')[host_columns].first()
hosts_num_vmax

Bin hosts by vmax_mpeak

In [None]:
# 20 evenly spaced bin edges from 135 to 340 in host vmax_mpeak.
vmax_bin_edges = np.linspace(135, 340, 20)
vmax_bins = pd.cut(hosts_num_vmax['vmax_mpeak_host'], vmax_bin_edges)

# Mean host vmax_mpeak and mean satellite count inside each bin.
hosts_num_vmax_averages = hosts_num_vmax.groupby(vmax_bins).mean()
hosts_num_vmax_averages

In [None]:
# Mean satellite count against mean host vmax_mpeak, one point per bin.
fig2, ax2 = plt.subplots()
ax2.set_xlabel('vmax_mpeak')
ax2.set_ylabel('average # satellites')
# x and y must be passed by keyword: from seaborn 0.12 on, `data` is the only
# positional argument, so the positional form raises TypeError.
sp2 = sns.lineplot(
    x=hosts_num_vmax_averages['vmax_mpeak_host'],
    y=hosts_num_vmax_averages['num_satellites'],
    ax=ax2,
)

In [None]:
# Resize the line plot to 6x4 inches and write it out at 600 dpi.
fig2 = sp2.get_figure()
fig2.set_size_inches(6, 4)
fig2.savefig("NumSatsVsVmax_mpeak.png", dpi=600)

In [None]:
# Satellite count against host vmax_mpeak, one point per system.
fig3, ax3 = plt.subplots()
ax3.set_xlabel('vmax_mpeak')
ax3.set_ylabel('# satellites')
# x and y must be passed by keyword: from seaborn 0.12 on, `data` is the only
# positional argument, so the positional form raises TypeError.
sp3 = sns.scatterplot(
    x=hosts_num_vmax['vmax_mpeak_host'],
    y=hosts_num_vmax['num_satellites'],
    ax=ax3,
    s=2,
)

In [None]:
import scipy as sp
# Import the submodule explicitly: on older SciPy releases `import scipy`
# alone does not make `sp.stats` available.
import scipy.stats

# Two-sided Spearman rank correlation between host vmax_mpeak and satellite
# count; NaNs propagate into the result.
spearmanr_results = sp.stats.spearmanr(
    hosts_num_vmax['vmax_mpeak_host'],
    hosts_num_vmax['num_satellites'],
    axis=0,
    nan_policy='propagate',
    alternative='two-sided',
)
spearmanr_results

In [None]:
# Resize the scatter figure to 6x4 inches and write it out at 600 dpi.
fig3 = sp3.get_figure()
fig3.set_size_inches(6, 4)
fig3.savefig("NumSatsVsVmax_mpeak_scatter.png", dpi=600)