# Map of the grandparents' countries of birth (1 c)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Global typography for every figure in this notebook (sizes in points).
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 7,          # base text size
    'axes.titlesize': 7,     # axes titles
    'axes.labelsize': 7,     # axis labels
    'xtick.labelsize': 6,    # x tick labels
    'ytick.labelsize': 6,    # y tick labels
    'legend.fontsize': 6,    # legend entries
    'figure.titlesize': 7,   # figure-level title
})



import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import LogNorm
import numpy as np

# Reported grandparents' countries of birth.
countries_10k = pd.read_csv('countries.csv')
# Country polygons used as the map template.
world = gpd.read_file('ne_110m_admin_0_countries.shp')

# Pool every column whose name mentions both "country" and "grand" into one flat list.
grandparent_cols = [c for c in countries_10k.columns
                    if 'country' in c.lower() and 'grand' in c.lower()]
countries_list = countries_10k[grandparent_cols].values.flatten().tolist()

# Number of reports per country; missing answers (NaN) are not counted.
country_counts = pd.Series(countries_list).dropna().value_counts().to_dict()

# The map is keyed by 'United States of America' while the counts are keyed by
# 'United States'. Merge the two spellings, and do not fail when the short one
# is absent from the data.
if 'United States' in country_counts:
    country_counts['United States of America'] = (
        country_counts.get('United States of America', 0)
        + country_counts['United States']
    )

# Add the frequency information to the shapefile (0 for countries never reported)
world['Frequency'] = world['NAME'].map(country_counts).fillna(0)

# Log colour scale spanning the smallest non-zero count up to the largest count
min_value = world['Frequency'][world['Frequency'] > 0].min()
max_value = world['Frequency'].max()
norm = LogNorm(vmin=min_value, vmax=max_value)

# Figure size: 90 mm wide, 2:1 aspect ratio
width_inches = 90 / 25.4  # convert mm to inches
height_inches = width_inches / 2  # adjust this ratio as needed

# A single figure carries both the size and the dpi; no separate plt.figure()
# call, which would only leave an empty extra figure behind.
fig, ax = plt.subplots(figsize=(width_inches, height_inches), dpi=300)

# Country outlines first, then the filled choropleth with its colour bar
world.boundary.plot(ax=ax, color='black', linewidth=0.3)
world.plot(column='Frequency', cmap='YlGnBu', norm=norm, linewidth=0.3, edgecolor='0.8',
           legend=True,
           legend_kwds={'label': '', 'format': '{x:.0f}', 'shrink': 0.5, 'pad': 0.02},
           ax=ax)

# Set the limits to exclude Antarctica
ax.set_ylim(-60, 90)
ax.set_xlim(-170, 175)

# Remove axis ticks (and with them the tick labels)
ax.set_xticks([])
ax.set_yticks([])

# Remove all spines
for spine in ax.spines.values():
    spine.set_visible(False)

# Set title with reduced padding
ax.set_title('Grandparents Countries of Birth', fontsize=8, pad=1)

# bbox_inches='tight' trims surrounding white space when saving
fig.savefig('map_grandparents_10k_nature.tiff', dpi=300, format='tiff', bbox_inches='tight')
fig.savefig('map_grandparents_10k_nature.png', dpi=300, bbox_inches='tight')

# Fig 1 D Ancestry PCA scatter

In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# PCA helper: standardize the input matrix, then project it onto its principal components

def do_pca(X, n_components=50):
    """Standardize ``X`` and return its principal-component projection.

    Parameters:
    X: data matrix, passed to ``StandardScaler.fit_transform``
    n_components: forwarded to ``PCA`` as its first argument (default 50)

    Returns:
    The array produced by ``PCA.fit_transform`` on the standardized data.
    """
    standardized = StandardScaler().fit_transform(X)
    return PCA(n_components).fit_transform(standardized)



#mb=mb[mb.index.isin(data.index)]
# Relative abundances of microbiome species; the first CSV column becomes the index.
mb = pd.read_csv('species.csv', index_col=0)
# Project onto the principal components and label them PC1..PC50,
# keeping the original row index.
mb_pca = pd.DataFrame(
    do_pca(mb),
    index=mb.index,
    columns=[f'PC{k}' for k in range(1, 51)],
)




# Set the style to match Nature Medicine guidelines
plt.style.use('default')
plt.rcParams.update({
    'font.sans-serif': ['Helvetica'],
    'font.size': 7,
    'axes.linewidth': 0.5,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'xtick.major.size': 3,
    'ytick.major.size': 3,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'axes.spines.right': False,
    'axes.spines.top': False,
})

# Lookup table; its 'country_eng' and 'ancestry' columns are used below.
ancestry_table = pd.read_csv('country_ancestry.csv')

# NOTE(review): this file is read without index_col, yet the pipeline below
# calls set_index('RegistrationCode') right after reset_index(). Confirm that
# RegistrationCode is meant to be loaded as the index here.
origin = pd.read_csv('grandparents_countries_of_birth.csv')

# For every row, count how often each value occurs across its columns.
bp_count = origin.apply(pd.Series.value_counts, axis=1)
bp_count = bp_count.reset_index().drop_duplicates().set_index('RegistrationCode')
# Registration codes that still occur more than once are dropped entirely (keep=False).
bp_count = bp_count[~bp_count.index.duplicated(keep=False)]

# Long format: one row per (RegistrationCode, birthplace) with its count.
bp_count = bp_count.stack()
bp_count.index.names = ['RegistrationCode', 'birthplace']
bp_count = bp_count.rename('bp_count').to_frame().reset_index()

# Translate birthplaces to ancestry labels; birthplaces missing from the lookup
# table are tagged 'NOT_DEFINED' and removed.
bp_to_anc = {i: 'NOT_DEFINED' for i in bp_count['birthplace'].unique().tolist()}
country_heb_ancestry = dict(zip(ancestry_table['country_eng'], ancestry_table['ancestry']))
bp_to_anc.update(country_heb_ancestry)
bp_count['ancestry'] = bp_count['birthplace'].replace(bp_to_anc)
bp_count = bp_count.query("ancestry != 'NOT_DEFINED'")

# Share of each ancestry among a participant's defined birthplaces.
bp_count = bp_count.set_index(['RegistrationCode', 'ancestry'])['bp_count'].rename('reported')
bp_count = bp_count.groupby(['RegistrationCode', 'ancestry']).sum()
bp_count = bp_count / bp_count.groupby('RegistrationCode').sum()

# "Consistent": one ancestry accounts for more than 70% of the reports.
consistent_ancestry = bp_count[bp_count > 0.7]
# "Mixed": participants with no ancestry above that threshold.
consistent_ids = consistent_ancestry.index.get_level_values(0)
mixed = bp_count[~bp_count.index.get_level_values(0).isin(consistent_ids)]
#mixed=mixed[mixed.index.get_level_values(0).isin(reg_ids_bm)]
mixed = mixed.reset_index().groupby('RegistrationCode').first()
mixed['ancestry'] = 'Mixed'
consistent_ancestry = consistent_ancestry.reset_index().groupby('RegistrationCode').first()

# One row per participant. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
ancestry = pd.concat([mixed, consistent_ancestry])

def create_ylgnbu_color_dict(n):
    """
    Create a dictionary of n colors sampled evenly from the YlGnBu colormap.

    Parameters:
    n (int): Number of colors to generate

    Returns:
    dict: Maps each integer index 0..n-1 to the color tuple the colormap
          returns at position index/(n-1). n == 1 yields the single color at
          position 0; n <= 0 yields an empty dict.
    """
    cmap = plt.get_cmap('YlGnBu')

    # With a single color there is no interval to divide; use 1 as the
    # denominator so n == 1 does not raise ZeroDivisionError.
    last = max(n - 1, 1)

    return {i: cmap(i / last) for i in range(n)}


def do_ancestry_scatter(pca_gen, title='', ax=None, show_legend=True):
    """
    Scatter PC1 against PC2, one color/marker per ancestry category.

    Parameters:
    pca_gen: DataFrame with 'PC1' and 'PC2' columns; it is joined on its index
             with the 'ancestry' column of the module-level `ancestry` table
    title (str): Prefix of the panel title
    ax: Matplotlib axes to draw on; a new 7x7 inch figure is created when None
    show_legend (bool): Whether to draw the legend to the right of the panel

    Side effects:
    Relabels 'Unknown/other' as 'Other' in the module-level `ancestry` table
    (in place), and applies tight_layout to the figure that owns `ax`.
    """
    # Drawing order; legend_order is the order of the legend entries.
    categories = ['Mixed', 'Other', 'Ashkenaz', 'Yemen', 'North African', 'Sephardi', 'Middle Eastern']
    legend_order = ['Ashkenaz', 'Yemen', 'North African', 'Sephardi', 'Middle Eastern', 'Mixed', 'Other']

    # In-place relabel of the shared table; kept because code that runs later
    # sees the relabelled values.
    ancestry['ancestry'] = ancestry['ancestry'].replace('Unknown/other', 'Other')

    # Rows lacking either a PC value or an ancestry label are dropped.
    data = pca_gen[['PC1', 'PC2']].join(ancestry['ancestry']).sort_index().dropna()

    if ax is None:
        _, ax = plt.subplots(figsize=(7, 7))

    color_dict = create_ylgnbu_color_dict(len(categories))
    markers = ['o', 's', '^', 'D', 'v', '>', '<', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']

    for i, category in enumerate(categories):
        subset = data[data['ancestry'] == category]
        # color= (not c=) so that an RGBA tuple is never mistaken for four
        # per-point values when a subset happens to hold exactly four points.
        ax.scatter(subset['PC1'], subset['PC2'], color=color_dict[i],
                   marker=markers[i % len(markers)], label=category, alpha=0.4, s=4)

    # Address the axes directly so the panel is decorated even when `ax` is
    # not pyplot's current axes.
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title(f'{title} PCs \n by  Ancestry', fontsize=8)

    # Move the legend outside the plot
    if show_legend:
        handles, labels = ax.get_legend_handles_labels()
        ordered_handles = [handles[labels.index(cat)] for cat in legend_order]
        ax.legend(ordered_handles, list(legend_order), loc='center left',
                  bbox_to_anchor=(0.97, 0.5), scatterpoints=1, fontsize=6)

    ax.grid(True, alpha=0.3)
    ax.figure.tight_layout()  # Adjusts the plot to ensure no overlap


# 129 mm x 55 mm canvas; both sides converted from mm to inches (25.4 mm per inch)
fig = plt.figure(figsize=(129 / 25.4, 55 / 25.4))

# 2 x 4 grid: each of the two side-by-side panels spans both rows and two columns
gs = GridSpec(2, 4)

# Left panel: genetics PCs
ax1 = fig.add_subplot(gs[:, :2])

# File with the genetics PCs.
# NOTE(review): read without index_col, while do_ancestry_scatter joins this
# table to `ancestry` on the index. Confirm the participant id is the index.
pca_gen = pd.read_csv("covariates.eigenvec", sep="\t")
do_ancestry_scatter(pca_gen, 'Genetics', ax=ax1)

# Right panel: microbiome PCs (the legend is shown on the left panel only)
ax2 = fig.add_subplot(gs[:, 2:])
do_ancestry_scatter(mb_pca, 'Microbiome', ax=ax2, show_legend=False)

fig.tight_layout()

# Save the figure
fig.savefig('10k_genetics_scatter.tiff', dpi=300, format='tiff', bbox_inches='tight')
fig.savefig('10k_genetics_scatter.png', dpi=300, bbox_inches='tight')

# Fig 1 e. ancestry predictions and differences

In [None]:
### Predicting an ethnicity from different dataframes (genetics, lifestyle, metabolomics, etc.)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import pandas as pd

from os.path import join

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

def predict_ancestry(data, ancestry_ser, ethnicity, n_splits=5):
    """
    Score how well `data` separates one ancestry from all the others.

    A LightGBM classifier is trained on `n_splits` independent random 80/20
    train/test splits (repeated hold-out, not k-fold) and the test AUC-ROC of
    each split is recorded.

    Parameters:
    data: Feature table. When it has an index, rows are matched to
          `ancestry_ser` by index label before splitting
    ancestry_ser: DataFrame with an 'ancestry' column
    ethnicity: Ancestry label treated as the positive class
    n_splits (int): Number of random train/test splits

    Returns:
    DataFrame with a single row indexed by `ethnicity` and the columns
    'Mean AUC-ROC', 'STD AUC-ROC' and 'Sample Count' (number of positives).
    """
    # Binary target: 1 for the requested ancestry, 0 otherwise
    y = (ancestry_ser['ancestry'] == ethnicity).astype(int)
    X = data

    # Features and labels may cover different participants (or be ordered
    # differently); keep only the shared ones so every row of X is paired with
    # its own label. Inputs without a usable common index are left untouched.
    if hasattr(data, 'index') and not data.index.equals(y.index):
        shared = data.index.intersection(y.index)
        if len(shared):
            X = data.loc[shared]
            y = y.loc[shared]

    batch_aucs = []
    for _ in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # n_estimators is the recognised name for the number of trees
        model = lgb.LGBMClassifier(n_estimators=1000, random_state=42, num_threads=16)
        model.fit(X_train, y_train)
        # Probability of the positive class
        y_pred = model.predict_proba(X_test)[:, 1]
        batch_aucs.append(roc_auc_score(y_test, y_pred))

    return pd.DataFrame(
        {
            'Mean AUC-ROC': np.mean(batch_aucs),
            'STD AUC-ROC': np.std(batch_aucs),
            'Sample Count': int(y.sum()),
        },
        index=[ethnicity],
    )
res_all = pd.DataFrame()

# Participants with genetics PCs and a single, known ancestry.
ac = ancestry[ancestry.index.isin(pca_gen.index)].sort_index()
# do_ancestry_scatter relabels 'Unknown/other' as 'Other' in `ancestry`, so both
# spellings are excluded here together with 'Mixed'.
ac = ac[~ac['ancestry'].isin(['Mixed', 'Unknown/other', 'Other'])].sort_index()

# One result row per ancestry. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
res_dna = pd.concat([
    predict_ancestry(pca_gen, ac, ethn, n_splits=5)
    for ethn in ['Ashkenaz', 'North African', 'Middle Eastern', 'Sephardi', 'Yemen']
])

res_dna['source'] = 'dna'
res_all = pd.concat([res_all, res_dna])

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec

import matplotlib.pyplot as plt
from scipy import stats

def boxplot_with_mannwhitney_final(data1, data2, labels, title, ax=None, save=True, short=False,
                                   save_dir='/genetics'):
    """
    Draw a two-group box plot annotated with a Mann-Whitney U significance bracket.

    Parameters:
    data1, data2: Values of the two groups
    labels: Two x tick labels, one per group
    title (str): Used as the y-axis label; its first five characters form the
                 saved file name
    ax: Matplotlib axes to draw on; a new figure is created when None
    save (bool): Save the figure that owns `ax` as '<save_dir>/<title[:5]>.png'
    short (bool): Use a tighter bracket and less head-room above the boxes
    save_dir (str): Directory used when `save` is True
    """
    # Two-sided Mann-Whitney U test; only the p-value is used
    _, p_value = stats.mannwhitneyu(data1, data2, alternative='two-sided')

    if ax is None:
        _, ax = plt.subplots(figsize=(180 / 25.4 / 6 * 1.3, 2), dpi=300)
    fig = ax.figure

    # Tick labels are set once further down, so they are not passed here.
    bp = ax.boxplot([data1, data2], showfliers=False, widths=0.6)

    ax.set_ylabel(title, fontsize=6)
    ax.set_xlabel('', fontsize=6)

    # Set style for median lines
    for median in bp['medians']:
        median.set(color='black', linewidth=1.5)

    # Set style for whiskers and caps
    for element in ['whiskers', 'caps']:
        for item in bp[element]:
            item.set(color='black', linewidth=1)

    # Highest drawn point: the two boxes and the upper cap of each group
    # (fliers are hidden)
    y_max = max(
        max(bp['boxes'][0].get_ydata()),
        max(bp['boxes'][1].get_ydata()),
        max(bp['caps'][1].get_ydata()),
        max(bp['caps'][3].get_ydata())
    )

    # Offsets are fractions of |y_max| so the bracket always sits above the
    # boxes, also for negative data; fall back to 1 when y_max is exactly 0.
    scale = abs(y_max) or 1.0
    if short:
        annotation_height = y_max + scale * 0.04
        line_height = y_max + scale * 0.02
        headroom = scale * 0.05
    else:
        annotation_height = y_max + scale * 0.15
        line_height = y_max + scale * 0.1
        headroom = scale * 0.1

    # Significance stars: *** p < 0.001, ** p < 0.01, * p < 0.05, otherwise n.s.
    annotation = 'n.s.'
    for threshold, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value < threshold:
            annotation = stars
            break

    # Display the annotation centred between the two boxes
    ax.text(1.5, annotation_height, annotation,
            horizontalalignment='center', fontweight='bold', fontsize=6)

    # Bracket connecting the two boxes
    x1, x2 = 1, 2
    ax.plot([x1, x1, x2, x2], [line_height, annotation_height, annotation_height, line_height],
            lw=0.5, c='black')

    # Adjust y-axis to make room for the annotation
    ax.set_ylim(top=annotation_height + headroom)

    # Set background color and frame
    ax.set_facecolor('white')
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color('black')
        spine.set_linewidth(0.5)

    # Add grid
    ax.grid(True, axis='y', linestyle=':', color='gray', alpha=0.5, linewidth=0.5)

    # Tick styling is applied to `ax` itself rather than pyplot's current axes
    ax.tick_params(axis='both', which='major', labelsize=5, length=3, width=0.5)
    ax.set_xticklabels(labels, rotation=45, fontweight='normal', fontsize=6)

    fig.tight_layout()

    if save:
        fig.savefig('{}/{}.png'.format(save_dir, title[:5]),
                    dpi=300, bbox_inches='tight')

import pandas as pd
import numpy as np
from scipy import stats
def calculate_significance(group1, group2):
    """Return the p-value of ``scipy.stats.ttest_ind`` (default settings) for the two groups."""
    return stats.ttest_ind(group1, group2).pvalue
# Set the style to match Nature Medicine guidelines
plt.style.use('default')
plt.rcParams.update({
    'font.sans-serif': ['Helvetica'],
    'font.size': 7,
    'axes.linewidth': 0.5,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'xtick.major.size': 3,
    'ytick.major.size': 3,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'axes.spines.right': False,
    'axes.spines.top': False,
})

# 180 mm wide, 2 inches tall
fig = plt.figure(figsize=(180 / 25.4, 2))

# One row, six columns: a wide bar panel (column 0), a narrow unused column (1)
# and four box-plot panels (columns 2-5)
gs = gridspec.GridSpec(1, 6, width_ratios=[2, 0.4, 1, 1, 1, 1])

# Both tables are selected by ancestry label below, so the first CSV column is
# loaded as the index. Paths are relative, like every other input of the notebook.
results = pd.read_csv('prediction_results.csv', index_col=0)
std = pd.read_csv('prediction_std.csv', index_col=0)

# Bar panel: prediction AUC per ancestry and data source
ax3 = fig.add_subplot(gs[0, 0])  # First column (wider)
#ax3.set_title('Panel C')
results_cols = {'dna': 'Genetics', 'lifestyle': 'Lifestyle', 'nightingale': 'Metabolomics',
                'nutr_all': 'Nutrition', 'microbiome': 'Microbiome', 'bt_data': 'Blood Tests'}
results1 = results[['dna', 'lifestyle', 'nightingale', 'microbiome']].loc[['Ashkenaz', 'Middle Eastern', 'North African']].rename(columns=results_cols)

# Error bars are matched to the plotted data by row and column label, so the
# standard deviations need the same selection and renaming as the means.
std = std[['dna', 'lifestyle', 'nightingale', 'microbiome']].loc[['Ashkenaz', 'Middle Eastern', 'North African']].rename(columns=results_cols)
error_config = {'elinewidth': 0.5}
results1.plot(kind='bar', ax=ax3, yerr=std, cmap='YlGnBu', edgecolor='black', linewidth=0.5, error_kw=error_config)
ax3.set_ylabel('Predicted ethnicity (AUC)')
ax3.set_xlabel('')
ax3.set_ylim(0.48, 1.05)

# Rotated group labels without tick marks
ax3.tick_params(axis='x', rotation=45, length=0)
ax3.set_title('Ancestry prediction', fontsize=7)



### Creating a matched 1x1 cohort of Ashkenaz vs. non-Ashkenaz participants
from utils import find_match

basic_data = pd.read_csv('basic_data.csv', index_col=0)  # age, gender, bmi

ancestry['is_achkenaz'] = ancestry['ancestry'] == 'Ashkenaz'
ancestry = ancestry[ancestry.index.isin(basic_data.index)]
# do_ancestry_scatter relabels 'Unknown/other' as 'Other' in `ancestry`, so both
# spellings are excluded here together with 'Mixed'.
not_mixed = ancestry[~ancestry['ancestry'].isin(['Unknown/other', 'Other', 'Mixed'])]
shak_ind = ancestry[ancestry['is_achkenaz']].index
not_shak_ind = not_mixed[~not_mixed['is_achkenaz']].index

# Rows with a missing value in basic_data are dropped from both groups.
shak_data = basic_data.loc[shak_ind].dropna()
shak_data['not_ashkenaz'] = False
not_shak_data = basic_data.loc[not_shak_ind].dropna()
not_shak_data['not_ashkenaz'] = True
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([shak_data, not_shak_data])

# NOTE(review): find_match comes from the project's utils module; koef=1 is
# presumably the 1:1 matching ratio. Confirm against its definition.
matched_df = find_match(df, TREATMENT='not_ashkenaz', koef=1)
shak_ind = matched_df[~matched_df['not_ashkenaz']].index
not_shak_ind = matched_df[matched_df['not_ashkenaz']].index

# Print the t-test p-value for each variable between the two matched groups.
for col in ['age', 'gender', 'bmi']:
    print(col, calculate_significance(basic_data.loc[shak_ind][col].dropna(), basic_data.loc[not_shak_ind][col].dropna()))


# One row per matched participant, plus the group flag the box plots split on.
res_df = pd.DataFrame(index=shak_ind.append(not_shak_ind))
res_df['ashk'] = res_df.index.isin(shak_ind)

foods = pd.read_csv('popular_foods.csv', index_col=0)
res_df['Potatoes(g/d)'] = foods['Potatoes,g/d']

bt = pd.read_csv('bt_data.csv', index_col=0)
bt = bt.rename(columns={'bt__wbc': "WBC"})
res_df['WBC (cells/µL)'] = bt['WBC']

ls = pd.read_csv('lifestyle.csv', index_col=0)
ls = ls.rename(columns={'hours_outdoors_summer': 'Outdoor time summer, hours'})
# Stored under the same name the plotting loop below asks for.
res_df['Summer Outdoor Time (hours)'] = ls['Outdoor time summer, hours']

measurements = pd.read_csv('measurements_data.csv', index_col=0)
measurements = measurements.rename(columns={'sitting_blood_pressure_systolic': 'BP, sys'})
res_df['BP, sys (mmHg)'] = measurements['BP, sys']

# One box-plot panel per trait, in grid columns 2-5 of the figure.
# `save` is left at its default, so every call also writes the current figure to disk.
panel_columns = ['Potatoes(g/d)', 'WBC (cells/µL)', 'Summer Outdoor Time (hours)',
                 'BP, sys (mmHg)']
for i, col in enumerate(panel_columns):
    # The blood-pressure panel uses the tighter annotation layout
    short = col in ['BP, sys (mmHg)']
    ax = fig.add_subplot(gs[0, i + 2])
    boxplot_with_mannwhitney_final(res_df.loc[res_df['ashk'], col].dropna().values,
                                   res_df.loc[~res_df['ashk'], col].dropna().values,
                                   ['Ashkenaz', 'Other'], col, ax=ax, short=short)

ax3.legend(loc='center left', bbox_to_anchor=(0.9, 0.5), fontsize=6, title="Predictor")
# tight_layout() recomputes the subplot margins after this manual adjustment
plt.subplots_adjust(left=0.05, right=3, top=0.2, bottom=0.1, wspace=0.15)
plt.tight_layout()

# Save the figure
plt.savefig('10k_predicting_ethnicity_all.png', dpi=300, bbox_inches='tight')