In [1]:

import pandas as pd
import numpy as np
import itertools as it
import plotly.express as px
from plotly.graph_objects import Figure


In [2]:

# Species code -> path of that species' metacell broad-cell-type classes CSV
# (paths are relative to the notebook's working directory).
species_name_dict = dict(
    Hvul="../metacell_Hvul_geo_1e6_0.1lim_3fold_broad_type_classes_1mincount.csv",
    Nvec="../metacell_Nvec_geo_1e6_0.1lim_3fold_broad_type_classes_1mincount.csv",
    Xesp="../metacell_Xesp_geo_1e6_0.1lim_3fold_broad_type_classes_1mincount.csv",
    Spis="../metacell_Spis_geo_1e6_0.1lim_3fold_broad_type_classes_1mincount.csv",
)


In [3]:
# Species pair analysed in the cells below.
pair = ("Hvul", "Spis")
species_1, species_2 = pair

print(f"start {species_1}, {species_2}")

# Index with [] instead of .get(): an unknown species code then raises a
# KeyError naming the code, rather than handing None to read_csv.
classes_sp1 = pd.read_csv(species_name_dict[species_1])
classes_sp2 = pd.read_csv(species_name_dict[species_2])

# Drop the "Unnamed: 0" column as part of loading, so re-running the cell is
# safe and no in-place mutation is needed afterwards.
ortho = pd.read_csv(f"../../orthologous_pairs_{species_1}_{species_2}.csv").drop(
    columns="Unnamed: 0"
)

# `mapping` is a second name for the same frame as `ortho` (not a copy).
mapping = ortho

start Hvul, Spis


In [4]:
# Display the ortholog pair table (one row per Gene_sp1 / Gene_sp2 pair).
mapping

Unnamed: 0,Gene_sp1,Gene_sp2,Gene_sp1_sp,Gene_sp2_sp
0,Hvul_g10365_1,Spis_XP_022783009_1,Hvul,Spis
1,Hvul_g24804_1,Spis_XP_022782072_1,Hvul,Spis
2,Hvul_g8326_1,Spis_XP_022780686_1,Hvul,Spis
3,Hvul_g14066_1,Spis_XP_022792326_1,Hvul,Spis
4,Hvul_g14066_1,Spis_XP_022799259_1,Hvul,Spis
...,...,...,...,...
44159,Hvul_g29480_1,Spis25454_1,Hvul,Spis
44160,Hvul_g8978_1,Spis_XP_022807267_1,Hvul,Spis
44161,Hvul_g16455_1,Spis24465_1,Hvul,Spis
44162,Hvul_g16454_1,Spis24465_1,Hvul,Spis


In [12]:

# Example input dataframe (one row per ortholog pair):
# df = pd.DataFrame({'Gene_sp1': [...], 'Gene_sp2': [...]})

def classify_ortholog_types(df):
    """Label every ortholog pair as 'o2o', 'o2m', 'm2o' or 'm2m'.

    The label of a row is derived from how many rows each of its two genes
    appears in:

    * 'o2o' - the Gene_sp1 value and the Gene_sp2 value each appear once
    * 'o2m' - Gene_sp1 appears once, Gene_sp2 appears in more than one row
    * 'm2o' - Gene_sp1 appears in more than one row, Gene_sp2 appears once
    * 'm2m' - everything else (both appear in more than one row)

    Afterwards, every Gene_sp1 that has at least one 'm2m' row gets all of
    its rows labelled 'm2m'.

    NOTE(review): the later cell that builds `orthology_type` with np.select
    gives the 'o2m' / 'm2o' cases the opposite labels - confirm which
    convention is intended before comparing the two results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Gene_sp1' and 'Gene_sp2'. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (same index and row order) with three added columns:
        'Gene_sp1_count', 'Gene_sp2_count' and 'ortholog_type'.
    """
    # Work on a copy so the caller's frame never ends up half-annotated.
    out = df.copy()

    # Count mappings for each gene in each species
    out['Gene_sp1_count'] = out['Gene_sp1'].map(out['Gene_sp1'].value_counts())
    out['Gene_sp2_count'] = out['Gene_sp2'].map(out['Gene_sp2'].value_counts())

    sp1_once = out['Gene_sp1_count'] == 1
    sp1_many = out['Gene_sp1_count'] > 1
    sp2_once = out['Gene_sp2_count'] == 1
    sp2_many = out['Gene_sp2_count'] > 1

    # Start from the fall-through label and overwrite the three specific
    # cases; the masks are mutually exclusive, so their order is irrelevant.
    out['ortholog_type'] = 'm2m'
    out.loc[sp1_once & sp2_once, 'ortholog_type'] = 'o2o'
    out.loc[sp1_once & sp2_many, 'ortholog_type'] = 'o2m'
    out.loc[sp1_many & sp2_once, 'ortholog_type'] = 'm2o'

    # Refinement: a Gene_sp1 whose rows mix 'm2m' with 'm2o'/'o2m' becomes
    # 'm2m' throughout. Only the non-'m2m' rows of such a gene need changing.
    is_m2m = out['ortholog_type'] == 'm2m'
    is_partial = out['ortholog_type'].isin(['o2m', 'm2o'])
    genes_with_m2m = out.loc[is_m2m, 'Gene_sp1']
    promote = is_partial & out['Gene_sp1'].isin(genes_with_m2m)
    out.loc[promote, 'ortholog_type'] = 'm2m'

    return out

In [13]:
# How many rows of the pair table each Gene_sp2 identifier appears in.
mapping['Gene_sp2'].value_counts()

Spis22455_1            178
Spis22157_1            178
Spis21717_1            178
Spis_XP_022809498_1    178
Spis23396_1            178
                      ... 
Spis_XP_022788325_1      1
Spis_XP_022810003_1      1
Spis_XP_022803193_1      1
Spis_XP_022810098_1      1
Spis_XP_022810410_1      1
Name: Gene_sp2, Length: 12548, dtype: int64

In [14]:
# Write the function-based classification to disk. The file name follows the
# species pair chosen earlier instead of hard-coding one pair.
classify_ortholog_types(mapping).to_csv(f"test_mapping_{species_1}_{species_2}.csv")

# Count partners on de-duplicated rows (computed once, used for both counts).
unique_pairs = mapping.drop_duplicates()

# Grouped by Gene_sp1, so the count describes Gene_sp2 entries per Gene_sp1.
sp1_counts = (
    unique_pairs.groupby("Gene_sp1")
    .count()
    .rename(columns={"Gene_sp2": "Gene_sp2_counts"})
)
# Grouped by Gene_sp2, so the count describes Gene_sp1 entries per Gene_sp2.
sp2_counts = (
    unique_pairs.groupby("Gene_sp2")
    .count()
    .rename(columns={"Gene_sp1": "Gene_sp1_counts"})
)

# Attach both partner counts to every row of the pair table.
mapping_new = mapping.merge(
    sp1_counts[["Gene_sp2_counts"]],
    left_on="Gene_sp1",
    right_index=True,
)
mapping_both = mapping_new.merge(
    sp2_counts[["Gene_sp1_counts"]],
    left_on="Gene_sp2",
    right_index=True,
)

one_sp1 = mapping_both["Gene_sp1_counts"] == 1
many_sp1 = mapping_both["Gene_sp1_counts"] > 1
one_sp2 = mapping_both["Gene_sp2_counts"] == 1
many_sp2 = mapping_both["Gene_sp2_counts"] > 1

# NOTE(review): classify_ortholog_types gives the two mixed cases the opposite
# labels ('o2m' where this cell says 'm2o' and vice versa) - confirm which
# convention is intended.
mapping_both["orthology_type"] = np.select(
    [
        one_sp1 & one_sp2,
        many_sp1 & one_sp2,
        one_sp1 & many_sp2,
        many_sp1 & many_sp2,
    ],
    ["o2o", "m2o", "o2m", "m2m"],
    default="",  # rows matching none of the four cases stay unlabelled
)

mapping_both.sort_values(by='Gene_sp1')