# Setup

First we load in our datasets for analysis. This includes our cluster datasets for both stages, where each row is a cluster as well as protein datasets where each row is a protein.

In [133]:
import pandas as pd
# load in our cluster data
s3_df = pd.read_csv("data/generated_tables/s3_network.tsv", sep="\t")
s5_df = pd.read_csv("data/generated_tables/s5_network.tsv", sep="\t")
# keep track of the stage
s5_df["stage"] = 5
s3_df["stage"] = 3


# add essentiality data onto clusters
s5_essentiality_df = pd.read_csv("data/generated_tables/s5_essentiality_df.tsv", sep="\t")
s3_essentiality_df = pd.read_csv("data/generated_tables/s3_essentiality_df.tsv", sep="\t")
# keep track of stage
s5_essentiality_df["stage"] = 5
s3_essentiality_df["stage"] = 3

# Appending redundant tag onto essentiality data
s3_essentiality_df["redundant"] = s3_essentiality_df["Product Description"].duplicated(keep=False)
s5_essentiality_df["redundant"] = s5_essentiality_df["Product Description"].duplicated(keep=False)
for i, row in s3_essentiality_df.iterrows():
    if "unknown function" in row["Product Description"]:
        s3_essentiality_df.at[i, "redundant"] = False
for i, row in s5_essentiality_df.iterrows():
    if "unknown function" in row["Product Description"]:
        s5_essentiality_df.at[i, "redundant"] = False

# add column for essentiality threshold
s3_essentiality_df["under_MIS_threshold"] = s3_essentiality_df["essential"]
s5_essentiality_df["under_MIS_threshold"] = s5_essentiality_df["essential"]

# change essential to be true if either redundant or under threshold
s3_essentiality_df["essential"] = s3_essentiality_df["under_MIS_threshold"] | s3_essentiality_df["redundant"]
s5_essentiality_df["essential"] = s5_essentiality_df["under_MIS_threshold"] | s5_essentiality_df["redundant"]

s3_essentiality_df.drop("Unnamed: 0", axis=1, inplace=True)
s5_essentiality_df.drop("Unnamed: 0", axis=1, inplace=True)

s3_essentiality_df.to_csv("data/generated_tables/updated_s3_essentiality_df.tsv", sep="\t", index=False)
s5_essentiality_df.to_csv("data/generated_tables/updated_s5_essentiality_df.tsv", sep="\t", index=False)

# combine the two dataframes
protein_concat_df = pd.concat([s5_essentiality_df, s3_essentiality_df])

In [134]:
s3_essentiality_df[s3_essentiality_df["Accession ID"] == "Q8IIV1"]

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
156,Q8IIV1,PF3D7_1105100,histone H2B,,,"['Q8IB24', 'C6KSV0', 'Q8IIV2', 'C6KT18', 'K7NT...",False,9,0.000998,0.289366,3,False,False


In [135]:
print("Number of clusters in s5 network greater than random: ", sum(s5_df["significant"]))
print("Number of clusters in s3 network greater than random: ", sum(s3_df["significant"]))

Number of clusters in s5 network greater than random:  69
Number of clusters in s3 network greater than random:  30


In [136]:
from ast import literal_eval

# function to convert str representation of Proteins cluster column in df to sets
def convert_data(df):
    df["Proteins"] = df["Proteins"].apply(literal_eval)
    return df 

print("data type of cluster before converting: ", type(s3_df["Proteins"][0]))
for df in [s3_df, s5_df]:
    df = convert_data(df)
print("data type of cluster after converting: ", type(s3_df["Proteins"][0]))


data type of cluster before converting:  <class 'str'>
data type of cluster after converting:  <class 'set'>


## Analyzing Network Stats

In [137]:
# 
s5_essentiality_df[s5_essentiality_df["essential"]].sort_values("Degree", ascending=False)

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
114,C0H5H0,PF3D7_1344200,endoplasmic reticulum chaperone GRP170,0.143,-2.839,"['O97282', 'Q8II43', 'Q8IFP3', 'Q8ILP6', 'C6KS...",True,69,0.148560,0.445327,5,False,True
109,Q8IAX5,PF3D7_0813900,"40S ribosomal protein S16, putative",0.141,-3.879,"['Q8IE10', 'C6KT19', 'C6KT25', 'Q8IIB4', 'Q8IE...",True,58,0.033885,0.426100,5,False,True
62,Q8IDR9,PF3D7_1342000,40S ribosomal protein S6,0.127,-2.986,"['Q8ILI2', 'Q8IIA2', 'Q8IK15', 'Q8IIV2', 'A0A1...",True,53,0.022999,0.415909,5,False,True
41,Q8I323,PF3D7_0912900,"26S proteasome regulatory subunit RPN8, putative",0.122,-3.009,"['Q8ILE3', 'Q76NM6', 'Q8IKB2', 'Q8I0V2', 'Q8I2...",True,53,0.053808,0.421672,5,False,True
85,C0H5C2,PF3D7_1317800,40S ribosomal protein S19,0.133,-2.762,"['A0A144A3N9', 'Q8IIB4', 'A0A5K1K8V8', 'Q8IEK3...",True,52,0.020566,0.411225,5,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,Q8IIB6,PF3D7_1124700,"GrpE protein homolog, mitochondrial, putative",0.122,-2.922,['Q76NN6'],True,1,0.000000,0.265770,5,False,True
178,C0H586,PF3D7_0933200,"calcyclin-binding protein, putative",0.205,0.000,['Q76NN6'],True,1,0.000000,0.265770,5,False,True
13,Q8IM38,PF3D7_1405600,ribonucleoside-diphosphate reductase small cha...,0.120,-2.846,['Q8IJN8'],True,1,0.000000,0.002625,5,True,True
26,Q8I6T2,PF3D7_1345700,"isocitrate dehydrogenase [NADP], mitochondrial",0.121,-3.089,['Q8IJC6'],True,1,0.000000,0.280467,5,False,True


## Tagging Protein Essentiality to Clusters

In [138]:
# tagging protein essentiality to clusters
def calculate_cluster_essentiality(cluster):
    total = 0
    for protein in cluster:
        if protein in s5_essentiality_df["Accession ID"].values:
            total += s5_essentiality_df[s5_essentiality_df["Accession ID"] == protein]["essential"].values[0]
    return total/len(cluster)

lst = []
for i, row in s5_df.iterrows():
    lst.append(calculate_cluster_essentiality(row["Proteins"]))
s5_df["essentiality"] = lst

lst = []
for i, row in s3_df.iterrows():
    lst.append(calculate_cluster_essentiality(row["Proteins"]))
s3_df["essentiality"] = lst

## Analyzing Between Clusters

In [139]:
# # combine the two dataframes
# cluster_concat_df = pd.concat([s3_df, s5_df])


# rename columns
s3_df.rename(columns={"Number": "s3_number",
                      "Proteins": "s3_proteins",
                      "avg_spearman": "s3_avg_spearman",
                      "size": "s3_size",
                      "essentiality": "s3_essentiality"}, inplace=True),

s5_df.rename(columns={"Number": "s5_number",
                      "Proteins": "s5_proteins",
                      "avg_spearman": "s5_avg_spearman",
                      "size": "s5_size",
                      "essentiality": "s5_essentiality"}, inplace=True)

In [140]:
# cross join the two dataframes so each row is a combination of s3 and s5 clusters
crossed_df = pd.merge(
    s3_df[["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size", "s3_essentiality"]],
    s5_df[['s5_number', 's5_proteins', 's5_avg_spearman', 's5_size', "s5_essentiality"]],
    how="cross")

# reordering the columns
crossed_df = crossed_df.loc[:, ["s3_number", "s5_number", "s3_proteins", "s5_proteins", 
                                "s3_avg_spearman", "s5_avg_spearman", "s3_size", "s5_size"]]
crossed_df["total_size"] = crossed_df["s3_size"] + crossed_df["s5_size"]

In [141]:
# look for common proteins across clusters
common_proteins = []
for _, row in crossed_df.iterrows():
    row_common_proteins = []
    for protein in row["s3_proteins"]:
        if protein in row["s5_proteins"]:
            row_common_proteins.append(protein)
    common_proteins.append(row_common_proteins)
crossed_df["common_proteins"] = common_proteins

In [142]:
# look for common proteins across clusters
def get_common_proteins(proteins1, proteins2):
    common_proteins = []
    # iterate over crossed df
    for _, row in crossed_df.iterrows():
        row_common_proteins = []
        # if protein in s3 is in s5, add to list of common proteins
        for protein in row[proteins1]:
            if protein in row[proteins2]:
                row_common_proteins.append(protein)
        common_proteins.append(row_common_proteins)
    return common_proteins

In [143]:
# add number of overlapping proteins
crossed_df["num_overlap"] = crossed_df["common_proteins"].apply(len)
# sort by number of overlaps
filtered_crossed_df = crossed_df[crossed_df["num_overlap"] > 0]
filtered_crossed_df.sort_values("num_overlap", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap
509,6,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.721655,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15
89,1,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.738465,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15
2,0,2,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.376935,0.516712,16,65,81,"[Q8IIV1, Q8I0V2, O97285, C6KT18, Q8IET7, Q8IB2...",11
1178,14,2,"{Q8IAX8, Q8IIV1, Q8IJX8, Q8IJX3, C6KT18, Q8I60...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.331886,0.516712,14,65,79,"[Q8IAX8, Q8IIV1, Q8IJX8, C6KT18, K7NTP5, Q8I5H...",10
1514,18,2,"{Q8IB03, Q8I289, Q8I1V1, K7NTP5, C6KTA4, O9625...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.426576,0.516712,21,65,86,"[K7NTP5, Q8IKR1, O97285, Q8I0P6, Q8IET7, Q8I5H...",7


In [144]:
def calculate_overlap(row, size_col1, size_col2):
    return row["num_overlap"] / min(row[size_col1], row[size_col2])

def calculate_jaccard_index(row, proteins_col1, proteins_col2):
    return len(row["common_proteins"]) / len(set(list(row[proteins_col1]) + list(row[proteins_col2])))

crossed_df["percent_overlap"] = crossed_df.apply(calculate_overlap, args=["s3_size", "s5_size"], axis=1)
crossed_df["jaccard_index"] = crossed_df.apply(calculate_jaccard_index, args=["s3_proteins", "s5_proteins"], axis=1)
crossed_df["similarity_score"] = (crossed_df["percent_overlap"] + crossed_df["jaccard_index"]) / 2

In [145]:
crossed_df[["s3_number", "s5_number", "s3_avg_spearman", "s5_avg_spearman", "s3_size", "s5_size", "total_size", "num_overlap", "jaccard_index", "percent_overlap", "similarity_score"]].sort_values("similarity_score", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,num_overlap,jaccard_index,percent_overlap,similarity_score
1848,22,0,0.506917,0.500198,2,6,8,2,0.333333,1.0,0.666667
1466,17,38,0.48666,0.639987,4,6,10,3,0.428571,0.75,0.589286
996,11,72,0.704249,0.599209,5,5,10,3,0.428571,0.6,0.514286
946,11,22,0.704249,0.379117,5,6,11,3,0.375,0.6,0.4875
509,6,5,0.721655,0.80047,24,35,59,15,0.340909,0.625,0.482955


In [146]:
# top 10 clusters with highest percent common
crossed_df[crossed_df["num_overlap"] > 2].sort_values("percent_overlap", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
1466,17,38,"{Q76NM4, Q8I3W9, A0A5K1K8H7, Q8IHR8}","{Q8IIK8, Q8I3W9, Q8I274, A0A5K1K8H7, C0H516, Q...",0.48666,0.639987,4,6,10,"[Q76NM4, Q8I3W9, A0A5K1K8H7]",3,0.75,0.428571,0.589286
1178,14,2,"{Q8IAX8, Q8IIV1, Q8IJX8, Q8IJX3, C6KT18, Q8I60...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.331886,0.516712,14,65,79,"[Q8IAX8, Q8IIV1, Q8IJX8, C6KT18, K7NTP5, Q8I5H...",10,0.714286,0.144928,0.429607
87,1,3,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IE84, Q76NM6, Q8IDS0, Q8I280, Q8I2H3, Q8IEP...",0.738465,0.73466,24,7,31,"[Q76NM6, Q6ZMA8, Q8IE84, Q8I280, Q8I2H3]",5,0.714286,0.192308,0.453297
2,0,2,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.376935,0.516712,16,65,81,"[Q8IIV1, Q8I0V2, O97285, C6KT18, Q8IET7, Q8IB2...",11,0.6875,0.157143,0.422321
2270,27,2,"{Q8IAX8, Q8IIV1, Q8ILG8, Q8IJX3, C6KT18, Q8I5H...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.349638,0.516712,9,65,74,"[Q8IAX8, Q8IIV1, C6KT18, Q8I5H4, Q8IIV2, Q8IBV7]",6,0.666667,0.088235,0.377451


In [147]:
# top 10 clusters with lowest percent common
crossed_df.sort_values(["percent_overlap", "total_size"], ascending=[True, False]).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
590,7,2,"{Q8I485, Q8I4R5, Q8IEU2, Q8IKC8, Q8IBN4, O7731...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.552264,0.516712,11,65,76,[],0,0.0,0.0,0.0
1346,16,2,"{Q8IJN9, Q8I2X4, Q8IDG2, Q8IC01, Q8II24, Q8II3...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.626482,0.516712,10,65,75,[],0,0.0,0.0,0.0
2354,28,2,"{Q8I2X4, Q8IC01, Q8II36, Q7K6A5, Q8IEK1, O9722...","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.438453,0.516712,7,65,72,[],0,0.0,0.0,0.0
1262,15,2,"{Q8IDG9, Q8IIX5, Q8I490, Q8IFM0, Q8I2F4, Q8I206}","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.384848,0.516712,6,65,71,[],0,0.0,0.0,0.0
758,9,2,"{Q8IEJ6, Q8I0X1, Q8ILB6, P61074, O97227}","{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.678557,0.516712,5,65,70,[],0,0.0,0.0,0.0


In [148]:
# get clusters with no common proteins across all clusters
x = crossed_df[crossed_df["s3_number"] == 1]
y = x["num_overlap"] > 1
bool(y.sum() == 0)

False

In [149]:
lst = []
for i in s3_df["s3_number"]:
    x = crossed_df[crossed_df["s3_number"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s3_df["unique between stages"] = lst

lst = []
for i in s5_df["s5_number"]:
    x = crossed_df[crossed_df["s5_number"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s5_df["unique between stages"] = lst

In [150]:
s5_df[s5_df["unique between stages"]].sort_values("s5_size", ascending=False).head(5)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages
17,17,"{Q8IKT2, C6KT25, Q8I3X4, Q8I5P5, P50250, Q8IE6...",0.489377,9,0.39314,True,True,5,0.555556,True
23,23,"{Q8IKQ9, Q8IIX0, A0A144A1R5, Q8I463, O97249, Q...",0.496267,9,0.402226,True,True,5,0.666667,True
15,15,"{Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...",0.758082,8,0.421936,True,True,5,0.625,True
63,63,"{Q8IJN9, Q8IIR8, Q8IKT2, Q8I6U7, C6KT25, Q8I3X...",0.499435,8,0.400639,True,True,5,0.625,True
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True


In [151]:
s5_df[s5_df["unique between stages"] & (s5_df["s5_size"] > 2)].sort_values(["s5_avg_spearman", "s5_size"], ascending=False).head(5)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages
66,66,"{Q8II42, C6S3I6, Q8I3A1, Q8II92, P61074}",0.940508,5,0.38321,True,True,5,0.6,True
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True
1,1,"{Q8IBS3, Q8IIW2, Q8ILP6, Q8IIA4, Q8IDZ9, Q8I246}",0.873057,6,0.384568,True,True,5,1.0,True
36,36,"{C6S3I6, Q8I5Q3, Q8I3A1, Q8II42}",0.838632,4,0.419211,True,True,5,0.5,True
34,34,"{Q8IM24, Q8IKW0, Q8IK83, Q8IKS0}",0.814394,4,0.380889,True,True,5,1.0,True


## Comparing Within Cluster

To compare PPIs within a cluster, we cross join the stage 3 dataset with itself,
and see if the same proteins appear in any 2 clusters. We call this attribute 'unique within stage'.

Additionally, we cross join them 

In [152]:
# TODO: 

# cross join the two dataframes so each row is a combination of s3 and s5 clusters
s3_crossed_df = pd.merge(
    s3_df[["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size"]].rename(columns={"s3_number": "number_1",
                                                                                        "s3_proteins": "proteins_1",
                                                                                        "s3_avg_spearman": "avg_spearman_1",
                                                                                        "s3_size": "size_1"}),
    s3_df[["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size"]].rename(columns={"s3_number": "number_2",
                                                                                        "s3_proteins": "proteins_2",
                                                                                        "s3_avg_spearman": "avg_spearman_2",
                                                                                        "s3_size": "size_2"}),
    how="cross")
# drop rows where the two clusters are the same
s3_crossed_df.drop(s3_crossed_df[s3_crossed_df["number_1"] == s3_crossed_df["number_2"]].index, inplace=True)
# calculating total size
s3_crossed_df["total_size"] = s3_crossed_df["size_1"] + s3_crossed_df["size_2"]
s3_crossed_df.head()

# look for common proteins across clusters
common_proteins = []
for _, row in s3_crossed_df.iterrows():
    row_common_proteins = []
    for protein in row["proteins_1"]:
        if protein in row["proteins_2"]:
            row_common_proteins.append(protein)
    common_proteins.append(row_common_proteins)
s3_crossed_df["common_proteins"] = common_proteins

# add number of overlapping proteins
s3_crossed_df["num_overlap"] = s3_crossed_df["common_proteins"].apply(len)
# sort by number of overlaps
filtered_crossed_df = crossed_df[crossed_df["num_overlap"] > 0]
filtered_crossed_df.sort_values("num_overlap", ascending=False).head(5)

lst = []
for i in s3_df["s3_number"]:
    x = s3_crossed_df[s3_crossed_df["number_1"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s3_df["unique within stage"] = lst

lst = []
# loop over each s3 cluster
for i in s3_df["s3_number"]:
    # get list of cluster pairs for current s3 cluster
    x = crossed_df[crossed_df["s3_number"] == i]
    # see if there is any overlapping proteins
    y = x["num_overlap"] > 1
    # if there's no overlapping proteins across all pairs, then the proteins
    # in this cluster are unique between stages
    lst.append(bool(y.sum() == 0))
s3_df["unique between stages"] = lst

In [153]:
# TODO: 

# cross join the two dataframes so each row is a combination of s3 and s5 clusters
s5_crossed_df = pd.merge(
    s5_df[["s5_number", "s5_proteins", "s5_avg_spearman", "s5_size"]].rename(columns={"s5_number": "number_1",
                                                                                        "s5_proteins": "proteins_1",
                                                                                        "s5_avg_spearman": "avg_spearman_1",
                                                                                        "s5_size": "size_1"}),
    s5_df[["s5_number", "s5_proteins", "s5_avg_spearman", "s5_size"]].rename(columns={"s5_number": "number_2",
                                                                                        "s5_proteins": "proteins_2",
                                                                                        "s5_avg_spearman": "avg_spearman_2",
                                                                                        "s5_size": "size_2"}),
    how="cross")
# drop rows where the two clusters are the same
s5_crossed_df.drop(s5_crossed_df[s5_crossed_df["number_1"] == s5_crossed_df["number_2"]].index, inplace=True)
# calculating total size
s5_crossed_df["total_size"] = s5_crossed_df["size_1"] + s5_crossed_df["size_2"]

# look for common proteins across clusters
common_proteins = []
for _, row in s5_crossed_df.iterrows():
    row_common_proteins = []
    for protein in row["proteins_1"]:
        if protein in row["proteins_2"]:
            row_common_proteins.append(protein)
    common_proteins.append(row_common_proteins)
s5_crossed_df["common_proteins"] = common_proteins

# add number of overlapping proteins
s5_crossed_df["num_overlap"] = s5_crossed_df["common_proteins"].apply(len)
# sort by number of overlaps
filtered_crossed_df = crossed_df[crossed_df["num_overlap"] > 0]
filtered_crossed_df.sort_values("num_overlap", ascending=False).head(5)

# get clusters with proteins unique WITHIN a stage
lst = []
for i in s5_df["s5_number"]:
    x = s5_crossed_df[s5_crossed_df["number_1"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s5_df["unique within stage"] = lst

# get clusters with proteins unique BETWEEN stages
lst = []
for i in s5_df["s5_number"]:
    x = crossed_df[crossed_df["s5_number"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s5_df["unique between stages"] = lst

In [154]:
unique_s5_clusters = s5_df[s5_df["unique within stage"] & 
      s5_df["unique between stages"] & 
      s5_df["large"] & 
      s5_df["significant"]].sort_values("s5_avg_spearman", ascending=False)
print(unique_s5_clusters.head(5))
unique_s5_clusters.to_csv("data/generated_tables/unique_s5_clusters.tsv", sep="\t", index=False)

    s5_number                                        s5_proteins  \
19         19               {A0A143ZY58, Q8I1S0, Q8IM66, Q8IKH3}   
31         31               {A0A5K1K967, Q8IL48, C6KTA3, C6KSV2}   
15         15  {Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...   
32         32                   {Q8I3N3, O96252, C6KT09, C6S3G2}   
40         40                   {Q8I5V6, Q8IIT3, C0H4W2, Q8I3Q7}   

    s5_avg_spearman  s5_size  avg_spearman_random  significant  large  stage  \
19         0.796937        4             0.388856         True   True      5   
31         0.795290        4             0.367192         True   True      5   
15         0.758082        8             0.421936         True   True      5   
32         0.747036        4             0.397200         True   True      5   
40         0.701087        4             0.413126         True   True      5   

    s5_essentiality  unique between stages  unique within stage  
19            0.500                   True  

In [155]:
s3_df[s3_df["unique within stage"] & s3_df["unique between stages"] & s3_df["large"] & s3_df["significant"]]
#{Q8I3M5, C6KTB3, O77312}	

Unnamed: 0,s3_number,s3_proteins,s3_avg_spearman,s3_size,avg_spearman_random,significant,large,stage,s3_essentiality,unique between stages,unique within stage
13,13,"{Q8I6S5, Q8IJ34, Q8IEA6}",0.474638,3,0.372556,True,True,3,0.333333,True,True
19,19,"{C6KTB3, O77312, Q8I3M5}",0.774704,3,0.38641,True,True,3,1.0,True,True


In [156]:
s5_df[s5_df["unique within stage"] & s5_df["unique between stages"] & s5_df["large"] & s5_df["significant"]].sort_values("s5_avg_spearman", ascending=False).head(5)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages,unique within stage
19,19,"{A0A143ZY58, Q8I1S0, Q8IM66, Q8IKH3}",0.796937,4,0.388856,True,True,5,0.5,True,True
31,31,"{A0A5K1K967, Q8IL48, C6KTA3, C6KSV2}",0.79529,4,0.367192,True,True,5,0.5,True,True
15,15,"{Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...",0.758082,8,0.421936,True,True,5,0.625,True,True
32,32,"{Q8I3N3, O96252, C6KT09, C6S3G2}",0.747036,4,0.3972,True,True,5,0.75,True,True
40,40,"{Q8I5V6, Q8IIT3, C0H4W2, Q8I3Q7}",0.701087,4,0.413126,True,True,5,0.25,True,True


In [157]:
# convert csv to tsv
s3_ppi_clusters = pd.read_csv("data/Stage_3_PPI_predicted_features_with_clusters1.csv", sep=",")
s3_ppi_clusters.to_csv("data/Stage_3_PPI_predicted_features_with_clusters1.tsv", sep="\t", index=False)

s5_ppi_clusters = pd.read_csv("data/Stage_5_PPI_predicted_features_with_clusters1.csv", sep=",")
s5_ppi_clusters.to_csv("data/Stage_5_PPI_predicted_features_with_clusters1.tsv", sep="\t", index=False)

# create a map of protein accessions to protein name/functions
protein_desc_map = {}
for row in s5_ppi_clusters.iterrows():
    if row[1]["Protein1"] not in protein_desc_map:
        protein_desc_map[row[1]["Protein1"]] = row[1]["Description1"]
    if row[1]["Protein2"] not in protein_desc_map:
        protein_desc_map[row[1]["Protein2"]] = row[1]["Description2"]
        
for row in s3_ppi_clusters.iterrows():
    if row[1]["Protein1"] not in protein_desc_map:
        protein_desc_map[row[1]["Protein1"]] = row[1]["Description1"]
    if row[1]["Protein2"] not in protein_desc_map:
        protein_desc_map[row[1]["Protein2"]] = row[1]["Description2"]

def get_protein_desc_from_acccession(cluster):
    return [protein_desc_map[protein] for protein in cluster]


In [158]:
get_protein_desc_from_acccession(s5_df["s5_proteins"][0])
get_protein_desc_from_acccession(s3_df["s3_proteins"][4])

['Aminopeptidase P',
 '20 kDa chaperonin',
 'Triosephosphate isomerase',
 'Heat shock protein 70']

In [159]:
# append putative tag if cluster contains a putative protein
def append_putative(row, protein_col):
    proteins = get_protein_desc_from_acccession(row[protein_col])
    contains_putative = False
    for p in proteins:
        if 'putative' in p:
            contains_putative = True
            break
    return contains_putative

def append_uncharacterized(row, protein_col):
    proteins = get_protein_desc_from_acccession(row[protein_col])
    contains_uncharacterized = False
    for p in proteins:
        if 'uncharacterized' in p.lower():
            contains_uncharacterized = True
            break
    return contains_uncharacterized

s5_df["contains_putative"] = s5_df.apply(append_putative, args=["s5_proteins"], axis=1)
s3_df["contains_putative"] = s3_df.apply(append_putative, args=["s3_proteins"], axis=1)

s5_df["contains_uncharacterized"] = s5_df.apply(append_uncharacterized, args=["s5_proteins"], axis=1)
s3_df["contains_uncharacterized"] = s3_df.apply(append_uncharacterized, args=["s3_proteins"], axis=1)

In [160]:
# rename columns
cluster_concat_df = pd.concat(
    [s3_df.rename(columns={"s3_number": "Number",
                      "s3_proteins": "Proteins",
                      "s3_avg_spearman": "avg_spearman",
                      "s3_size": "size",
                      "s3_essentiality": "essentiality"}, inplace=False),
    s5_df.rename(columns={"s5_number": "Number",
                      "s5_proteins": "Proteins",
                      "s5_avg_spearman": "avg_spearman",
                      "s5_size": "size",
                      "s5_essentiality": "essentiality"}, inplace=False)]
)
cluster_concat_df = cluster_concat_df[cluster_concat_df["size"] > 2]
cluster_concat_df["descriptions"] = cluster_concat_df["Proteins"].apply(get_protein_desc_from_acccession)

In [161]:
cluster_concat_df[cluster_concat_df["contains_uncharacterized"]].sort_values("essentiality", ascending=False).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
74,74,"{Q8IKW0, Q8IM24, Q8IKS0, Q8I3X7}",0.560441,4,0.397528,True,True,5,1.0,True,False,False,True,"[LCCL domain-containing protein, LCCL/lectin a..."
56,56,"{Q8IKC8, Q8IEB8, Q8I485, Q8IK07}",0.091733,4,0.384779,False,True,5,0.75,False,True,False,True,"[Exported protein 2, Uncharacterized protein, ..."
10,10,"{Q8IBN4, C6KSV8, Q8I484, Q8I5J2, Q8I2X3, C0H571}",0.421278,6,0.398928,True,True,5,0.666667,False,False,True,True,"[Secreted ookinete protein, putative, ATP synt..."
17,17,"{Q8IKT2, C6KT25, Q8I3X4, Q8I5P5, P50250, Q8IE6...",0.489377,9,0.39314,True,True,5,0.555556,True,False,False,True,"[6-phosphogluconate dehydrogenase, decarboxyla..."
24,24,"{Q8I5A9, Q76NN8, Q8IBD1, O97285, O96252, Q8IKF...",0.312215,13,0.355444,False,True,3,0.461538,False,False,True,True,"[Ras-related protein Rab-2, Calcium-transporti..."


## Functions for analysis

#### For Clusters

In [162]:
# By clusters
def get_clusters_for_protein(protein, cluster_concat_df=cluster_concat_df):
    """gets a clusters by protein

    Args:
        protein (str): protein ID, either uniprot accession or plasmoDB ID
        cluster_concat_df (_type_, optional): DF containing clusters by row. Defaults to cluster_concat_df.

    Returns:
        _type_: augmented dataframe with clusters containing protein
    """
    print("get clusters for protein with function: ", protein_desc_map[protein])
    # get all clusters for a given protein
    clusters = cluster_concat_df[cluster_concat_df["Proteins"].apply(lambda x: protein in x)]
    return clusters

In [163]:
get_clusters_for_protein("Q8I0V2", cluster_concat_df)

get clusters for protein with function:  ATP synthase subunit beta


Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
0,0,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...",0.376935,16,0.366112,True,True,3,0.4375,False,False,True,True,"[Histone H2B, ATP synthase subunit beta, ATP-d..."
2,2,"{Q8IB03, Q8I289, Q8I1V1, K7NTP5, C6KTA4, O9625...",0.398492,26,0.368785,True,True,3,0.538462,False,False,True,False,"[Chaperone protein ClpB1, Heptatricopeptide re..."
14,14,"{Q8IAX8, Q8IIV1, Q8IJX8, Q8IJX3, C6KT18, Q8I60...",0.331886,14,0.369976,False,True,3,0.285714,False,False,True,False,"[DNA/RNA-binding protein ALBA1, Histone H2B, E..."
18,18,"{Q8IB03, Q8I289, Q8I1V1, K7NTP5, C6KTA4, O9625...",0.426576,21,0.3523,True,True,3,0.52381,False,False,True,False,"[Chaperone protein ClpB1, Heptatricopeptide re..."
24,24,"{Q8I5A9, Q76NN8, Q8IBD1, O97285, O96252, Q8IKF...",0.312215,13,0.355444,False,True,3,0.461538,False,False,True,True,"[Ras-related protein Rab-2, Calcium-transporti..."
2,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,0.39939,True,True,5,0.584615,False,False,True,False,"[26S protease regulatory subunit 8, putative, ..."
59,59,"{Q8IM15, Q8IBN4, A0A143ZZK9, A0A5K1K8W5, Q8I2X...",0.28722,6,0.395289,False,True,5,0.5,True,False,True,False,"[Plasmepsin III, Secreted ookinete protein, pu..."


#### For Proteins

In [164]:
# By proteins
def get_essential_proteins(protein_concat_df=protein_concat_df):
    return protein_concat_df[protein_concat_df["essential"] == True]

In [165]:
essential = get_essential_proteins()
essential[(essential["Degree"] < 10) & (essential["Degree"] > 5)].head()

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
4,Q8IIA4,PF3D7_1126000,threonine--tRNA ligase,0.119,-2.949,"['Q8IBS3', 'Q8IE10', 'Q8IDK7', 'Q8IDZ9', 'Q8IL...",True,7,0.00829,0.312109,5,False,True
6,C0H571,PF3D7_0929400,high molecular weight rhoptry protein 2,0.119,-3.01,"['Q8IKC8', 'Q8IBN4', 'Q6ZMA7', 'Q8I4T3', 'C6KS...",True,9,0.002461,0.310253,5,False,True
21,Q8IKF0,PF3D7_1468700,eukaryotic initiation factor 4A,0.12,-2.99,"['Q8I0V4', 'Q9TY94', 'C6KT23', 'Q8IDB0', 'Q8IC...",True,7,0.000639,0.316163,5,False,True
25,Q8I3I6,PF3D7_0528100,"AP-1/2 complex subunit beta, putative",0.12,-2.959,"['Q8IB24', 'Q8ILG6', 'Q8I2X4', 'C0H5H0', 'Q8I3...",True,6,0.005161,0.338432,5,False,True
28,Q8IIJ6,PF3D7_1117100,ubiquitin carboxyl-terminal hydrolase UCH54,0.121,-2.89,"['Q8II71', 'C0H5H0', 'Q8IJW0', 'A0A5K1K9F3', '...",True,9,0.000112,0.33906,5,False,True


In [166]:
# get cluster pairs with only one common protein
crossed_df[(crossed_df["num_overlap"] == 1) & (crossed_df["s3_size"] > 2) & (crossed_df["s5_size"] > 2)].head()

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
8,0,8,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q9TY94, Q7KQL5, A0A5K1K910, Q8IKF0, Q8III5, Q...",0.376935,0.489262,16,6,22,[Q8IKF0],1,0.166667,0.047619,0.107143
14,0,14,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q8IJM9, Q8IKJ0, Q8IJ28, Q8IBV7}",0.376935,0.503623,16,4,20,[Q8IBV7],1,0.25,0.052632,0.151316
20,0,20,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q8IK89, Q8ILZ7, O97306, Q8IJX3}",0.376935,0.387022,16,4,20,[Q8IK89],1,0.25,0.052632,0.151316
33,0,33,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{O96221, Q8ILX1, Q8I5L6, Q8IB60, Q8I1S0, Q8I5B3}",0.376935,0.496311,16,6,22,[Q8I5L6],1,0.166667,0.047619,0.107143
59,0,59,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...","{Q8IM15, Q8IBN4, A0A143ZZK9, A0A5K1K8W5, Q8I2X...",0.376935,0.28722,16,6,22,[Q8I0V2],1,0.166667,0.047619,0.107143


In [167]:
tmp = get_clusters_for_protein("Q8IKF0")
tmp[tmp["contains_putative"]]

get clusters for protein with function:  RNA helicase


Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
0,0,"{Q8IIV1, Q8I0V2, O97285, Q8IKF0, C6KT18, Q8IK8...",0.376935,16,0.366112,True,True,3,0.4375,False,False,True,True,"[Histone H2B, ATP synthase subunit beta, ATP-d..."
2,2,"{Q8IB03, Q8I289, Q8I1V1, K7NTP5, C6KTA4, O9625...",0.398492,26,0.368785,True,True,3,0.538462,False,False,True,False,"[Chaperone protein ClpB1, Heptatricopeptide re..."
10,10,"{A0A144A2H0, Q8IDZ8, C0H4V6, Q8IKF0, Q8II24, Q...",0.638065,9,0.375679,True,True,3,0.555556,False,False,True,False,"[Aminopeptidase P, 20 kDa chaperonin, 14-3-3 p..."
18,18,"{Q8IB03, Q8I289, Q8I1V1, K7NTP5, C6KTA4, O9625...",0.426576,21,0.3523,True,True,3,0.52381,False,False,True,False,"[Chaperone protein ClpB1, Heptatricopeptide re..."
24,24,"{Q8I5A9, Q76NN8, Q8IBD1, O97285, O96252, Q8IKF...",0.312215,13,0.355444,False,True,3,0.461538,False,False,True,True,"[Ras-related protein Rab-2, Calcium-transporti..."
8,8,"{Q9TY94, Q7KQL5, A0A5K1K910, Q8IKF0, Q8III5, Q...",0.489262,6,0.412184,True,True,5,0.333333,False,False,True,False,"[RNA helicase, Tubulin beta chain, aspartate c..."
77,77,"{Q9TY94, Q8IBG1, A0A144A1R5, Q7KQL5, Q8I463, Q...",0.302591,9,0.406181,False,True,5,0.666667,False,False,True,False,"[RNA helicase, Dynein heavy chain-like protein..."


In [168]:
cluster_concat_df[cluster_concat_df["contains_putative"] & cluster_concat_df["unique between stages"] & (cluster_concat_df["stage"] == 5)].sort_values(["avg_spearman", "essentiality"], ascending=False).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
66,66,"{Q8II42, C6S3I6, Q8I3A1, Q8II92, P61074}",0.940508,5,0.38321,True,True,5,0.6,True,False,True,False,"[Nucleic acid binding protein, putative, Repli..."
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True,False,True,True,"[Prefoldin subunit 5, putative, Probable prefo..."
36,36,"{C6S3I6, Q8I5Q3, Q8I3A1, Q8II42}",0.838632,4,0.419211,True,True,5,0.5,True,False,True,False,"[Replication factor A protein 3, putative, 10 ..."
19,19,"{A0A143ZY58, Q8I1S0, Q8IM66, Q8IKH3}",0.796937,4,0.388856,True,True,5,0.5,True,True,True,False,"[ADP-ribosylation factor 1, Small GTP-binding ..."
31,31,"{A0A5K1K967, Q8IL48, C6KTA3, C6KSV2}",0.79529,4,0.367192,True,True,5,0.5,True,True,True,False,"[Elongation factor 1-gamma, putative, tRNA imp..."


In [169]:
cluster_concat_df[cluster_concat_df["contains_putative"] & cluster_concat_df["unique between stages"] & (cluster_concat_df["stage"] == 5)].sort_values(["avg_spearman", "essentiality"], ascending=False).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
66,66,"{Q8II42, C6S3I6, Q8I3A1, Q8II92, P61074}",0.940508,5,0.38321,True,True,5,0.6,True,False,True,False,"[Nucleic acid binding protein, putative, Repli..."
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True,False,True,True,"[Prefoldin subunit 5, putative, Probable prefo..."
36,36,"{C6S3I6, Q8I5Q3, Q8I3A1, Q8II42}",0.838632,4,0.419211,True,True,5,0.5,True,False,True,False,"[Replication factor A protein 3, putative, 10 ..."
19,19,"{A0A143ZY58, Q8I1S0, Q8IM66, Q8IKH3}",0.796937,4,0.388856,True,True,5,0.5,True,True,True,False,"[ADP-ribosylation factor 1, Small GTP-binding ..."
31,31,"{A0A5K1K967, Q8IL48, C6KTA3, C6KSV2}",0.79529,4,0.367192,True,True,5,0.5,True,True,True,False,"[Elongation factor 1-gamma, putative, tRNA imp..."


In [170]:
protein_concat_df.sort_values("Degree", ascending=False).head(5)

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
114,C0H5H0,PF3D7_1344200,endoplasmic reticulum chaperone GRP170,0.143,-2.839,"['O97282', 'Q8II43', 'Q8IFP3', 'Q8ILP6', 'C6KS...",True,69,0.14856,0.445327,5,False,True
109,Q8IAX5,PF3D7_0813900,"40S ribosomal protein S16, putative",0.141,-3.879,"['Q8IE10', 'C6KT19', 'C6KT25', 'Q8IIB4', 'Q8IE...",True,58,0.033885,0.4261,5,False,True
337,Q8IM10,PF3D7_1408600,"40S ribosomal protein S8e, putative",1.0,-2.487,"['Q8IL02', 'O97313', 'Q8I487', 'Q8I0P6', 'Q8II...",False,56,0.013215,0.416858,5,False,False
41,Q8I323,PF3D7_0912900,"26S proteasome regulatory subunit RPN8, putative",0.122,-3.009,"['Q8ILE3', 'Q76NM6', 'Q8IKB2', 'Q8I0V2', 'Q8I2...",True,53,0.053808,0.421672,5,False,True
62,Q8IDR9,PF3D7_1342000,40S ribosomal protein S6,0.127,-2.986,"['Q8ILI2', 'Q8IIA2', 'Q8IK15', 'Q8IIV2', 'A0A1...",True,53,0.022999,0.415909,5,False,True


## Improving Clusters with Coexpression

In [171]:
# Checking for clusters with size < 6
s3_df[s3_df["s3_size"] < 6].sort_values("s3_size", ascending=False).head()

Unnamed: 0,s3_number,s3_proteins,s3_avg_spearman,s3_size,avg_spearman_random,significant,large,stage,s3_essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized
9,9,"{Q8IEJ6, Q8I0X1, Q8ILB6, P61074, O97227}",0.678557,5,0.3656,True,True,3,0.4,False,False,True,False
11,11,"{Q8I3N3, Q8I2I2, Q8IKI8, Q8I2N1, Q8IL75}",0.704249,5,0.376922,True,True,3,0.2,False,True,True,False
4,4,"{A0A144A2H0, Q8IDZ8, Q7KQM0, Q8II24}",0.935112,4,0.346397,True,True,3,0.5,True,False,False,False
5,5,"{Q8IEC8, Q8ILV6, Q8I2W2, Q8IDN6}",0.634717,4,0.34303,True,True,3,0.5,False,True,True,False
17,17,"{Q76NM4, Q8I3W9, A0A5K1K8H7, Q8IHR8}",0.48666,4,0.358231,True,True,3,0.75,False,True,False,False


In [172]:
s3_pairs_correlation = pd.read_csv("data/generated_tables/s3_interacting_pairs_correlation.tsv", sep="\t").drop("Unnamed: 0", axis=1)
s5_pairs_correlation = pd.read_csv("data/generated_tables/s5_interacting_pairs_correlation.tsv", sep="\t").drop("Unnamed: 0", axis=1)

In [173]:
from itertools import combinations

x = combinations(s3_df[s3_df["s3_size"] < 6].sort_values("s3_size", ascending=False).head(1)["s3_proteins"].values[0], 2)

In [213]:
s3_essentiality_df.dropna(subset=["3D7_MIS"], inplace=True)

In [175]:
cluster_9_pairs = []
for y in x:
    cluster_9_pairs.append(y)

In [176]:
cluster_9_pairs

[('Q8IEJ6', 'Q8I0X1'),
 ('Q8IEJ6', 'Q8ILB6'),
 ('Q8IEJ6', 'P61074'),
 ('Q8IEJ6', 'O97227'),
 ('Q8I0X1', 'Q8ILB6'),
 ('Q8I0X1', 'P61074'),
 ('Q8I0X1', 'O97227'),
 ('Q8ILB6', 'P61074'),
 ('Q8ILB6', 'O97227'),
 ('P61074', 'O97227')]

In [177]:
s5_essentiality_df[s5_essentiality_df["Accession ID"] == "C0H5H0"]

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
114,C0H5H0,PF3D7_1344200,endoplasmic reticulum chaperone GRP170,0.143,-2.839,"['O97282', 'Q8II43', 'Q8IFP3', 'Q8ILP6', 'C6KS...",True,69,0.14856,0.445327,5,False,True


In [178]:
s3_essentiality_df.sort_values("Betweenness Centrality", ascending=False).head(5)

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
146,K7NTP5,PF3D7_0831700,heat shock protein 70,1.0,0.585,"['Q8IJN7', 'C6KSV0', 'Q76NN8', 'Q8IB24', 'Q8IE...",True,41,0.122628,0.373715,3,True,False
104,C0H4V6,PF3D7_0818200,14-3-3 protein,0.585,-3.082,"['Q7KQL9', 'P61074', 'Q8I0V4', 'Q8IJN7', 'Q8II...",False,21,0.086894,0.348331,3,False,False
62,Q8II24,PF3D7_1134000,heat shock protein 70,0.182,-2.879,"['C0H4V6', 'Q8IKK7', 'Q8IKF0', 'O97227', 'Q8I2...",True,14,0.082584,0.332042,3,True,True
59,C6KSV0,PF3D7_0610400,histone H3,0.176,-2.61,"['K7NTP5', 'C6KT18', 'Q8IJ34', 'Q8IIV1', 'Q8IL...",True,19,0.05921,0.313972,3,False,True
36,Q8IC01,PF3D7_0708800,heat shock protein 110,0.14,-2.848,"['Q6ZMA8', 'Q8IDG2', 'Q8IK90', 'Q8I2J3', 'Q8ID...",True,21,0.057584,0.292113,3,False,True


In [179]:
crossed_df[crossed_df["num_overlap"] > 2].sort_values("similarity_score", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
1466,17,38,"{Q76NM4, Q8I3W9, A0A5K1K8H7, Q8IHR8}","{Q8IIK8, Q8I3W9, Q8I274, A0A5K1K8H7, C0H516, Q...",0.48666,0.639987,4,6,10,"[Q76NM4, Q8I3W9, A0A5K1K8H7]",3,0.75,0.428571,0.589286
996,11,72,"{Q8I3N3, Q8I2I2, Q8IKI8, Q8I2N1, Q8IL75}","{Q8I3N3, Q8IJC6, Q8I2I2, Q8I6T2, Q8IL75}",0.704249,0.599209,5,5,10,"[Q8I3N3, Q8I2I2, Q8IL75]",3,0.6,0.428571,0.514286
946,11,22,"{Q8I3N3, Q8I2I2, Q8IKI8, Q8I2N1, Q8IL75}","{Q8I3N3, Q8I2I2, Q8IEU2, Q8IBN4, Q8IL75, Q8IC43}",0.704249,0.379117,5,6,11,"[Q8I3N3, Q8I2I2, Q8IL75]",3,0.6,0.375,0.4875
89,1,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.738465,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15,0.625,0.340909,0.482955
509,6,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.721655,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15,0.625,0.340909,0.482955


In [180]:
protein_concat_df.dropna(subset=["3D7_MIS"], inplace=True)
protein_concat_df.isna().sum()

Accession ID              0
Gene ID                   0
Product Description       0
3D7_MIS                   0
3D7_MFS                   0
Neighbours                0
essential                 0
Degree                    0
Betweenness Centrality    0
Closeness Centrality      0
stage                     0
redundant                 0
under_MIS_threshold       0
dtype: int64

In [181]:
protein_concat_df[(protein_concat_df["under_MIS_threshold"] == False) & (protein_concat_df["stage"] == 3)].sort_values("Degree", ascending=False).head(5)

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
146,K7NTP5,PF3D7_0831700,heat shock protein 70,1.0,0.585,"['Q8IJN7', 'C6KSV0', 'Q76NN8', 'Q8IB24', 'Q8IE...",True,41,0.122628,0.373715,3,True,False
106,Q8I0V2,PF3D7_1235700,"ATP synthase subunit beta, mitochondrial",0.622,-2.512,"['C6KTA4', 'C6KT18', 'O97247', 'Q8IB24', 'Q8IE...",False,27,0.043595,0.339367,3,False,False
104,C0H4V6,PF3D7_0818200,14-3-3 protein,0.585,-3.082,"['Q7KQL9', 'P61074', 'Q8I0V4', 'Q8IJN7', 'Q8II...",False,21,0.086894,0.348331,3,False,False
119,Q8IFP3,PF3D7_0422300,alpha tubulin 2,0.966,-1.974,"['Q8I0P6', 'Q8IKK7', 'Q8I1V1', 'Q8IEC8', 'Q8IK...",False,18,0.021709,0.32967,3,False,False
108,Q8IJW0,PF3D7_1008400,"26S protease regulatory subunit 4, putative",0.745,-2.429,"['Q8I0P6', 'Q8I1V1', 'Q8IB03', 'Q8IES0', 'O962...",False,17,0.003012,0.284024,3,False,False


In [182]:
crossed_df[(crossed_df["num_overlap"] == 1) & (crossed_df["s3_size"] > 2) & (crossed_df["s5_size"] > 2)].sort_values("similarity_score", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
1142,13,50,"{Q8I6S5, Q8IJ34, Q8IEA6}","{Q8I623, Q8IDN6, O97249, Q8IJ34, Q8I431, O97319}",0.474638,0.608103,3,6,9,[Q8IJ34],1,0.333333,0.125,0.229167
1101,13,9,"{Q8I6S5, Q8IJ34, Q8IEA6}","{C6KSV3, Q8IEJ6, Q8IL11, Q8IEA6, Q8I0X1, O9722...",0.474638,0.461839,3,7,10,[Q8IEA6],1,0.333333,0.111111,0.222222
1115,13,23,"{Q8I6S5, Q8IJ34, Q8IEA6}","{Q8IKQ9, Q8IIX0, A0A144A1R5, Q8I463, O97249, Q...",0.474638,0.496267,3,9,12,[Q8IJ34],1,0.333333,0.090909,0.212121
1099,13,7,"{Q8I6S5, Q8IJ34, Q8IEA6}","{C6KSV3, Q8IFP3, Q8IEJ6, Q8IEA6, Q8I0X1, C6KTA...",0.474638,0.407307,3,9,12,[Q8IEA6],1,0.333333,0.090909,0.212121
684,8,12,"{C6KT50, Q7KQL9, Q8IJN7}","{Q8ILA4, Q8ID43, Q8IFP3, Q8I374, Q8IJN9, Q6ZLZ...",0.607708,0.592856,3,17,20,[Q7KQL9],1,0.333333,0.052632,0.192982


In [183]:
s5_crossed_df[(s5_crossed_df["num_overlap"] == 1) & (s5_crossed_df["total_size"] < 15) & (s5_crossed_df["common_proteins"].apply(lambda x: "Q8IC05" in x))].sort_values("total_size", ascending=False).head(5)

Unnamed: 0,number_1,proteins_1,avg_spearman_1,size_1,number_2,proteins_2,avg_spearman_2,size_2,total_size,common_proteins,num_overlap
1285,15,"{Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...",0.758082,8,25,"{Q8IJS2, Q8IFP2, Q76NM6, Q7KQL5, Q8IC05, Q8I320}",0.416601,6,14,[Q8IC05],1
2115,25,"{Q8IJS2, Q8IFP2, Q76NM6, Q7KQL5, Q8IC05, Q8I320}",0.416601,6,15,"{Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...",0.758082,8,14,[Q8IC05],1


In [184]:
s5_crossed_df[(s5_crossed_df["num_overlap"] == 1)].sort_values("total_size", ascending=False).head()

Unnamed: 0,number_1,proteins_1,avg_spearman_1,size_1,number_2,proteins_2,avg_spearman_2,size_2,total_size,common_proteins,num_overlap
1430,17,"{Q8IKT2, C6KT25, Q8I3X4, Q8I5P5, P50250, Q8IE6...",0.489377,9,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,74,[C6KT25],1
185,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,17,"{Q8IKT2, C6KT25, Q8I3X4, Q8I5P5, P50250, Q8IE6...",0.489377,9,74,[C6KT25],1
231,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,63,"{Q8IJN9, Q8IIR8, Q8IKT2, Q8I6U7, C6KT25, Q8I3X...",0.499435,8,73,[C6KT25],1
5294,63,"{Q8IJN9, Q8IIR8, Q8IKT2, Q8I6U7, C6KT25, Q8I3X...",0.499435,8,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,73,[C6KT25],1
1262,15,"{Q8ILE3, Q8IJM0, Q8IAR6, Q8I5M9, Q8IC05, Q8IKH...",0.758082,8,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,73,[Q8IEQ1],1


In [185]:
def get_cluster_descs(cluster_num, stage_num):
    return cluster_concat_df[(cluster_concat_df["stage"] == stage_num) & (cluster_concat_df["Number"] == cluster_num)].values[0][-1]

def get_proteins_from_cluster(cluster_num, stage_num):
    return cluster_concat_df[(cluster_concat_df["stage"] == stage_num) & (cluster_concat_df["Number"] == cluster_num)].values[0][1]

In [None]:
get_cluster_descs(16, 5)

['Signal peptide peptidase',
 'Mitochondrial-processing peptidase subunit beta, putative',
 '60S ribosomal protein L31',
 'High mobility group protein B2',
 'Endoplasmin, putative']

In [187]:
get_proteins_from_cluster(25, 5)
#  Q8I1V1 Q8I5M9 Q8IAR6 Q8IC05 Q8IEQ1 Q8IJM0 Q8IKH3 Q8ILE3 Q76NM6 Q7KQL5 Q8I320 Q8IC05 Q8IFP2 Q8IJS2

{'Q76NM6', 'Q7KQL5', 'Q8I320', 'Q8IC05', 'Q8IFP2', 'Q8IJS2'}

In [188]:
crossed_df[(crossed_df["s3_size"] > 2) & (crossed_df["s5_size"] > 2)].sort_values("similarity_score", ascending=False).head(5)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
1466,17,38,"{Q76NM4, Q8I3W9, A0A5K1K8H7, Q8IHR8}","{Q8IIK8, Q8I3W9, Q8I274, A0A5K1K8H7, C0H516, Q...",0.48666,0.639987,4,6,10,"[Q76NM4, Q8I3W9, A0A5K1K8H7]",3,0.75,0.428571,0.589286
996,11,72,"{Q8I3N3, Q8I2I2, Q8IKI8, Q8I2N1, Q8IL75}","{Q8I3N3, Q8IJC6, Q8I2I2, Q8I6T2, Q8IL75}",0.704249,0.599209,5,5,10,"[Q8I3N3, Q8I2I2, Q8IL75]",3,0.6,0.428571,0.514286
946,11,22,"{Q8I3N3, Q8I2I2, Q8IKI8, Q8I2N1, Q8IL75}","{Q8I3N3, Q8I2I2, Q8IEU2, Q8IBN4, Q8IL75, Q8IC43}",0.704249,0.379117,5,6,11,"[Q8I3N3, Q8I2I2, Q8IL75]",3,0.6,0.375,0.4875
509,6,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.721655,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15,0.625,0.340909,0.482955
89,1,5,"{Q8IDG2, Q7K6A9, Q76NM6, Q8IC01, Q8IBI3, Q8IEK...","{Q8IDG2, Q7K6A9, Q8II71, C0H4C7, C6KST5, Q8IBI...",0.738465,0.80047,24,35,59,"[Q8IDG2, Q7K6A9, Q8IBI3, Q8IK90, Q8II43, Q8IDG...",15,0.625,0.340909,0.482955


In [189]:
protein_concat_df[protein_concat_df["Degree"] < 10].sort_values("Closeness Centrality", ascending=True).head(10)

Unnamed: 0,Accession ID,Gene ID,Product Description,3D7_MIS,3D7_MFS,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage,redundant,under_MIS_threshold
71,Q8IJN8,PF3D7_1015800,ribonucleoside-diphosphate reductase small cha...,0.13,-3.167,['Q8IM38'],True,1,0.0,0.002625,5,True,True
97,Q8I2W4,PF3D7_0918900,gamma-glutamylcysteine synthetase,0.137,-2.846,['C0H551'],True,1,0.0,0.002625,5,False,True
316,C0H551,PF3D7_0922600,glutamine synthetase,0.999,-2.576,['Q8I2W4'],False,1,0.0,0.002625,5,False,False
13,Q8IM38,PF3D7_1405600,ribonucleoside-diphosphate reductase small cha...,0.12,-2.846,['Q8IJN8'],True,1,0.0,0.002625,5,True,True
357,Q8I305,PF3D7_0914700,major facilitator superfamily-related transpor...,1.0,0.707,['Q8II64'],True,1,0.0,0.002625,5,True,False
86,P62203,PF3D7_1434200,calmodulin,0.133,-3.037,['Q8IKV9'],True,1,0.0,0.002625,5,False,True
351,Q8II64,PF3D7_1129900,major facilitator superfamily-related transpor...,1.0,-1.042,['Q8I305'],True,1,0.0,0.002625,5,True,False
305,Q8IKV9,PF3D7_1451700,calcineurin subunit B,0.996,0.0,['P62203'],False,1,0.0,0.002625,5,False,False
150,Q8I492,PF3D7_0500800,mature parasite-infected erythrocyte surface a...,1.0,-1.237,['O96127'],False,1,0.0,0.00641,3,False,False
151,Q8IFM9,PF3D7_0423700,early transcribed membrane protein 4,1.0,0.0,['Q8IJM9'],False,1,0.0,0.00641,3,False,False


In [190]:
s3_df[s3_df["significant"] == False].sort_values("s3_size", ascending=False).head(5)

Unnamed: 0,s3_number,s3_proteins,s3_avg_spearman,s3_size,avg_spearman_random,significant,large,stage,s3_essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized
14,14,"{Q8IAX8, Q8IIV1, Q8IJX8, Q8IJX3, C6KT18, Q8I60...",0.331886,14,0.369976,False,True,3,0.285714,False,False,True,False
24,24,"{Q8I5A9, Q76NN8, Q8IBD1, O97285, O96252, Q8IKF...",0.312215,13,0.355444,False,True,3,0.461538,False,False,True,True
30,30,"{Q8IAX8, Q8IIV1, Q8IM01, Q8IJX3, C6KT18, Q8I5H...",0.339482,9,0.369432,False,True,3,0.111111,False,False,True,False


In [191]:
cluster_concat_df.sort_values("avg_spearman", ascending=False).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
66,66,"{Q8II42, C6S3I6, Q8I3A1, Q8II92, P61074}",0.940508,5,0.38321,True,True,5,0.6,True,False,True,False,"[Nucleic acid binding protein, putative, Repli..."
4,4,"{A0A144A2H0, Q8IDZ8, Q7KQM0, Q8II24}",0.935112,4,0.346397,True,True,3,0.5,True,False,False,False,"[Aminopeptidase P, 20 kDa chaperonin, Trioseph..."
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True,False,True,True,"[Prefoldin subunit 5, putative, Probable prefo..."
1,1,"{Q8IBS3, Q8IIW2, Q8ILP6, Q8IIA4, Q8IDZ9, Q8I246}",0.873057,6,0.384568,True,True,5,1.0,True,False,False,False,"[serine--tRNA ligase, phenylalanine--tRNA liga..."
36,36,"{C6S3I6, Q8I5Q3, Q8I3A1, Q8II42}",0.838632,4,0.419211,True,True,5,0.5,True,False,True,False,"[Replication factor A protein 3, putative, 10 ..."


In [192]:
cluster_concat_df[cluster_concat_df["size"] > 4].sort_values("avg_spearman", ascending=True).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
13,13,"{Q8IBB9, Q6ZMA7, Q8I476, Q8IM45, Q8IKC8, Q8I0U...",0.199703,22,0.395407,False,True,5,0.363636,False,False,True,True,"[GPI-anchored micronemal antigen, Parasitophor..."
59,59,"{Q8IM15, Q8IBN4, A0A143ZZK9, A0A5K1K8W5, Q8I2X...",0.28722,6,0.395289,False,True,5,0.5,True,False,True,False,"[Plasmepsin III, Secreted ookinete protein, pu..."
77,77,"{Q9TY94, Q8IBG1, A0A144A1R5, Q7KQL5, Q8I463, Q...",0.302591,9,0.406181,False,True,5,0.666667,False,False,True,False,"[RNA helicase, Dynein heavy chain-like protein..."
24,24,"{Q8I5A9, Q76NN8, Q8IBD1, O97285, O96252, Q8IKF...",0.312215,13,0.355444,False,True,3,0.461538,False,False,True,True,"[Ras-related protein Rab-2, Calcium-transporti..."
14,14,"{Q8IAX8, Q8IIV1, Q8IJX8, Q8IJX3, C6KT18, Q8I60...",0.331886,14,0.369976,False,True,3,0.285714,False,False,True,False,"[DNA/RNA-binding protein ALBA1, Histone H2B, E..."


In [201]:
s5_pairs_correlation[s5_pairs_correlation["Spearman_all"] < 0].sort_values("Spearman_all")

Unnamed: 0,Protein1,Protein2,Spearman_all,Pearson_all
991,O77381,Q8IK83,-0.628458,-0.380961
1702,Q8I0U8,Q8I2F2,-0.620553,-0.341463
1783,Q76NN6,Q8IE66,-0.569170,-0.143485
1378,A0A5K1K9E5,Q8IDR9,-0.569170,-0.341872
2053,Q8IDR9,Q8IK83,-0.546443,-0.445453
...,...,...,...,...
878,C0H5C2,Q8IIV2,-0.007905,0.589803
1122,Q8IEB8,Q8IIF0,-0.005929,0.016923
1377,A0A143ZZK9,C0H5H0,-0.004941,0.565102
1816,Q8IFM0,Q8IKY4,-0.004941,-0.004709


In [215]:
cluster_concat_df[cluster_concat_df["contains_uncharacterized"]].sort_values("avg_spearman", ascending=False).head(5)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized,descriptions
30,30,"{Q8II82, Q8I3Y6, Q8I3A4, Q8IBZ4, C0H4C7, Q8ILS...",0.8794,7,0.395047,True,True,5,0.428571,True,False,True,True,"[Prefoldin subunit 5, putative, Probable prefo..."
80,80,"{Q8IK89, Q8I3S3, O97306, Q8ILZ7}",0.615119,4,0.385262,True,True,5,0.0,True,False,True,True,"[Trailer hitch homolog, putative, mRNA-binding..."
12,12,"{Q8I490, Q8IDG9, Q8IEI6}",0.588274,3,0.422134,True,True,3,0.0,True,False,False,True,"[EMP1-trafficking protein, Uncharacterized pro..."
68,68,"{Q7KQK6, Q76NN6, A0A144A0F5, Q8IKA6}",0.569499,4,0.398992,True,True,5,0.25,False,False,True,True,"[GTP-binding nuclear protein, Ran-specific GTP..."
74,74,"{Q8IKW0, Q8IM24, Q8IKS0, Q8I3X7}",0.560441,4,0.397528,True,True,5,1.0,True,False,False,True,"[LCCL domain-containing protein, LCCL/lectin a..."


In [None]:
list(get_proteins_from_cluster(30, 5))

['Q8II82', 'Q8I3Y6', 'Q8I3A4', 'Q8IBZ4', 'C0H4C7', 'Q8ILS7', 'Q8IBR6']

In [209]:
list(get_proteins_from_cluster(0, 3))

['Q8IIV1',
 'Q8I0V2',
 'O97285',
 'Q8IKF0',
 'C6KT18',
 'Q8IK89',
 'Q8IKG9',
 'Q8IET7',
 'Q8I5L6',
 'Q8IB24',
 'Q8I5H4',
 'K7NTP5',
 'Q8IIV2',
 'Q8I0P6',
 'Q8IBV7',
 'C6KSV0']

In [210]:
get_protein_desc_from_acccession(list(get_proteins_from_cluster(0, 3)))

['Histone H2B',
 'ATP synthase subunit beta',
 'ATP-dependent RNA helicase DDX6',
 'RNA helicase',
 'Histone H2A',
 'Trailer hitch homolog, putative',
 'Uncharacterized protein',
 '40S ribosomal protein S7',
 'Clathrin heavy chain',
 'Heat shock 70 kDa protein',
 'Polyadenylate-binding protein',
 'Heat shock 70 kDa protein',
 'Histone H4',
 'Elongation factor 1-alpha',
 'Histone H2B',
 'Histone H3']

In [217]:
s5_df

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages,unique within stage,contains_putative,contains_uncharacterized
0,0,"{C0H4Y6, Q8I2V9, C0H5H0, Q8IIV8, Q8IKB2, O96230}",0.500198,6,0.371061,True,True,5,0.666667,False,True,False,False
1,1,"{Q8IBS3, Q8IIW2, Q8ILP6, Q8IIA4, Q8IDZ9, Q8I246}",0.873057,6,0.384568,True,True,5,1.000000,True,False,False,False
2,2,"{Q8I4U5, Q8IJX8, Q8IJM9, Q8IKM5, C6KT18, Q8IJ3...",0.516712,65,0.399390,True,True,5,0.584615,False,False,True,False
3,3,"{Q8IE84, Q76NM6, Q8IDS0, Q8I280, Q8I2H3, Q8IEP...",0.734660,7,0.424433,True,True,5,0.428571,False,True,True,False
4,4,"{Q8I3J0, Q8I5H2, Q8IIR7, C0H4U4, Q8I608, Q8ILI...",0.411491,7,0.389676,True,True,5,0.571429,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,79,"{Q8I305, Q8II64}",0.518775,2,0.408641,True,False,5,1.000000,True,True,True,False
80,80,"{Q8IK89, Q8I3S3, O97306, Q8ILZ7}",0.615119,4,0.385262,True,True,5,0.000000,True,False,True,True
81,81,"{Q8IAM2, Q9NLB2, Q8IL80, Q7KQL8}",0.670455,4,0.419797,True,True,5,0.500000,True,False,False,False
82,82,"{Q7KQK6, Q76NN6, Q8ILK1, Q8IKA6}",0.655138,4,0.388052,True,True,5,0.250000,False,False,True,False
