In [81]:
import pandas as pd
import os
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests

pd.set_option('display.max_colwidth', None)

## Helper Functions

In [82]:
def plot_histogram_per_predclass(df, df_name):
    missing_percentage_per_class = {}
    total_rows_per_class = df.groupby('PredClass').size()
    
    for pred_class, group in df.groupby('PredClass'):
        missing_rows = group['Pred'].isnull().sum()
        missing_percentage = (missing_rows / total_rows_per_class[pred_class]) * 100
        missing_percentage_per_class[pred_class] = missing_percentage

    print(f"\n{df_name} - Percentage of missing 'Pred' values per 'PredClass':")
    for pred_class, missing_percentage in missing_percentage_per_class.items():
        print(f"{pred_class}: {missing_percentage:.2f}%")

    print(f"\n{df_name} - Number of rows per 'PredClass' where 'Pred' value is present:")
    for pred_class, group in df.groupby('PredClass'):
        rows_with_pred = group['Pred'].notnull().sum()
        print(f"{pred_class}: {rows_with_pred}")

        sorted_pred = group['Pred'].dropna().sort_values()   
        print(f"Top 5 highest 'Pred' values for {pred_class}: {sorted_pred[-5:].values}")
        print(f"Bottom 5 lowest 'Pred' values for {pred_class}: {sorted_pred[:5].values}")

    df_dropna = df.dropna(subset=['Pred'])
    plt.figure(figsize=(12, 6))
    for pred_class, group in df_dropna.groupby('PredClass'):
        plt.hist(group['Pred'], bins=20, alpha=0.5, label=str(pred_class))

    plt.xlabel('Pred Value')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of Pred Values per PredClass for {df_name}')
    plt.legend(title='Pred Class', loc='upper right')
    plt.show()


## Data Imports 

In [84]:
directory = '/Users/kristinagrigaityte/PycharmProjects/pulls/Files/BSModel/Predictions'
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

for file in csv_files:
    df_name = os.path.splitext(file)[0] 
    globals()[df_name] = pd.read_csv(os.path.join(directory, file))

In [85]:
categories = ['Selection', 'Balancing Selection', 'Negative Freq-Dep. Selection', 'Overdominance']
prefixes = ['recent', 'old', 'medium']
indexes = ['1', '2', '3', '3']
chromosomes = ['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9', 'ch10',
               'ch11', 'ch12', 'ch13', 'ch14', 'ch15', 'ch16', 'ch17', 'ch18', 'ch19', 'ch20', 'ch21', 'ch22']

data_list = []

for chrom in chromosomes:
    count_dict = {'chromosome': chrom}
    
    for prefix in prefixes:
        for i, category in zip(indexes, categories):
            category_name_parts = category.split()
            formatted_category_name = category_name_parts[0].lower() + ''.join(word.capitalize() for word in category_name_parts[1:])
            df_name = f'prediction_{prefix}_{i}_{chrom}'
            df = globals()[df_name]
            
            selected_rows = df[df['PredClass'] == category]
            
            count = selected_rows.shape[0]
            count_dict[f'{prefix}_{formatted_category_name}'] = count
            
            if category != 'Overdominance':
                if count > 0:
                    avg_score = selected_rows['Pred'].mean()
                else:
                    avg_score = 0.0
                count_dict[f'{prefix}_{formatted_category_name}_avg'] = f"{avg_score:.3f}"
    
    data_list.append(count_dict)

counts_df = pd.DataFrame(data_list)
counts_df

Unnamed: 0,chromosome,recent_selection,recent_selection_avg,recent_balancingSelection,recent_balancingSelection_avg,recent_negativeFreq-dep.Selection,recent_negativeFreq-dep.Selection_avg,recent_overdominance,old_selection,old_selection_avg,...,old_negativeFreq-dep.Selection,old_negativeFreq-dep.Selection_avg,old_overdominance,medium_selection,medium_selection_avg,medium_balancingSelection,medium_balancingSelection_avg,medium_negativeFreq-dep.Selection,medium_negativeFreq-dep.Selection_avg,medium_overdominance
0,ch1,150,0.998,23,0.606,147,0.565,8583,150,0.997,...,150,0.655,8580,150,1.0,0,0.0,150,0.709,8580
1,ch2,1,0.985,0,0.0,1,0.526,9060,1,0.986,...,1,0.549,9060,1,0.998,0,0.0,1,0.575,9060
2,ch3,763,0.998,106,0.619,738,0.552,6633,763,0.998,...,763,0.628,6608,763,1.0,17,0.6,763,0.678,6608
3,ch4,454,0.999,58,0.594,453,0.573,5730,454,0.999,...,454,0.676,5729,454,1.0,0,0.0,454,0.736,5729
4,ch5,133,0.999,10,0.579,133,0.573,6339,133,0.999,...,133,0.672,6339,133,1.0,0,0.0,133,0.732,6339
5,ch6,1393,0.998,252,0.601,1375,0.562,5693,1393,0.999,...,1393,0.652,5675,1393,1.0,1,0.509,1393,0.704,5675
6,ch7,353,0.999,28,0.599,350,0.58,5583,353,0.999,...,353,0.687,5580,353,1.0,0,0.0,353,0.752,5580
7,ch8,145,0.994,20,0.567,144,0.546,5590,145,0.992,...,145,0.614,5589,145,0.999,0,0.0,145,0.662,5589
8,ch9,390,0.999,54,0.629,385,0.579,4579,390,0.999,...,390,0.684,4574,390,1.0,0,0.0,390,0.744,4574
9,ch10,429,0.999,74,0.615,425,0.584,5455,429,0.999,...,429,0.7,5451,429,1.0,0,0.0,429,0.759,5451


In [86]:
def plot_line_with_values(dataframe, prefix, legend_labels, colors, filename):
    relevant_columns = [col for col in dataframe.columns if col.startswith(prefix) and "_avg" in col]
    plot_data = dataframe[['chromosome'] + relevant_columns]
    
    plot_data.set_index('chromosome', inplace=True)
    plot_data = plot_data.apply(pd.to_numeric, errors='coerce')
    plot_data.dropna(subset=relevant_columns, how='all', inplace=True)
    
    ax = plot_data.plot(kind='line', figsize=(14, 7), color=colors, marker='o')
    plt.title(f'Line Plot for {prefix.capitalize()} Selection')
    plt.xlabel('Chromosome')
    plt.ylabel('Prediction Score')
    
    new_labels = [legend_labels.get(col, col) for col in relevant_columns]
    plt.legend(new_labels, title='Selection Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    ax.set_xticks(range(len(plot_data.index)))
    ax.set_xticklabels(plot_data.index, rotation=45, ha='right')
    
    for line in ax.get_lines():
        for x, y in zip(line.get_xdata(), line.get_ydata()):
            if y == 0:
                ax.annotate(f'{y:.2f}', xy=(x, y), xytext=(5, 5), textcoords='offset points')

    plt.tight_layout()
    
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

legend_labels_avg = {
    'recent_selection_avg': 'Selection Prediction Average',
    'recent_balancingSelection_avg': 'Balancing Selection Prediction Average',
    'recent_negativeFreq-dep.Selection_avg': 'Negative Freq-dep. Selection Prediction Average',
    
    'old_selection_avg': 'Selection Prediction Average',
    'old_balancingSelection_avg': 'Balancing Selection Prediction Average',
    'old_negativeFreq-dep.Selection_avg': 'Negative Freq-dep. Selection Prediction Average',
    
    'medium_selection_avg': 'Selection Prediction Average',
    'medium_balancingSelection_avg': 'Balancing Selection Prediction Average',
    'medium_negativeFreq-dep.Selection_avg': 'Negative Freq-dep. Selection Prediction Average'
}

secure_blue = '#3357A4'
peaceful_mint = '#A0E0D8'
innovation_purple = '#879BEF'

colors_dict = {
    'old': [secure_blue, peaceful_mint, innovation_purple],
    'recent': [secure_blue, peaceful_mint, innovation_purple],
    'medium': [secure_blue, peaceful_mint, innovation_purple]
}

for prefix in ['old', 'recent', 'medium']:
    filename = f'{prefix}_selection_plot.png'
    plot_line_with_values(counts_df, prefix, legend_labels_avg, colors_dict[prefix], filename)
    plt.show()

In [123]:
def plot_stacked_bars(dataframe, prefix, legend_labels, colors, filename):
    relevant_columns = [col for col in dataframe.columns if col.startswith(prefix) and "avg" not in col]
    plot_data = dataframe[['chromosome'] + relevant_columns]

    plot_data.set_index('chromosome', inplace=True)

    ax = plot_data.plot(kind='bar', stacked=True, figsize=(14, 7), color=colors)
    plt.title(f'Stacked Bar Plot for {prefix.capitalize()} Selection')
    plt.xlabel('Chromosome')
    plt.ylabel('SNP Count')
    
    new_labels = [legend_labels.get(col, col) for col in relevant_columns]
    
    plt.legend(new_labels, title='Selection Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    
    plt.savefig(filename)
    plt.close()

legend_labels = {
    'recent_selection': 'Selection',
    'recent_balancingSelection': 'Balancing Selection',
    'recent_negativeFreq-dep.Selection': 'Negative Freq-dep. Selection',
    'recent_overdominance': 'Overdominance',
    
    'old_selection': 'Selection',
    'old_balancingSelection': 'Balancing Selection',
    'old_negativeFreq-dep.Selection': 'Negative Freq-dep. Selection',
    'old_overdominance': 'Overdominance',
    
    'medium_selection': 'Selection',
    'medium_balancingSelection': 'Balancing Selection',
    'medium_negativeFreq-dep.Selection': 'Negative Freq-dep. Selection',
    'medium_overdominance': 'Overdominance',
}

secure_blue = '#879BEF'
peaceful_mint = '#FFBD00'
innovation_purple = '#72bcd4'
navy = '#3357A4'

colors_dict = {
    'old': [secure_blue, peaceful_mint, innovation_purple, navy],
    'recent': [secure_blue, peaceful_mint, innovation_purple, navy],
    'medium': [secure_blue, peaceful_mint, innovation_purple, navy]
}

for prefix in ['old', 'recent', 'medium']:
    filename = f"{prefix}_stacked_bar_plot.png"
    plot_stacked_bars(counts_df, prefix, legend_labels, colors_dict[prefix], filename)

In [88]:
available_dfs = [var_name for var_name, var in globals().items() if isinstance(var, pd.DataFrame)]

print("Available Dataframes:")
for df_name in available_dfs:
    print(df_name)

Available Dataframes:
__
___
prediction_recent_1_ch15
prediction_old_1_ch15
prediction_medium_3_ch21
prediction_medium_2_ch21
prediction_old_1_ch5
prediction_medium_1_ch3
prediction_medium_1_ch2
prediction_old_1_ch4
prediction_medium_2_ch20
prediction_medium_3_ch20
prediction_old_1_ch14
prediction_recent_1_ch14
prediction_recent_1_ch16
prediction_old_1_ch16
prediction_medium_2_ch22
prediction_medium_3_ch22
prediction_old_1_ch6
prediction_medium_1_ch1
prediction_old_1_ch7
prediction_old_1_ch17
prediction_recent_1_ch17
prediction_old_1_ch13
prediction_recent_1_ch13
prediction_old_1_ch3
prediction_recent_1_ch9
prediction_medium_1_ch5
prediction_medium_1_ch4
prediction_recent_1_ch8
prediction_old_1_ch2
prediction_recent_1_ch12
prediction_old_1_ch12
prediction_old_1_ch10
prediction_recent_1_ch10
prediction_medium_3_ch18
prediction_medium_2_ch18
prediction_medium_1_ch6
prediction_medium_1_ch7
prediction_old_1_ch1
prediction_medium_2_ch19
prediction_medium_3_ch19
prediction_recent_1_ch11
pred

## Archaic Matches

In [48]:
archaic = pd.read_csv('/Users/kristinagrigaityte/PycharmProjects/pulls/Files/BSModel/Predictions/archaic.csv')
archaic.tail()

Unnamed: 0,chr,seg,from,to,AltaiNean,AltaiDeni
1594,22,103,49054049,49103380,0.5859,0.5636
1595,22,104,49103660,49153192,0.9831,0.9636
1596,22,105,49307031,49353474,0.1228,0.2586
1597,22,106,49404049,49450895,0.5225,0.5405
1598,22,107,49460591,49502569,0.7818,0.5862


In [100]:
def add_chr_column(df, chr_number):
    df = df.copy()
    df.loc[:, 'chr'] = chr_number
    return df

def get_df_name(prefix, chr_number):
    return f'{prefix}_ch{chr_number}'

prefixes = ['prediction_recent_2', 'prediction_recent_3']
pred_classes = ["Balancing Selection", "Overdominance", "Negative Freq-Dep. Selection"]
all_dfs = []

for chr_number in range(1, 23):  # Using 1 to 2 for this example
    for prefix in prefixes:
        df_name = get_df_name(prefix, chr_number)
        try:
            df = globals()[df_name]
            for pred_class in pred_classes:
                filtered_df = df[df['PredClass'] == pred_class]
                if not filtered_df.empty:
                    filtered_df = add_chr_column(filtered_df, chr_number)
                    all_dfs.append(filtered_df)
        except KeyError:
            continue

all_data = pd.concat(all_dfs, ignore_index=True)
all_data.tail()

Unnamed: 0,SNP,Positions,Pred,PredClass,chr
109654,rs131718,49404789,0.586998,Negative Freq-Dep. Selection,22
109655,rs131715,49413787,0.585034,Negative Freq-Dep. Selection,22
109656,rs9616915,49464446,0.550545,Negative Freq-Dep. Selection,22
109657,rs739365,49487182,0.597457,Negative Freq-Dep. Selection,22
109658,rs6009951,49498216,0.574389,Negative Freq-Dep. Selection,22


### Genes

In [101]:
%%time

def find_snps_in_range(row, all_data):
    snps_in_range = all_data[
        (all_data['chr'] == row['chr']) & 
        (all_data['Positions'] >= row['from']) & 
        (all_data['Positions'] <= row['to'])
    ]
    grouped_snps = snps_in_range.groupby('PredClass').agg({'SNP': list}).reset_index()
    return grouped_snps

new_rows = []

for _, row in archaic.iterrows():
    snps_grouped = find_snps_in_range(row, all_data)
    for _, snp_group in snps_grouped.iterrows():
        new_rows.append([
            row['chr'], row['seg'], row['from'], row['to'], row['AltaiNean'], row['AltaiDeni'],
            snp_group['SNP'], snp_group['PredClass']
        ])

flattened_df = pd.DataFrame(new_rows, columns=['chr', 'seg', 'from', 'to', 'AltaiNean', 'AltaiDeni', 'SNPs', 'PredClass'])

CPU times: user 1.18 s, sys: 30.7 ms, total: 1.22 s
Wall time: 1.24 s


In [102]:
flattened_df.head()

Unnamed: 0,chr,seg,from,to,AltaiNean,AltaiDeni,SNPs,PredClass
0,1.0,0.0,1613367.0,1662895.0,0.6667,0.3529,[rs3817856],Overdominance
1,1.0,1.0,4568842.0,4612366.0,0.8226,0.8438,"[rs763219, rs2411887, rs1905304, rs1107467]",Overdominance
2,1.0,2.0,6563721.0,6612894.0,0.2703,0.1081,[rs12127219],Overdominance
3,1.0,3.0,7415787.0,7462976.0,0.3509,0.4407,"[rs9988557, rs1750836, rs2693953, rs6702132, rs1149331, rs1149335, rs1011124, rs1011127, rs1149338, rs1193182, rs1193185]",Overdominance
4,1.0,8.0,21763132.0,21812739.0,0.3125,0.2632,[rs12124283],Overdominance


In [107]:
row_count = len(flattened_df)
print(row_count)

1111


In [105]:
filtered_df = flattened_df[(flattened_df['AltaiNean'] > 0.9) & (flattened_df['AltaiDeni'] > 0.9)]
filtered_df.head()

Unnamed: 0,chr,seg,from,to,AltaiNean,AltaiDeni,SNPs,PredClass
25,1.0,44.0,120063627.0,120112597.0,0.9286,0.925,"[rs523395, rs517237, rs532208, rs536662]",Overdominance
47,1.0,83.0,219913313.0,219963090.0,1.0,1.0,"[rs7523649, rs953111, rs1815577, rs11118827, rs6687398, rs908857, rs2280339, rs908858, rs982292, rs17010629]",Overdominance
49,1.0,85.0,227313297.0,227361299.0,1.0,1.0,[rs3923210],Overdominance
50,1.0,86.0,227363765.0,227411863.0,1.0,1.0,"[rs6693489, rs365684, rs12026629, rs869676, rs8179432, rs10916446]",Overdominance
58,2.0,3.0,1336661.0,1378969.0,0.9821,0.9836,"[rs9749785, rs4927602, rs4076290]",Overdominance


In [108]:
row_count = len(filtered_df)
print(row_count)

92


In [109]:
%%time

def get_genes_for_snp(snp_id, chr_column):
    server = "https://grch37.rest.ensembl.org"
    ext = f"/variation/human/{snp_id}?content-type=application/json"
    r = requests.get(server + ext, headers={"Content-Type": "application/json"})
    
    if not r.ok:
        return []
    
    data = r.json()
    genes = []
    
    if 'mappings' in data:
        for mapping in data['mappings']:
            location = f"{chr_column}:{mapping['start']}-{mapping['end']}"
            ext = f"/overlap/region/human/{location}?feature=gene;content-type=application/json"
            r = requests.get(server + ext, headers={"Content-Type": "application/json"})
            
            if not r.ok:
                continue
            
            gene_data = r.json()
            
            for gene in gene_data:
                genes.append(gene['id'])   
    
    return list(set(genes))

def add_gene_info(row):
    all_genes = []
    chr_column = str(int(row['chr']))  
    
    for snp in row['SNPs']:
        gene_ids = get_genes_for_snp(snp, chr_column)
        if gene_ids:
            all_genes.extend(gene_ids)
    
    return list(set(all_genes))

filtered_df = filtered_df.copy()
filtered_df['Genes'] = filtered_df.apply(add_gene_info, axis=1)

CPU times: user 53.6 s, sys: 2.53 s, total: 56.1 s
Wall time: 6min 17s


In [110]:
filtered_df.tail()

Unnamed: 0,chr,seg,from,to,AltaiNean,AltaiDeni,SNPs,PredClass,Genes
1073,22.0,81.0,44107968.0,44149935.0,0.9167,1.0,"[rs11704481, rs2064068, rs1044742, rs5764698]",Balancing Selection,"[ENSG00000100376, ENSG00000077935]"
1074,22.0,81.0,44107968.0,44149935.0,0.9167,1.0,"[rs11704481, rs2064068, rs1044742, rs5764698]",Negative Freq-Dep. Selection,"[ENSG00000100376, ENSG00000077935]"
1075,22.0,81.0,44107968.0,44149935.0,0.9167,1.0,"[rs9614462, rs719925]",Overdominance,[ENSG00000077935]
1090,22.0,93.0,47432419.0,47445957.0,0.9107,0.963,"[rs5771906, rs6010568]",Negative Freq-Dep. Selection,[ENSG00000219438]
1104,22.0,104.0,49103660.0,49153192.0,0.9831,0.9636,[rs4074135],Overdominance,[ENSG00000205593]


In [124]:
row_count = len(filtered_df)
print(row_count)

92


### Proteins

In [139]:
%%time

def get_protein_info_for_gene(gene_id):
    server = "https://grch37.rest.ensembl.org"
    ext = f"/lookup/id/{gene_id}?content-type=application/json"
    r = requests.get(server + ext, headers={"Content-Type": "application/json"})
    
    if not r.ok:
        return None
    
    data = r.json()
    protein_name = data.get('display_name', 'Unknown')
    description = data.get('description', 'No description available')
    
    return [protein_name, description]

def add_protein_info(genes):
    protein_info = []
    for gene in genes:
        info = get_protein_info_for_gene(gene)
        if info:
            protein_info.append(info)
    return protein_info

filtered_df['Proteins'] = filtered_df['Genes'].apply(add_protein_info)
filtered_df = filtered_df[~((filtered_df['Genes'] == '[]') & (filtered_df['Proteins'] == '[]'))]

CPU times: user 3.85 s, sys: 199 ms, total: 4.05 s
Wall time: 25.6 s


In [140]:
filtered_df

Unnamed: 0,chr,seg,from,to,AltaiNean,AltaiDeni,SNPs,PredClass,Genes,Proteins
25,1.0,44.0,120063627.0,120112597.0,0.9286,0.925,"[rs523395, rs517237, rs532208, rs536662]",Overdominance,"[ENSG00000092621, ENSG00000134240]","[[PHGDH, phosphoglycerate dehydrogenase [Source:HGNC Symbol;Acc:8923]], [HMGCS2, 3-hydroxy-3-methylglutaryl-CoA synthase 2 (mitochondrial) [Source:HGNC Symbol;Acc:5008]]]"
47,1.0,83.0,219913313.0,219963090.0,1.0,1.0,"[rs7523649, rs953111, rs1815577, rs11118827, rs6687398, rs908857, rs2280339, rs908858, rs982292, rs17010629]",Overdominance,[ENSG00000143507],"[[DUSP10, dual specificity phosphatase 10 [Source:HGNC Symbol;Acc:3065]]]"
49,1.0,85.0,227313297.0,227361299.0,1.0,1.0,[rs3923210],Overdominance,[],[]
50,1.0,86.0,227363765.0,227411863.0,1.0,1.0,"[rs6693489, rs365684, rs12026629, rs869676, rs8179432, rs10916446]",Overdominance,[],[]
58,2.0,3.0,1336661.0,1378969.0,0.9821,0.9836,"[rs9749785, rs4927602, rs4076290]",Overdominance,"[ENSG00000172554, ENSG00000115705]","[[SNTG2, syntrophin, gamma 2 [Source:HGNC Symbol;Acc:13741]], [TPO, thyroid peroxidase [Source:HGNC Symbol;Acc:12015]]]"
62,2.0,7.0,10931001.0,10979979.0,0.9186,0.9877,[rs921605],Overdominance,[],[]
66,2.0,13.0,21381268.0,21416614.0,0.9712,0.9608,[rs219543],Overdominance,[ENSG00000233005],"[[AC067959.1, No description available]]"
95,2.0,50.0,64530894.0,64579779.0,0.92,0.9796,"[rs12617093, rs888527, rs12477656, rs13409095, rs6756027, rs12104828, rs6546079]",Overdominance,[ENSG00000223863],"[[AC008074.4, No description available]]"
97,2.0,52.0,71032316.0,71079245.0,0.9677,0.9677,"[rs10210235, rs2072462, rs2072463, rs7562668, rs17721027]",Overdominance,"[ENSG00000116039, ENSG00000258881]","[[ATP6V1B1, ATPase, H+ transporting, lysosomal 56/58kDa, V1 subunit B1 [Source:HGNC Symbol;Acc:853]], [AC007040.11, Uncharacterized protein [Source:UniProtKB/TrEMBL;Acc:U3KQ87]]]"
104,2.0,62.0,79234333.0,79280710.0,0.9149,0.92,"[rs892865, rs7592506]",Overdominance,"[ENSG00000224879, ENSG00000066032]","[[AC011754.1, No description available], [CTNNA2, catenin (cadherin-associated protein), alpha 2 [Source:HGNC Symbol;Acc:2510]]]"


In [131]:
row_count = len(filtered_df)
print(row_count)

92


In [132]:
def collect_protein_names(df):
    protein_data = []

    for index, row in df.iterrows():
        pred_class = row['PredClass']
        proteins = row['Proteins']
        
        for protein in proteins:
            protein_name = protein[0]
            protein_data.append((pred_class, protein_name))

    return protein_data

protein_data = collect_protein_names(filtered_df)
protein_df = pd.DataFrame(protein_data, columns=['PredClass', 'Protein'])
result_df = protein_df.groupby(['PredClass', 'Protein']).size().reset_index(name='Count')
result_df = result_df.sort_values(by=['Count'], ascending=[False])
pd.set_option('display.max_rows', None)
result_df

Unnamed: 0,PredClass,Protein,Count
42,Overdominance,HLA-DRB5,2
80,Overdominance,TAP2,2
0,Balancing Selection,FAM118A,1
56,Overdominance,MACROD2,1
64,Overdominance,PSMD13,1
63,Overdominance,PSMB9,1
62,Overdominance,PSMB8,1
61,Overdominance,PRPSAP2,1
60,Overdominance,POU5F1,1
59,Overdominance,PHGDH,1


In [137]:
proteins = pd.read_csv('/Users/kristinagrigaityte/PycharmProjects/pulls/Files/BSModel/Predictions/proteins.csv')
proteins

Unnamed: 0,Function,Protein,Description
0,Immune System and Inflammatory Response,HLA-DRB5,Involved in antigen presentation and immune response.
1,Immune System and Inflammatory Response,TAP2,Plays a role in peptide transport for antigen presentation.
2,General Cellular Function,FAM118A,Associated with various cellular functions.
3,General Cellular Function,MACROD2,Involved in protein de-ADP-ribosylation.
4,General Cellular Function,PSMD13,"A subunit of the proteasome, involved in protein degradation."
5,Immune System and Inflammatory Response,PSMB9,"Part of the immunoproteasome, involved in antigen processing."
6,Immune System and Inflammatory Response,PSMB8,"Part of the immunoproteasome, involved in antigen processing."
7,General Cellular Function,PRPSAP2,Involved in nucleotide biosynthesis.
8,Gene Regulation,POU5F1,Essential for maintaining pluripotency in embryonic stem cells.
9,Metabolism and Biosynthesis,PHGDH,Involved in serine biosynthesis.


In [138]:
gene_counts = proteins.groupby('Function').size().reset_index(name='Protein Count')
gene_counts

Unnamed: 0,Function,Protein Count
0,Gene Regulation,5
1,General Cellular Function,21
2,Genetic Disorders,1
3,Immune System and Inflammatory Response,23
4,Metabolism and Biosynthesis,5
5,Neural Development and Function,6
6,Not Available,25
7,Transport,2
8,Tumor Suppressor,1
