In [1]:
import numpy as np
import pandas as pd
import os
import glob
import requests

In [2]:
# High_AUC_PPV_FUNCTIONS = pd.read_csv("High_AUC_PPV_FUNCTIONS.txt", sep="\t")
# High_AUC_PPV_FUNCTIONS

In [3]:
# Top_148_test_union_function_groups = pd.read_csv("Top_148_test_union_function_groups.txt", sep="\t")
# Top_148_test_union_function_groups

### If needed, remember to change the permission of files and folders

## Protein file

In [4]:
def create_protein_filter_list(protein_file_name):
    protein_df = pd.read_csv(protein_file_name,  names = ["Protein_name","Score"], header=0, index_col=0)
    # protein_filter = protein_df[protein_df['Score'] > protein_cutoff_value]
    
    # choose top 20 proteins in each pred file
    protein_df["Rank"] = protein_df["Score"].rank(ascending=False) 
    protein_df = protein_df.sort_values("Rank")
    protein_filter = protein_df.head(20)
    return protein_filter

### In the protein file, max = 0.25, see the question from email

## IDR file

In [5]:
def compute_IDR_num(filter_IDRs):
    count_dict = dict()
    for ID in filter_IDRs["UniProtID"]:
        if ID in count_dict:
            count_dict[ID] += 1
        else:
            count_dict[ID] = 1

    count_list = []
    for ID in filter_IDRs["UniProtID"]:
        if ID in count_dict:
            count_list.append(count_dict.get(ID))
    filter_IDRs["The number of IDRs in each protein"] = count_list
    return count_list

In [6]:
def get_protein_name(query):
    base_url = "https://www.ebi.ac.uk/proteins/api/{}"
    url = base_url.format(query)
#     print("URL:", url)
    
    try:
        response = requests.get(url, headers={ "Accept" : "application/json"})
#         print("Status code:", response.raise_for_status())  # Check for errors: None == 200
        
        data_list = response.json()
#         print(type(data_list))
        data_dict = data_list[0]
#         print(type(data_dict))
        
        protein_name = data_dict.get('protein').get('recommendedName').get('fullName').get('value')
        return protein_name
    except requests.exceptions.RequestException as e:
        print("Error:", e)
        return None

def get_protein_name_list(uniprot_id_list):
    protein_name_list = []
    for uniprot_id in uniprot_id_list:
        query = "proteins?offset=0&accession=" + uniprot_id
        protein_name = get_protein_name(query)
        protein_name_list.append(protein_name)
    return protein_name_list

In [7]:
# https://plotly.com/python/v3/html-reports/#generate-html-reports-with-d3-graphsusing-python-plotly-and-pandas
def create_IDR_filter_list(IDR_file_name, IDR_cutoff_value1, IDR_cutoff_value2, filtered_protein_name):
    IDR_df = pd.read_csv(IDR_file_name, names = ["Protein IDR","Score"], header=0)
    
    # Split GO_term string in order to let user understand the meaning of each column
    # https://stackoverflow.com/questions/14745022/how-to-split-a-dataframe-string-column-into-two-columns
    # n=the number of columns you expect
    IDR_df[['UniProtID', 'Redundancy', 'Begin', 'End']] = IDR_df['Protein IDR'].str.split('_', n=4, expand=True)
    
    # combine the "beginning position" and "end position" columns
    IDR_df["The beginning and ending positions of IDR"] = IDR_df['Begin'].astype(str) +"-"+ IDR_df["End"]
    
    
    filter_IDRs = pd.DataFrame()
    for p_name in filtered_protein_name:
        for IDR_name in IDR_df["Protein IDR"].values:
            if p_name in IDR_name:
                s2 = IDR_df[IDR_df["Protein IDR"] == IDR_name]
                filter_IDRs = pd.concat([filter_IDRs, s2], ignore_index=True)
                
    # compute the total numbe of IDRs in each protein            
    count_list = compute_IDR_num(filter_IDRs)
    filter_IDRs["The number of IDRs in each protein"] = count_list
    
    # remove all IDRs whose score <= 0.5
    filter_IDRs = filter_IDRs[filter_IDRs["Score"] > IDR_cutoff_value1]
    
    # remove useless columns
    filter_IDRs = filter_IDRs.drop(columns=['Protein IDR', 'Redundancy', 'Begin', 'End'])
    
    # rename columns in order to make them more understandable
    filter_IDRs = filter_IDRs[['UniProtID', 'The beginning and ending positions of IDR', 'Score', 'The number of IDRs in each protein']]
    
    # if the protein has more than 2 IDRs, remove IDRs whose score <= 0.6
    temp_df = filter_IDRs[filter_IDRs["The number of IDRs in each protein"]>2]
    filter_IDRs = filter_IDRs.drop((temp_df["Score"] < 0.6).index.values)
    
    # hold two digits after the decimal points
#     filter_IDRs["Score"] = filter_IDRs["Score"].astype(float)
#     filter_IDRs["Score"] = filter_IDRs["Score"].round(2)
    
    # rename "Score"
#     filter_IDRs = filter_IDRs.rename(columns={"Score": "The probability of the association between the IDR and the function"})
    
    # remove the "Score" column since their probabilities are amostly 1, so not very informative/interesting
    filter_IDRs = filter_IDRs.drop(columns=['Score'])
    
    # add protein name column
    uniprot_id_list = filter_IDRs["UniProtID"]
    protein_name_list = get_protein_name_list(uniprot_id_list)
    filter_IDRs.insert(1, "Protein name", protein_name_list)

    html_filter_IDRs = filter_IDRs.to_html(index=False).replace('<table border="1" class="dataframe">','<table class="table table-striped">')
    html_filter_IDRs = html_filter_IDRs.replace('<td>', '<td style="text-align: center;">') # center text
    html_filter_IDRs= html_filter_IDRs.replace('<th>', '<th style="text-align: center;">') # center column names
    return html_filter_IDRs 

## Features

In [8]:
def create_feature_filter_list(feature_file_name):
    feature_df = pd.read_csv(feature_file_name, names = ["Feature name","Score"])
    filter_features = feature_df.dropna()
    
    # hold two digits after the decimal points
    filter_features["Score"] = filter_features["Score"].astype(float)
    filter_features["Score"] = filter_features["Score"].round(2)
    
    # rename "Score"
    filter_features = filter_features.rename(columns={"Score": "The strength and direction of association with function for each molecular feature"})    
    
    html_filter_features = filter_features.to_html(index=False).replace('<table border="1" class="dataframe">','<table class="table table-striped">')
    html_filter_features = html_filter_features.replace('<td>', '<td style="text-align: center;">') # center text
    html_filter_features = html_filter_features.replace('<th>', '<th style="text-align: center;">') # center column names
    return html_filter_features

## HTML

In [9]:
# refer to https://plotly.com/python/v3/html-reports/#generate-html-reports-with-d3-graphsusing-python-plotly-and-pandas
def convert_to_HTML_format(filter_IDRs, filter_features, function_name):
    html_string = '''
    <html>
        <head>
            <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
            <style>body{ margin:0 100; background:whitesmoke; }</style>
            
            <script> 
                window.addEventListener('DOMContentLoaded', (event) => {
                    // Get all the cells in the first column of the IDRs List table
                    const cells = document.querySelectorAll('table.table-striped:nth-of-type(1) td:nth-child(1)');

                    // Add a hyperlink to each cell
                    cells.forEach((cell) => {
                        const text = cell.innerText;
                        console.log(text)
                        const hyperlink = document.createElement('a');
                        hyperlink.href = "https://www.uniprot.org/uniprotkb/" + text + "/entry" + text; //add URL
                        hyperlink.textContent = text;
                        cell.innerText = '';
                        cell.appendChild(hyperlink);
                    });    
                });
            </script>
            
        </head>
        <body>
            <h1>Group/Function: ''' + function_name.split("/")[-1] + '''</h1>

            <!-- *** Two Lists *** --->
            <h2>IDRs List</h2>
            ''' + filter_IDRs + '''

            <h2>Features List</h2>
            <p><a href="http://142.150.219.123:5000/table/">Molecular Features Table</a></p>
            ''' + filter_features + '''
        </body>
    </html>'''
    
    function_name = function_name.split("/")[-1]
    open_path = f'../webpages/{function_name}.html' # put a variable within a string
    f = open(open_path,'w')
    f.write(html_string)
    f.close()

* Note:
If get rid of "=>" in the line "cells.forEach((cell) => {...", resulting in the hyperlinks do not work

# Functions

## Automatically read each csv file in one folder

### There is huge differences between using severl "if" and "if, elif, else"

In [10]:
# path = os.getcwd()
# print(path)

In [11]:
path = "../statistics_data"
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(len(csv_files))
  
num = 0
# loop over the list of csv files
for f in csv_files:
    
    protein_filter = pd.DataFrame([])
    filter_IDRs = pd.DataFrame([])
    filter_features = pd.DataFrame([])
    print("num:\t", num)
    
    # read the first csv file
    if num == 0:
        df_post_filename = f
        num += 1
    
    # read the second csv file
    elif num == 1:
        df_pred_filename = f
        num += 1
        
    # read the third csv file
    else:
        df_stat_filename = f
        num = 0
        
        
        # so far, read all files of one function
        # print("Group:\t", df_post_filename, df_pred_filename, df_stat_filename, "\t")
        
        # step1: protein
        protein_file_name = df_pred_filename
        # protein_cutoff_value = 0.15
        # protein_filter = create_protein_filter_list(protein_file_name, protein_cutoff_value)
        protein_filter = create_protein_filter_list(protein_file_name)
        filtered_protein_name = protein_filter.index.values
        print ("Protein\n", filtered_protein_name)

        # step2: IDRs
        IDR_file_name = df_post_filename
        IDR_cutoff_value1 = 0.5
        IDR_cutoff_value2 = 0.6
        html_filter_IDRs = create_IDR_filter_list(IDR_file_name, IDR_cutoff_value1, IDR_cutoff_value2, filtered_protein_name)
        # print("IDR\n", filter_IDRs)

        # step3: features
        feature_file_name = df_stat_filename
        html_filter_features = create_feature_filter_list(feature_file_name)
        # print("Feature\n", filter_features)

        # step4: HTML
        function_name = df_pred_filename.split()[0]
        function_name = function_name[:-1]
        convert_to_HTML_format(html_filter_IDRs, html_filter_features, function_name)

444
num:	 0
num:	 1
num:	 2
Protein
 ['P0CG33' 'Q9NYA3' 'A6NDK9' 'Q5VT06' 'Q9C0D2' 'A6NDN3' 'Q8N3K9' 'H3BSY2'
 'A8MQT2' 'Q8IYY4' 'A7E2F4' 'Q8WYP5' 'Q02952' 'Q8IYE1' 'H3BPF8' 'Q96SN8'
 'P15311' 'Q9P219' 'Q6NUN7' 'O95613']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].round(2)


num:	 0
num:	 1
num:	 2
Protein
 ['Q56NI9' 'Q8WYP5' 'O43663' 'P10243' 'P35251' 'O76021' 'Q69YH5' 'Q99741'
 'Q9UPQ0' 'Q96T88' 'P46013' 'Q14680' 'Q14207' 'Q7RTP6' 'P10244' 'Q5UIP0'
 'Q8NI77' 'P18583' 'Q86T82' 'Q12834']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].round(2)


num:	 0
num:	 1
num:	 2
Protein
 ['Q56NI9' 'Q8WYP5' 'O43663' 'P35251' 'Q69YH5' 'Q99741' 'P46013' 'P10243'
 'P10244' 'Q8NI77' 'Q14207' 'Q96T88' 'O76021' 'Q7RTP6' 'Q14680' 'Q8WWL7'
 'P18583' 'Q9UPQ0' 'Q5UIP0' 'Q86T82']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].round(2)


num:	 0
num:	 1
num:	 2
Protein
 ['Q08170' 'Q14498' 'Q13523' 'Q01130' 'P08621' 'Q14152' 'Q05519' 'Q13427'
 'Q9Y2W1' 'Q13247' 'Q15696' 'Q9NQ29' 'Q5T200' 'Q86VM9' 'Q16629' 'Q96IZ7'
 'O95232' 'Q9BRL6' 'O75494' 'P48634']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_features["Score"] = filter_features["Score"].round(2)


num:	 0
num:	 1
num:	 2
Protein
 ['Q08170' 'Q14498' 'Q13523' 'Q01130' 'P08621' 'Q14152' 'Q05519' 'Q13427'
 'Q9Y2W1' 'Q13247' 'Q15696' 'Q9NQ29' 'Q5T200' 'Q86VM9' 'Q16629' 'Q96IZ7'
 'O95232' 'Q9BRL6' 'O75494' 'P48634']



KeyboardInterrupt

