In [1]:
import numpy as np
import pandas as pd
import os
import glob

In [2]:
# High_AUC_PPV_FUNCTIONS = pd.read_csv("High_AUC_PPV_FUNCTIONS.txt", sep="\t")
# High_AUC_PPV_FUNCTIONS

In [3]:
# Top_148_test_union_function_groups = pd.read_csv("Top_148_test_union_function_groups.txt", sep="\t")
# Top_148_test_union_function_groups

### If needed, remember to change the permissions of the files and folders

## Protein file

In [4]:
def create_protein_filter_list(protein_file_name, protein_cutoff_value):
    """Load per-protein scores and keep the rows scoring above a cutoff.

    The first line of the CSV is treated as a header and discarded; the
    columns are relabelled "Protein_name" and "Score", and the first column
    becomes the index of the returned frame.

    Parameters
    ----------
    protein_file_name : anything ``pd.read_csv`` accepts (path or buffer).
    protein_cutoff_value : number
        Only rows whose "Score" is strictly greater than this are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by protein name, with the "Score" column.
    """
    scores = pd.read_csv(
        protein_file_name,
        names=["Protein_name", "Score"],
        header=0,
        index_col=0,
    )
    above_cutoff = scores["Score"] > protein_cutoff_value
    return scores[above_cutoff]

### In the protein file, max = 0.25, see the question from email

## IDR file

In [5]:
# https://plotly.com/python/v3/html-reports/#generate-html-reports-with-d3-graphsusing-python-plotly-and-pandas
def create_IDR_filter_list(IDR_file_name, IDR_cutoff_value, filtered_protein_name):
    """Build the HTML table of high-scoring IDRs for the selected proteins.

    The IDR CSV is read with its first line discarded as a header and its
    columns relabelled "Protein IDR" and "Score". An IDR row is selected when
    any entry of ``filtered_protein_name`` occurs as a substring of its
    "Protein IDR" value and its score is strictly greater than
    ``IDR_cutoff_value``. Each selected name is split on "_" into
    UniProtID / Redundancy / start / end; the Redundancy field is dropped and
    the score is rounded to two decimals.

    Parameters
    ----------
    IDR_file_name : anything ``pd.read_csv`` accepts (path or buffer).
    IDR_cutoff_value : number
        Exclusive lower bound on "Score".
    filtered_protein_name : iterable of str
        Protein names to look for inside the IDR names. May be empty.

    Returns
    -------
    str
        An HTML ``<table>`` (Bootstrap "table table-striped" class, centred
        cells). When nothing is selected the table has a header row only.

    Raises
    ------
    ValueError
        If the selected IDR names do not split into exactly four
        "_"-separated fields.
    """
    score_label = "The probability of the association between the IDR and the function"
    report_columns = [
        'UniProtID',
        'The beginning position of IDR',
        'The ending position of IDR',
        score_label,
    ]

    IDR_df = pd.read_csv(IDR_file_name, names = ["Protein IDR","Score"], header=0)
    idr_names = IDR_df["Protein IDR"]

    # Collect row positions protein by protein so the report keeps the order of
    # `filtered_protein_name`. Matching is a plain substring test (no regex).
    # NOTE(review): a substring test also matches a short ID inside a longer
    # one (e.g. "P1" in "P12_..."); confirm whether prefix matching is wanted.
    row_positions = []
    for p_name in filtered_protein_name:
        hits = idr_names.str.contains(p_name, regex=False, na=False)
        row_positions.extend(np.flatnonzero(hits.to_numpy(dtype=bool)))
    # Each IDR row is reported once, at the position of its first match.
    row_positions = list(dict.fromkeys(row_positions))

    # Selecting by position keeps the columns even when nothing matched, so
    # the score filter below cannot fail on a column-less frame.
    filter_IDRs = IDR_df.iloc[row_positions]
    filter_IDRs = filter_IDRs[filter_IDRs["Score"] > IDR_cutoff_value]

    if filter_IDRs.empty:
        report = pd.DataFrame(columns=report_columns)
    else:
        # n=4 is the maximum number of splits, not the number of columns;
        # exactly four fields are required, as before.
        name_parts = filter_IDRs["Protein IDR"].str.split('_', n=4, expand=True)
        if name_parts.shape[1] != 4:
            raise ValueError(
                "IDR names must have the form 'UniProtID_Redundancy_start_end'; "
                f"got {name_parts.shape[1]} '_'-separated fields"
            )
        # Field 1 (Redundancy) is intentionally left out of the report.
        report = pd.DataFrame({
            'UniProtID': name_parts[0],
            'The beginning position of IDR': name_parts[2],
            'The ending position of IDR': name_parts[3],
            # hold two digits after the decimal point
            score_label: filter_IDRs["Score"].astype(float).round(2),
        })

    html_filter_IDRs = report.to_html(index=False).replace('<table border="1" class="dataframe">','<table class="table table-striped">')
    html_filter_IDRs = html_filter_IDRs.replace('<td>', '<td style="text-align: center;">') # center text
    html_filter_IDRs = html_filter_IDRs.replace('<th>', '<th style="text-align: center;">') # center column names
    return html_filter_IDRs

## Features

In [6]:
def create_feature_filter_list(feature_file_name):
    """Render the feature/score CSV as a centred, striped HTML table.

    The file is read without a header row and its columns are labelled
    "Feature name" and "Score". Rows containing any missing value are
    dropped, scores are cast to float and rounded to two decimals, and the
    score column is given its long descriptive title before rendering.

    Parameters
    ----------
    feature_file_name : anything ``pd.read_csv`` accepts (path or buffer).

    Returns
    -------
    str
        HTML ``<table>`` markup with the Bootstrap "table table-striped"
        class and centre-aligned ``<td>`` / ``<th>`` cells.
    """
    score_title = "The strength and direction of association with function for each molecular feature"

    features = (
        pd.read_csv(feature_file_name, names = ["Feature name","Score"])
        .dropna()
        # keep two digits after the decimal point
        .assign(Score=lambda frame: frame["Score"].astype(float).round(2))
        .rename(columns={"Score": score_title})
    )

    markup_swaps = (
        ('<table border="1" class="dataframe">', '<table class="table table-striped">'),
        ('<td>', '<td style="text-align: center;">'),  # center text
        ('<th>', '<th style="text-align: center;">'),  # center column names
    )
    table_html = features.to_html(index=False)
    for plain, styled in markup_swaps:
        table_html = table_html.replace(plain, styled)
    return table_html

## HTML

In [7]:
# refer to https://plotly.com/python/v3/html-reports/#generate-html-reports-with-d3-graphsusing-python-plotly-and-pandas
def convert_to_HTML_format(filter_IDRs, filter_features, function_name):
    """Write the HTML report page for one function/group.

    The page consists of a heading with the function name followed by the
    "IDRs List" and "Features List" tables, and is written to
    ``../webpages/<function_name>.html`` relative to the current working
    directory. The output directory is created if it does not exist.

    Parameters
    ----------
    filter_IDRs : str
        Ready-made HTML markup for the IDR table; inserted verbatim.
    filter_features : str
        Ready-made HTML markup for the feature table; inserted verbatim.
    function_name : str
        Shown in the page heading (HTML-escaped) and used as the file name.

    Returns
    -------
    None
    """
    # Local import: the notebook's import cell does not bring in `html`.
    import html

    # Only the heading text is escaped; the two tables are already HTML.
    # The page declares UTF-8 because the file is written as UTF-8 below.
    html_string = '''
    <html>
        <head>
            <meta charset="utf-8">
            <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
            <style>body{ margin:0 100; background:whitesmoke; }</style>
        </head>
        <body>
            <h1>Group/Function: ''' + html.escape(function_name) + '''</h1>

            <!-- *** Two Lists *** --->
            <h2>IDRs List</h2>
            ''' + filter_IDRs + '''

            <h2>Features List</h2>
            ''' + filter_features + '''
        </body>
    </html>'''

    open_path = f'../webpages/{function_name}.html' # put a variable within a string
    # Make sure the target folder exists so the first run does not fail.
    os.makedirs(os.path.dirname(open_path), exist_ok=True)
    # `with` closes the file even if the write raises.
    with open(open_path, 'w', encoding='utf-8') as report_file:
        report_file.write(html_string)

# Functions

## Automatically read each csv file in one folder

### There is a huge difference between using several separate "if" statements and an "if, elif, else" chain

In [8]:
# Collect every CSV file in the current working directory.
# glob returns matches in arbitrary order, so the list is sorted to make the
# grouping below deterministic.
# NOTE(review): this assumes the three files of one function sort next to each
# other in the order post, pred, stat — confirm against the real file names.
path = os.getcwd()
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
print(len(csv_files))

# Score cutoffs applied to every function (both are exclusive lower bounds).
protein_cutoff_value = 0.15
IDR_cutoff_value = 0.5

# Files are consumed in consecutive groups of three: post, pred, stat.
FILES_PER_FUNCTION = 3
leftover_count = len(csv_files) % FILES_PER_FUNCTION
if leftover_count:
    # An incomplete trailing group is never processed; report it.
    print("Ignoring incomplete group:\t", csv_files[len(csv_files) - leftover_count:])

# loop over the list of csv files, one group of three per function
for group_start in range(0, len(csv_files) - leftover_count, FILES_PER_FUNCTION):
    group_paths = csv_files[group_start:group_start + FILES_PER_FUNCTION]
    # The files live in the working directory, so the bare names are enough
    # to read them; basename is used instead of splitting on "/" so this
    # also works with other path separators.
    df_post_filename, df_pred_filename, df_stat_filename = (
        os.path.basename(csv_path) for csv_path in group_paths
    )

    # step1: protein
    protein_file_name = df_pred_filename
    protein_filter = create_protein_filter_list(protein_file_name, protein_cutoff_value)
    filtered_protein_name = protein_filter.index.values
    print ("Protein\n", filtered_protein_name)

    # step2: IDRs
    IDR_file_name = df_post_filename
    html_filter_IDRs = create_IDR_filter_list(IDR_file_name, IDR_cutoff_value, filtered_protein_name)

    # step3: features
    feature_file_name = df_stat_filename
    html_filter_features = create_feature_filter_list(feature_file_name)

    # step4: HTML
    # The function name is the first whitespace-separated token of the pred
    # file name with its last character removed.
    function_name = df_pred_filename.split()[0]
    function_name = function_name[:-1]
    convert_to_HTML_format(html_filter_IDRs, html_filter_features, function_name)

0
