In [1]:
import os
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_ind_from_stats
import plotly.graph_objs as go
from plotly.offline import plot
import scipy
import json
import plotly.io as pio
from plotly.offline import iplot
import plotly as py
from matplotlib_venn import venn2, venn3
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px
from plotly.subplots import make_subplots
from scipy.stats import t
import seaborn as sns
from datetime import datetime, timedelta
import plotly.express as px
from pathlib import Path
import scipy.stats
import math
from select import select
import numpy as np
import pandas as pd


import requests
from requests.auth import HTTPBasicAuth
import django
from django.conf import settings
from django.contrib.auth.decorators import login_required, permission_required
from file_manager .models import DataAnalysisQueue, SampleRecord, \
    SavedVisualization, VisualizationApp, UserSettings, ProcessingApp
from django.shortcuts import render
from django.conf import settings
from schedule_archive.archive import schedule_stop

# Bootstrap Django so the ORM models imported above can be used from the
# notebook kernel.
# setdefault() only fills in the settings module when the environment does
# not already name one.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'rest.settings')
# Django refuses synchronous ORM calls when it detects a running event loop
# (as in a Jupyter kernel) unless this variable is set.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# NOTE(review): the file_manager model imports at the top of this cell run
# before django.setup(); that presumably works only because the kernel is
# launched with Django already configured -- confirm.
django.setup()
# Stop the automatic backup scheduler for this Jupyter notebook instance.
schedule_stop()

# Display every column when printing a DataFrame instead of abbreviating.
pd.options.display.max_columns = None
# Silence pandas' SettingWithCopyWarning ("A value is trying to be set on a
# copy of a slice from a DataFrame"); the pandas default is 'warn'.
pd.options.mode.chained_assignment = None


Django system initized and scheduler shutdown


In [2]:
# ---- constants ----
# Keys looked up in ``saved_settings`` for each of the six possible groups:
# "groupN_record" holds the run records of group N, "groupN_name" its label.
Group_LIST_NAME = [f"group{number}_record" for number in range(1, 7)]
GROUP_NAME = [f"group{number}_name" for number in range(1, 7)]
saved_settings = dict()
plot_options = dict()
# JPY_PARENT_PID being present in the environment is taken to mean the code
# runs inside a Jupyter notebook.
JUPYTER_MODE = os.environ.get("JPY_PARENT_PID") is not None
APPFOLDER = "./"
url_base = None

# ---- settings ----
# When True the plotting functions also write CSV/SVG files to APPFOLDER.
WRITE_OUTPUT = True


In [3]:
def get_run_name(queue_id):
    """Return the run (sample) names found in a processing task's results.

    Mainly used by the web app to populate drop-down lists.

    Args:
        queue_id (int): primary key of the task in ``DataAnalysisQueue``.

    Returns:
        pandas.Series | None: one run name per element, e.g.
            0              sample1
            1              sample2
            2              sample3
        An empty Series is returned when the task's processing app is not a
        FragPipe app. ``None`` is returned when ``queue_id`` is falsy or no
        task with that id exists.
    """
    if not queue_id:
        return None

    # Fetch the record once; it provides both the app name and the file.
    task = DataAnalysisQueue.objects.filter(pk=queue_id).first()
    if task is None:
        return None

    if "FragPipe" not in task.processing_app.name:
        # Explicit dtype: a bare pd.Series() has a version-dependent dtype.
        return pd.Series(dtype="object")

    # FragPipe results: every run contributes a "<run> MaxLFQ Intensity"
    # column to the peptide table (output_file_2).
    peptide = pd.read_table(task.output_file_2)
    run_names = [col.replace(" MaxLFQ Intensity", "")
                 for col in peptide.columns if " MaxLFQ Intensity" in col]
    return pd.Series(run_names)

In [4]:
def read_file(queue_id, input1=None, input2=None, process_app=None):
    """Load protein and peptide results and derive the analysis tables.

    The tables come either from the result files of a processing task
    (``queue_id``) or, when ``input1`` is given, from the two supplied files.

    Args:
        queue_id (int): id of the task in ``DataAnalysisQueue``; only used
            when ``input1`` is None.
        input1 (str, optional): protein table file (tab separated).
        input2 (str, optional): peptide table file (tab separated).
        process_app (str, optional): name of the processing program; it is
            overwritten with the task's app name when reading from the queue.

    Returns:
        dict: with the keys
            'meta'                quan method and processing program name
            'run_name'            DataFrame with a 'run name' column
            'protein_data'        protein table without intensity columns
            'abundances'          protein intensity matrix
            'protein_ID_matrix'   boolean identification matrix (proteins)
            'protein_ID_Summary'  per-run protein identification counts
            'peptide_intensities' peptide intensity matrix
            'peptide_ID_matrix'   boolean identification matrix (peptides)
            'peptide_ID_Summary'  per-run peptide identification counts

    Raises:
        ValueError: if ``input1`` is None and no task has id ``queue_id``.
    """
    min_unique_peptides = 1

    if input1 is None:
        # One query is enough: the same record holds both files and the app.
        task = DataAnalysisQueue.objects.filter(pk=queue_id).first()
        if task is None:
            raise ValueError(
                f"No DataAnalysisQueue record found for id {queue_id!r}")
        peptide_table = pd.read_table(task.output_file_2, low_memory=False)
        protein_table = pd.read_table(task.output_file_1, low_memory=False)
        process_app = task.processing_app.name
    else:
        peptide_table = pd.read_table(input2, low_memory=False)
        protein_table = pd.read_table(input1, low_memory=False)

    # Run names are the "<run> MaxLFQ Intensity" columns of the peptide
    # table with that suffix removed.
    experiment_names = [name.replace(" MaxLFQ Intensity", "")
                        for name in peptide_table.columns
                        if " MaxLFQ Intensity" in name]
    experiment_names = pd.DataFrame(experiment_names, columns=['run name'])

    # Copies are passed because the helpers modify the frame they receive.
    protein_data = proteinData(protein_table.copy(), True, min_unique_peptides)

    # Accessions that survived the contaminant / peptide-count filter.
    protein_list = protein_data["Accession"]

    # Protein abundance matrix, restricted to the surviving accessions (this
    # is what removes the contam_sp entries from the matrix).
    abundances = AbundanceMatrix(
        protein_table.copy(), False, min_unique_peptides, True)
    abundances = abundances.merge(protein_list, on='Accession', how='inner')

    # Identification matrix and its per-run summary.
    protein_ID_matrix = toIDMatrix(abundances).merge(
        protein_list, on='Accession', how='inner')
    protein_ID_summary = sumIDs(protein_ID_matrix)

    # Same derivation for peptides.
    peptide_intensities = AbundanceMatrix(
        peptide_table, maxLFQ_intensity=False, isProtein=False)
    peptide_ID_matrix = toIDMatrix(peptide_intensities)
    peptide_ID_summary = sumIDs(peptide_ID_matrix)

    return {'meta': {"quan_method": "Protein",
                     "processing_program": process_app},
            'run_name': experiment_names,
            'protein_data': protein_data,
            'abundances': abundances,
            'protein_ID_matrix': protein_ID_matrix,
            'protein_ID_Summary': protein_ID_summary,
            'peptide_intensities': peptide_intensities,
            'peptide_ID_matrix': peptide_ID_matrix,
            'peptide_ID_Summary': peptide_ID_summary,
            }


In [5]:

def filter_by_missing_values(data_object,
                             missing_value_thresh=33,
                             analysis_program="FragPipe"):
    """Drop proteins/peptides whose missing-value rate reaches the threshold.

    The missing-value rate of a row is the percentage of run columns in
    ``data_object["protein_ID_matrix"]`` that are not ``True``. Rows with a
    rate below ``missing_value_thresh`` are kept, and ``abundances``,
    ``protein_ID_matrix`` and ``protein_data`` are replaced in
    ``data_object`` by their merge with the kept identifiers.

    Args:
        data_object (dict): data for one experimental condition.
        missing_value_thresh (int, optional): exclusive upper limit of the
            missing-value rate in percent. Defaults to 33.
        analysis_program (str, optional): not read; the program name is
            taken from ``data_object["meta"]``. Defaults to "FragPipe".

    Returns:
        dict: the same ``data_object`` (modified in place). It is returned
        unchanged when the processing program is not a FragPipe one.
    """
    meta = data_object["meta"]
    name = ("Accession" if meta["quan_method"] == "Protein"
            else "Annotated Sequence")

    if "FragPipe" not in meta["processing_program"]:
        return data_object

    id_matrix = data_object["protein_ID_matrix"]
    is_key_column = id_matrix.columns.str.contains(name)
    run_columns = id_matrix.columns[~is_key_column]

    # Count, per row, the runs without an identification in one pass.
    missing_count = id_matrix[run_columns].ne(True).sum(axis=1)
    missing_rate = (missing_count / len(run_columns)) * 100

    # Identifier column(s) of the rows that pass the threshold.
    kept_keys = id_matrix.loc[missing_rate < missing_value_thresh,
                              is_key_column]

    # Restrict the tables to the remaining proteins/peptides.
    for table in ("abundances", "protein_ID_matrix", "protein_data"):
        data_object[table] = kept_keys.merge(data_object[table])

    return data_object




In [6]:
def NormalizeToMedian(data_object,
                      analysis_program="FragPipe",
                      newMedian=None):
    """Median-normalize every numeric column of ``data_object['abundances']``.

    Each column is multiplied by a target median and divided by its own
    median, so that all runs end up with the same median.

    * FragPipe data (``"FragPipe"`` in the processing program name): zeros
      are treated as missing when the medians are computed (the zeros
      themselves stay zero in the result). The target is the median of all
      non-zero values; ``newMedian`` is not used for FragPipe data.
    * Other data: medians ignore NaN only. The target is ``newMedian`` when
      given, otherwise the median of all values.

    Args:
        data_object (dict): must contain 'abundances' (DataFrame) and
            'meta' with a 'processing_program' entry.
        analysis_program (str, optional): not read; the program name is
            taken from ``data_object["meta"]``. Defaults to "FragPipe".
        newMedian (float, optional): target median for non-FragPipe data.
            Defaults to None.

    Returns:
        dict: the same ``data_object`` with 'abundances' normalized in place,
        e.g.
         'abundances':        Accession  3_TrypsinLysConly_3A4_channel2
                              A0A096LP49                    0.000000e+00
    """
    abundances = data_object['abundances']
    columns = list(abundances.select_dtypes(include=[np.number]).columns)
    if not columns:
        # Nothing numeric to normalize.
        return data_object

    is_fragpipe = "FragPipe" in data_object["meta"]["processing_program"]

    # Medians are computed on a float copy so that NaN can be stored even
    # when the abundance columns are integers.
    reference = abundances[columns].astype(float)
    if is_fragpipe:
        reference = reference.replace(0, np.nan)

    if is_fragpipe or newMedian is None:
        medianOfAll = np.nanmedian(reference.to_numpy())
    else:
        medianOfAll = newMedian

    for each_column in columns:
        column_median = np.nanmedian(reference[each_column].to_numpy())
        data_object['abundances'][each_column] = (
            medianOfAll * abundances[each_column] / column_median)

    return data_object




In [7]:
def calculate_cvs(data_object):
    """Compute mean, standard deviation and CV of every protein/peptide.

    The statistics are taken row-wise over all columns of
    ``data_object["abundances"]`` whose name does not contain the identifier
    column name; missing values are skipped.

    Args:
        data_object (dict): must contain 'abundances' and 'meta'.

    Returns:
        pandas.DataFrame: the identifier column ("Accession" or
        "Annotated Sequence") followed by 'intensity' (mean), 'stdev' and
        'CV' (stdev / mean in percent).
    """
    is_protein = data_object["meta"]["quan_method"] == "Protein"
    name = "Accession" if is_protein else "Annotated Sequence"

    abundances = data_object["abundances"]
    run_values = abundances.loc[:, ~abundances.columns.str.contains(name)]

    # Each statistic is computed once and reused for the CV.
    row_mean = run_values.mean(axis=1, skipna=True)
    row_std = run_values.std(axis=1, skipna=True)

    summary = abundances.assign(intensity=row_mean,
                                stdev=row_std,
                                CV=row_std / row_mean * 100)
    return summary.loc[:, [name, "intensity", "stdev", "CV"]]



In [8]:
def t_test_from_summary_stats(m1, m2, n1, n2, s1, s2, equal_var=False):
    """Run a two-sample t-test per position from summary statistics.

    Thin wrapper around ``scipy.stats.ttest_ind_from_stats`` that is called
    once for every index ``0 .. len(m1) - 1``.

    Args:
        m1: means of sample 1.
        m2: means of sample 2.
        n1: sample sizes of sample 1.
        n2: sample sizes of sample 2.
        s1: standard deviations of sample 1.
        s2: standard deviations of sample 2.
        equal_var (bool, optional): False performs Welch's t-test, True
            performs Student's t-test. Defaults to False.

    Returns:
        list: one p-value per position.
    """
    # Positional look-ups (m1[k]) are kept so that the inputs are accessed
    # exactly as before, whatever indexable container is passed.
    return [
        ttest_ind_from_stats(mean1=m1[k], std1=s1[k], nobs1=n1[k],
                             mean2=m2[k], std2=s2[k], nobs2=n2[k],
                             equal_var=equal_var).pvalue
        for k in range(len(m1))
    ]




In [9]:
def normalize(data, magic_num):
    """Scale ``data`` by the factor ``magic_num`` and return the product."""
    scaled = data * magic_num
    return scaled




In [10]:
def CombineSharedInformation(infoObject1, infoObject2):
    """Combine two data objects, keeping the proteins/peptides they share.

    Run names and ID summaries are stacked; the ID matrices, abundances and
    protein data are inner-joined on the identifier column, so only entries
    present in both objects remain.

    Args:
        infoObject1 (dict): first data object.
        infoObject2 (dict): second data object.

    Returns:
        dict | str: the combined data object, or the string
        "ERROR: incompatible data types" when the two 'meta' entries differ.
    """
    meta = infoObject1['meta']
    if meta["quan_method"] == 'Protein':
        name = 'Accession'
    else:
        name = 'Annotated Sequence'

    # Objects produced with different settings cannot be combined.
    if meta != infoObject2['meta']:
        return "ERROR: incompatible data types"

    # Only the identifier column(s) of the first protein table are kept, so
    # the annotation columns come from the second object.
    first_protein_data = infoObject1['protein_data']
    first_keys = first_protein_data.loc[
        :, first_protein_data.columns.str.contains(name)]

    return {
        'meta': meta,
        'run_name': pd.concat(
            [infoObject1['run_name'], infoObject2['run_name']], axis=0),
        'protein_data': pd.merge(
            first_keys, infoObject2['protein_data'], on=name),
        'abundances': pd.merge(
            infoObject1['abundances'], infoObject2['abundances'], on=name),
        'protein_ID_matrix': pd.merge(
            infoObject1['protein_ID_matrix'],
            infoObject2['protein_ID_matrix'], on=name),
        'protein_ID_Summary': pd.concat(
            [infoObject1['protein_ID_Summary'],
             infoObject2['protein_ID_Summary']], axis=0),
    }




In [11]:
def impute_knn(info_object, k=5):
    """Impute missing abundance values from the k nearest neighbours.

    All float64/int64 columns of ``info_object["abundances"]`` are passed
    through scikit-learn's ``KNNImputer`` and written back in place.

    NOTE(review): with KNNImputer's default settings only NaN counts as
    missing, so zeros are left untouched -- confirm callers convert zeros to
    NaN first if zeros are meant to be imputed.

    Args:
        info_object (dict): must contain 'abundances' and 'meta'.
        k (int, optional): number of neighbours used. Defaults to 5.

    Returns:
        dict: the same ``info_object`` with imputed 'abundances'.
    """
    if info_object["meta"]["quan_method"] == "Protein":
        name = "Accession"
    else:
        name = "Annotated Sequence"

    abundances = info_object["abundances"]
    identifiers = abundances[name]
    numeric_part = abundances.select_dtypes(include=['float64', 'int64'])

    knn = KNNImputer(n_neighbors=k)
    imputed = pd.DataFrame(knn.fit_transform(numeric_part),
                           columns=numeric_part.columns)

    # Assign through .values so the result is written positionally instead
    # of being aligned on the index.
    abundances.loc[:, numeric_part.columns] = imputed.values
    abundances[name] = identifiers
    return info_object




In [12]:
def CalculatePCAlog2(infoObject, infotib):
    """Run a PCA on the log2-transformed abundances.

    Runs are the observations and proteins/peptides the features. Features
    whose log2 values are not finite in every run (for example a zero
    abundance) are left out before the PCA is fitted.

    Args:
        infoObject (dict): must contain 'abundances' and 'meta'.
        infotib (pandas.DataFrame): per-run information; it is joined
            column-wise (inner join on the index) with the PCA scores.

    Returns:
        tuple: (``infotib`` joined with the score columns 'PC1', 'PC2', ...,
        array with the fraction of variance explained by each component).
    """
    abundances = infoObject['abundances']
    if infoObject['meta']["quan_method"] == 'Protein':
        name = 'Accession'
    else:
        name = 'Annotated Sequence'

    # Transpose so that rows are runs and columns are proteins/peptides.
    run_columns = ~abundances.columns.str.contains(name)
    log_values = np.log2(abundances.loc[:, run_columns].values.T)

    # A feature is usable only if its sum over all runs is finite.
    usable = np.isfinite(log_values.sum(axis=0))

    pca = PCA()
    scores = pca.fit_transform(log_values[:, usable])

    score_frame = pd.DataFrame(
        scores,
        columns=[f"PC{number}" for number in range(1, scores.shape[1] + 1)])

    # Put the run information next to the scores.
    pca_panda = pd.concat([infotib, score_frame], axis=1, join='inner')
    return pca_panda, pca.explained_variance_ratio_




In [13]:
# ##### import functions for ID plot ######
def proteinData(protein_table, isProtein, min_unique_peptides=1):
    """Return the annotation part of a protein or peptide result table.

    Adds the identifier column ("Accession" copied from "Protein ID", or
    "Annotated Sequence" copied from "Peptide Sequence"), removes
    contaminant entries (``contam_sp|`` prefix) and drops every column whose
    name contains "Intensity". Protein tables are additionally filtered on
    their peptide count.

    The identifier column is added to ``protein_table`` itself, so pass a
    copy if the original must stay untouched.

    Args:
        protein_table (pandas.DataFrame): protein or peptide table.
        isProtein (bool): True for a protein table, False for a peptide one.
        min_unique_peptides (int, optional): minimum value required in the
            "Combined Total Peptides" column (protein tables only).
            NOTE(review): despite the parameter name the filter uses the
            total, not the unique, peptide count -- confirm this is intended.
            Defaults to 1.

    Returns:
        pandas.DataFrame: the filtered table without intensity columns, e.g.
        with the columns
           Protein  Protein ID   Entry Name  ...
           solventblank2_channel1 Total Spectral Count
           Indistinguishable Proteins   Accession
    """
    contaminant_pattern = "contam_sp\\|"

    if isProtein:
        protein_table["Accession"] = protein_table["Protein ID"]
        # na=False: rows without a protein name are not contaminants; without
        # it a missing value makes the ``~`` negation fail.
        keep = ~protein_table['Protein'].str.contains(
            contaminant_pattern, na=False)
        keep &= protein_table["Combined Total Peptides"] >= min_unique_peptides
    else:
        protein_table["Annotated Sequence"] = protein_table["Peptide Sequence"]
        keep = ~protein_table['Mapped Proteins'].str.contains(
            contaminant_pattern, na=False)

    protein_table = protein_table[keep]

    columnPattern = "Intensity"
    return protein_table.loc[
        :, ~protein_table.columns.str.contains(columnPattern)]




In [14]:
def AbundanceMatrix(protein_table,
                    maxLFQ_intensity=False,
                    min_unique_peptides=1,
                    isProtein=True):
    """Extract the identifier column and the intensity columns of a table.

    The passed table is not modified.

    Args:
        protein_table (pandas.DataFrame): protein or peptide table.
        maxLFQ_intensity (bool, optional): True selects the
            "<run> MaxLFQ Intensity" columns, False the plain
            "<run> Intensity" columns. Defaults to False.
        min_unique_peptides (int, optional): not used by this function.
            Defaults to 1.
        isProtein (bool, optional): True for a protein table ("Protein ID"
            becomes "Accession"), False for a peptide table
            ("Peptide Sequence" becomes "Annotated Sequence").
            Defaults to True.

    Returns:
        pandas.DataFrame: the identifier column plus one column per run,
        with " Intensity" removed from the column names, e.g.
              Accession      10ng_QC_1_channel2      10ng_QC_2_channel1
        0        P00761             434574016.0             398855680.0
        1        P02769              86882328.0              66171816.0
    """
    if isProtein:
        source_column, mainColumn = "Protein ID", "Accession"
    else:
        source_column, mainColumn = "Peptide Sequence", "Annotated Sequence"

    # rename() without inplace returns a new frame, so the caller's table
    # keeps its original column names.
    table = protein_table.rename(columns={source_column: mainColumn})

    if maxLFQ_intensity:
        columnPattern = mainColumn + "| MaxLFQ Intensity"
        notPattern = "\\.\\.\\.M@D1"
    else:
        columnPattern = mainColumn + "| Intensity"
        notPattern = " MaxLFQ"

    # Keep columns matching columnPattern but not notPattern.
    selected = table.columns.str.contains(
        columnPattern) & ~table.columns.str.contains(notPattern)
    returnData = table.loc[:, selected]

    return returnData.rename(columns=lambda x: x.replace(" Intensity", ""))




In [15]:
def toIDMatrix(abundanceMatrix):
    """Turn an abundance matrix into a boolean identification matrix.

    Every numeric column is replaced by a boolean column that is True where
    the abundance is different from zero. Other columns are kept as they
    are, and the input is not modified.

    Args:
        abundanceMatrix (pandas.DataFrame): identifier column plus one
            numeric column per run.

    Returns:
        pandas.DataFrame: e.g.
               Accession  10ng_QC_1_channel2  ......
        0     A0A024RBG1               False    True
    """
    id_matrix = abundanceMatrix.copy()
    for run in id_matrix.select_dtypes(include='number').columns.tolist():
        # identified <=> a non-zero abundance was reported
        id_matrix[run] = id_matrix[run].ne(0)
    return id_matrix



In [16]:


def sumIDs(IDMatrix):
    """Summarize an identification matrix into per-run ID counts.

    Args:
        IDMatrix (pandas.DataFrame): identification matrix; every boolean
            column is treated as one run.

    Returns:
        pandas.DataFrame: one row per boolean column. 'Total_IDs' is the
        number of True entries; 'MS2_IDs' and 'MBR_IDs' are always NaN, e.g.
                            names  MS2_IDs  MBR_IDs  Total_IDs
        0      10ng_QC_1_channel2      NaN      NaN       3650
        1      10ng_QC_2_channel1      NaN      NaN       3604
    """
    run_columns = IDMatrix.select_dtypes(include='bool').columns.tolist()
    # Summing a boolean column counts its True entries.
    id_counts = [int(IDMatrix[run].sum()) for run in run_columns]
    not_available = [np.nan] * len(run_columns)

    return pd.DataFrame({'names': list(run_columns),
                         'MS2_IDs': not_available,
                         'MBR_IDs': list(not_available),
                         'Total_IDs': id_counts})




In [17]:
def filter_by_name(data_dict, runname_list):
    """Restrict a data object to the runs listed in ``runname_list``.

    Columns and summary rows are selected by exact run name, so a run called
    "s1" does not also select "s10". The identifier column ("Accession" or
    "Annotated Sequence") is always kept. ``runname_list`` is not modified.

    Args:
        data_dict (dict): full data object (see ``read_file``).
        runname_list (list): names of the runs to keep.

    Returns:
        dict: new dictionary with the keys 'protein_data' and 'meta' (passed
        through unchanged) and the filtered 'run_name', 'abundances',
        'protein_ID_matrix', 'protein_ID_Summary' and 'peptide_ID_Summary'.
    """
    if data_dict["meta"]["quan_method"] == "Protein":
        name = "Accession"
    else:
        name = "Annotated Sequence"

    # Build the selection on a copy so the caller's list is left untouched.
    wanted = [*runname_list, name]
    wanted_lookup = set(wanted)

    def _select_columns(frame):
        # Exact name match; the frame's column order is preserved.
        return frame[[col for col in frame.columns if col in wanted_lookup]]

    def _select_rows(frame, column):
        return frame[frame[column].isin(wanted)]

    filtered_data = {
        "protein_data": data_dict["protein_data"],
        "meta": data_dict["meta"],
    }
    filtered_data["run_name"] = _select_rows(
        data_dict["run_name"], "run name")
    filtered_data["abundances"] = _select_columns(data_dict["abundances"])
    filtered_data["protein_ID_matrix"] = _select_columns(
        data_dict["protein_ID_matrix"])
    filtered_data["protein_ID_Summary"] = _select_rows(
        data_dict["protein_ID_Summary"], "names")
    filtered_data["peptide_ID_Summary"] = _select_rows(
        data_dict["peptide_ID_Summary"], "names")
    return filtered_data

In [18]:
def ID_plots(data_object, plot_options, saved_settings, username=None):
    """Prepare the data for the protein/peptide identification bar plot.

    The runs are split into the groups defined in ``saved_settings``, the
    per-run identification counts of every group are collected, and mean,
    standard deviation, replicate count and 95 % confidence interval are
    computed per group before the plot is drawn.

    Args:
        data_object (dict): full data object (see ``read_file``).
        plot_options (dict): plot settings. "plot_type" ("1" protein,
            "2" peptide) and "ID mode" ("MS2", "stacked", anything else
            means total) are read here; the dictionary is also forwarded to
            ``plot_IDChart_plotly``.
        saved_settings (dict): holds, for group N, its label under
            "groupN_name" and its runs under "groupN_record" (a list of
            dictionaries with an 'option' entry).
        username (str, optional): forwarded to ``plot_IDChart_plotly``.

    Returns:
        whatever ``plot_IDChart_plotly`` returns.
    """
    # Pair every group label with the record list of the SAME group number;
    # groups without a label are skipped.
    group_dict = {}
    for name_key, record_key in zip(GROUP_NAME, Group_LIST_NAME):
        group_label = saved_settings.get(name_key)
        if not group_label:
            continue
        run_names = [d.get('option')
                     for d in (saved_settings.get(record_key) or [])]
        # a fresh list is passed so the run names cannot be altered
        group_dict[group_label] = filter_by_name(data_object, list(run_names))

    # Collect one record per run, ID type and available ID mode. The frame
    # is built once at the end instead of being grown row by row.
    id_columns = ["Names", "ID_Type", "ID_Mode", "Conditions", "IDs"]
    records = []
    for eachCondition, group_data in group_dict.items():
        for id_type, summary_key in (("protein", "protein_ID_Summary"),
                                     ("peptide", "peptide_ID_Summary")):
            for _, row in group_data[summary_key].iterrows():
                for item in ("MS2_IDs", "MBR_IDs", "Total_IDs"):
                    if not pd.isna(row[item]):
                        records.append({"Names": row["names"],
                                        "ID_Type": id_type,
                                        "ID_Mode": item,
                                        "Conditions": eachCondition,
                                        "IDs": row[item]})
    allIDs = pd.DataFrame(records, columns=id_columns)
    # ######################allIDs format###################
    # Names  ID_Type  ID_Mode    Conditions    IDs
    # file1  peptide  MS2_IDs    experiment 1  xxxxx
    # file2  protein  MBR_IDs    experiment 2  xxxx
    # file3  peptide  Total_IDs  experiment 3  xxx
    #######################################################

    # choose protein or peptide
    if plot_options["plot_type"] == "1":  # Protein ID
        allIDs = allIDs[allIDs["ID_Type"] == "protein"]
    elif plot_options["plot_type"] == "2":  # Peptide ID
        allIDs = allIDs[allIDs["ID_Type"] == "peptide"]

    # choose total, MS2 or stacked
    if plot_options["ID mode"] == "MS2":  # MS2 ID
        allIDs = allIDs[allIDs["ID_Mode"] == "MS2_IDs"]
    elif plot_options["ID mode"] == "stacked":  # all modes kept separate
        pass
    else:
        # Total IDs: use the reported totals when present, otherwise sum the
        # available modes of every run.
        totals = allIDs[allIDs["ID_Mode"] == "Total_IDs"]
        if totals.empty:
            allIDs = allIDs.groupby(
                ["Names", "ID_Type", "Conditions"],
                as_index=False)["IDs"].sum()
            allIDs["ID_Mode"] = "Total_IDs"
        else:
            allIDs = totals

    # Mean, standard deviation and number of replicates per group and mode.
    toPlotIDs = allIDs.groupby(["ID_Mode", "Conditions"]).agg({
        'IDs': ['mean', 'std', 'count'], 'ID_Type': 'first', })
    toPlotIDs.columns = ['IDs', 'stdev', 'n', 'ID_Type']
    toPlotIDs = toPlotIDs.reset_index()

    # Half-width of the 95 % confidence interval of the mean.
    toPlotIDs["confInt"] = t.ppf(0.975, toPlotIDs['n']-1) * \
        toPlotIDs['stdev']/np.sqrt(toPlotIDs['n'])

    return plot_IDChart_plotly(toPlotIDs,
                               username=username,
                               plot_options=plot_options)
def plot_IDChart_plotly(ID_data,
                        username=None,
                        plot_options=None):
    """Draw the identification bar plot and optionally export it.

    Args:
        ID_data (pandas.DataFrame): one row per group and ID mode with the
            columns "Conditions", "ID_Mode", "IDs", "stdev" and "confInt".
        username (str, optional): prefix of the exported file names.
            Defaults to None.
        plot_options (dict, optional): plot settings; the keys "ID mode",
            "error bar", "mean label", "color", "width", "height",
            "X Title", "Y Title" and "font" are read, so a dictionary is
            required in practice. Defaults to None.

    Returns:
        tuple: (plotly figure, CSV link, SVG link). Both links are None when
        ``WRITE_OUTPUT`` is off.
    """
    CSV_link = None
    SVG_link = None

    if plot_options["ID mode"] != "stacked":
        # error bar: standard deviation, 95 % confidence interval or none
        if plot_options["error bar"] == "stdev":
            error_bars = "stdev"
        elif plot_options["error bar"] == "ci95":
            error_bars = "confInt"
        else:
            error_bars = None

        # mean label: the rounded-down mean written above each bar
        if plot_options["mean label"] == "True" or \
                plot_options["mean label"] == True:
            total_labels = [{"x": x, "y": total*1.15, "text": str(
                int(total)), "showarrow": False} for x, total in zip(
                    ID_data["Conditions"], ID_data["IDs"])]
        else:
            total_labels = []   # no mean labels

        fig = px.bar(ID_data,
                     x="Conditions",
                     y="IDs",
                     error_y=error_bars,
                     color="Conditions",
                     color_discrete_sequence=plot_options["color"],
                     width=plot_options["width"],
                     height=plot_options["height"],
                     )
        fig.update_layout(xaxis_title=plot_options["X Title"],
                          yaxis_title=plot_options["Y Title"],
                          annotations=total_labels,
                          font=plot_options["font"]
                          )
    else:
        # Stacked mode: one bar per group, split by ID mode. The totals are
        # left out when individual modes are present so that nothing is
        # counted twice.
        parts = ID_data[ID_data["ID_Mode"] != "Total_IDs"]
        stacked_data = ID_data if parts.empty else parts
        fig = px.bar(stacked_data,
                     x="Conditions",
                     y="IDs",
                     color="ID_Mode",
                     barmode="relative",
                     color_discrete_sequence=plot_options["color"],
                     width=plot_options["width"],
                     height=plot_options["height"],
                     )
        fig.update_layout(xaxis_title=plot_options["X Title"],
                          yaxis_title=plot_options["Y Title"],
                          font=plot_options["font"]
                          )

    if WRITE_OUTPUT:
        # export the data to csv for user downloading
        data_dir = os.path.join(APPFOLDER, "csv/")
        Path(data_dir).mkdir(parents=True, exist_ok=True)
        ID_data.to_csv(os.path.join(
            data_dir, f"{username}_ID_data.csv"), index=False)
        # link for downloading the data
        CSV_link = f"/files/{url_base}/csv/" \
            f"{username}_ID_data.csv"

        # export the figure as SVG and build its download link
        SVG_link = f"/files/{url_base}/images/" \
            f"{username}_ID_Bar_Plot.svg"
        img_dir = os.path.join(APPFOLDER, "images/")
        Path(img_dir).mkdir(parents=True, exist_ok=True)
        fig.write_image(os.path.join(
            img_dir, f"{username}_ID_Bar_Plot.svg"))

    return fig, CSV_link, SVG_link

In [19]:
# CV Violin plots ###
def CV_plots(data_object, plot_options, saved_settings, username=None):
    """_Prepare data for creating protein CV violin plots_

    For every group with a non-empty name in ``saved_settings`` the runs
    listed in that group's record are selected, filtered for missing values
    and median-normalized; the per-group CV tables are then stacked and
    handed to ``plot_CV_violin``.

    Args:
        data_object: data passed through to ``filter_by_name``.
        plot_options (dict): forwarded unchanged to ``plot_CV_violin``.
        saved_settings (dict): ``group<N>_name`` strings and the matching
            ``group<N>_record`` lists of ``{"option": run_name}`` dicts.
        username (str, optional): forwarded to ``plot_CV_violin``.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)`` as returned by
        ``plot_CV_violin``.
    """
    # filter runs into different groups. The record key is derived from the
    # group's own position in GROUP_NAME, so an empty name in the middle of
    # the list cannot shift later groups onto the wrong record.
    group_dict = {}
    for position, name_key in enumerate(GROUP_NAME, start=1):
        group_name = saved_settings[name_key]
        if not group_name:
            continue
        runname_sublist = [d.get('option')
                           for d in saved_settings[f"group{position}_record"]]
        group_dict[group_name] = filter_by_name(
            data_object,
            list(runname_sublist))  # prevent the list from being changed

    # create a dictionary to store the intensity data
    Intensity_dict = {}
    for group_name, group_data in group_dict.items():
        current_condition_data = filter_by_missing_values(
            group_data,
            analysis_program="FragPipe")
        Intensity_dict[group_name] = NormalizeToMedian(
            current_condition_data, analysis_program="FragPipe")

    # collect the per-group CV tables and concatenate once
    cv_frames = [
        calculate_cvs(group_data).assign(Conditions=group_name)
        for group_name, group_data in Intensity_dict.items()
    ]
    if cv_frames:
        all_cvs = pd.concat(cv_frames, ignore_index=True)
    else:
        all_cvs = pd.DataFrame()  # no group configured
    # ###################### all_cvs format (example) ###################
    #      Accession     intensity          stdev          CV   Conditions
    # 0       A6NHR9  3.248547e+06  672989.819300   20.716643    DDMandDTT
    # 1       A8MTJ3  5.031539e+05  195535.383583   38.861944    DDMandDTT
    # 2       E9PAV3  5.330290e+05  161385.491163   30.277056    DDMandDTT
    # ####################################################################

    return plot_CV_violin(allCVs=all_cvs,
                          username=username,
                          plot_options=plot_options)


def plot_CV_violin(allCVs,
                   username=None,
                   plot_options=None,
                   ):
    """_Plot the CV violin plot for the given data._

    Args:
        allCVs (pandas.DataFrame): one row per CV value; the columns
            "Conditions" and "CV" are read here.
        username (str, optional): prefix of the exported file names.
            Defaults to None.
        plot_options (dict): reads the keys "mean label", "box",
            "violinmode", "color", "width", "height", "X Title", "Y Title",
            "ylimits" and "font".

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)``. Both links stay None unless
        WRITE_OUTPUT is set.
    """
    CSV_link = None
    SVG_link = None

    allCVs_summary = allCVs.groupby(["Conditions"]).agg(
        {'CV': ['median', 'mean']}).reset_index()
    allCVs_summary.columns = ["Conditions", 'meds', 'CoVar']

    # The option is called "mean label", but the value written above each
    # violin is the group median ('meds'), placed at 1.15 x that median.
    if plot_options["mean label"] in ("True", True):
        total_labels = [
            {"x": x, "y": total*1.15, "text": str(int(total)),
             "showarrow": False}
            for x, total in zip(allCVs_summary["Conditions"],
                                allCVs_summary["meds"])
            if pd.notna(total)  # int(NaN) would raise
        ]
    else:
        total_labels = []   # no labels

    # "box" may be a real boolean or the strings "True"/"False";
    # bool("False") is True, so strings have to be parsed explicitly.
    box_option = plot_options["box"]
    if isinstance(box_option, str):
        show_box = box_option.strip().lower() not in ("", "false")
    else:
        show_box = bool(box_option)

    # create the interactive plot
    fig = px.violin(allCVs,
                    x="Conditions",
                    y='CV',
                    color="Conditions",
                    box=show_box,
                    violinmode=plot_options["violinmode"],
                    hover_data=["Conditions", 'CV'],
                    color_discrete_sequence=plot_options["color"],
                    width=plot_options["width"],
                    height=plot_options["height"],
                    )

    fig.update_layout(
        yaxis=dict(title=plot_options["Y Title"],
                   range=plot_options["ylimits"]),
        font=plot_options["font"],
        xaxis=dict(title=plot_options["X Title"]),
        showlegend=True,
        annotations=total_labels
    )

    if WRITE_OUTPUT:
        # create the SVG file for download
        img_dir = os.path.join(APPFOLDER, "images/")
        Path(img_dir).mkdir(parents=True, exist_ok=True)
        fig.write_image(os.path.join(
            img_dir, f"{username}_CV_Violin_Plot.svg"))

        # create the CSV file for download and its link
        data_dir = os.path.join(APPFOLDER, "csv/")
        Path(data_dir).mkdir(parents=True, exist_ok=True)
        allCVs.to_csv(os.path.join(
            data_dir, f"{username}_all_CV.csv"), index=False)
        CSV_link = f"/files/{url_base}/csv/" \
            f"{username}_all_CV.csv"

        # download SVG link
        SVG_link = f"/files/{url_base}/images/" \
            f"{username}_CV_Violin_Plot.svg"

    return fig, CSV_link, SVG_link




In [20]:
def venns_plots(data_object, plot_options, saved_settings, username=None):
    """_Prepare data for creating ID Venn plots (up to three groups)_

    Args:
        data_object: data passed through to ``filter_by_name``.
        plot_options (dict): "compare groups" is a list of 0-based group
            indices; when it is missing, empty or not a list the first two
            groups are compared. The whole dict is forwarded to
            ``venn_to_plotly``.
        saved_settings (dict): ``group<N>_name`` strings and the matching
            ``group<N>_record`` lists of ``{"option": run_name}`` dicts.
        username (str, optional): forwarded to ``venn_to_plotly``.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)`` as returned by
        ``venn_to_plotly`` (which handles two or three sets).
    """
    compare_groups = plot_options.get("compare groups")
    if not compare_groups or not isinstance(compare_groups, list):
        # no compare groups is provided, compare first two
        selected = [0, 1]
    else:
        # groups are taken in ascending index order
        selected = [n for n in range(len(GROUP_NAME))
                    if n in compare_groups]

    # filter runs into different groups. Name and record are resolved from
    # the same group index, so an unnamed (skipped) group or an unsorted
    # "compare groups" list cannot pair a name with another group's runs.
    group_names = []
    group_dict = {}
    for n in selected:
        group_name = saved_settings[GROUP_NAME[n]]
        if not group_name:
            continue
        runname_sublist = [d.get('option')
                           for d in saved_settings[f"group{n + 1}_record"]]
        group_names.append(group_name)
        group_dict[group_name] = filter_by_name(
            data_object,
            list(runname_sublist))  # prevent the list from being changed

    data_set = []
    labels_set = []
    for eachGroup in group_names:
        current_condition_data = filter_by_missing_values(
            group_dict[eachGroup],
            analysis_program="FragPipe")
        # the accessions identified in this group
        data_set.append(
            set(current_condition_data['abundances']['Accession'].unique()))
        labels_set.append(eachGroup)

    return venn_to_plotly(
        data_set,
        labels_set,
        plot_options=plot_options,
        username=username)


def venn_to_plotly(L_sets,
                   L_labels=None,
                   plot_options=None,
                   username=None):
    """_Creates a Venn diagram from a list of
    sets and returns a plotly figure_

    The layout (circle centers, radii, label positions) is computed by
    matplotlib_venn and then redrawn with plotly shapes and annotations.

    Args:
        L_sets (list): two or three sets.
        L_labels (list, optional): one label per set; ignored when its
            length does not match ``L_sets``. Defaults to None.
        plot_options (dict): reads the keys "color", "opacity" and "title".
        username (str, optional): prefix of the exported file name.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)``. ``CSV_link`` is always None;
        ``SVG_link`` stays None unless WRITE_OUTPUT is set.

    Raises:
        ValueError: if ``L_sets`` does not hold exactly two or three sets.
    """
    CSV_link = None
    SVG_link = None

    # get number of sets
    n_sets = len(L_sets)
    if n_sets not in (2, 3):
        raise ValueError(
            f"Venn diagram needs 2 or 3 sets, got {n_sets}")

    # choose and create matplotlib venn diagram
    venn_func = venn2 if n_sets == 2 else venn3
    if L_labels and len(L_labels) == n_sets:
        v = venn_func(L_sets, L_labels)
    else:
        v = venn_func(L_sets)
    # suppress output of the matplotlib venn diagram
    plt.close()

    # Lists holding the plotly shapes and annotations
    L_shapes = []
    L_annotation = []

    # Define color list for sets
    L_color = plot_options["color"]

    # min and max values of the set shapes, used for the axis ranges
    L_x_max = []
    L_y_max = []
    L_x_min = []
    L_y_min = []

    for i in range(n_sets):
        x0 = v.centers[i][0] - v.radii[i]
        y0 = v.centers[i][1] - v.radii[i]
        x1 = v.centers[i][0] + v.radii[i]
        y1 = v.centers[i][1] + v.radii[i]

        # create circle shape for current set
        L_shapes.append(go.layout.Shape(
            type="circle",
            xref="x",
            yref="y",
            x0=x0,
            y0=y0,
            x1=x1,
            y1=y1,
            fillcolor=L_color[i],
            line_color=L_color[i],
            opacity=plot_options["opacity"]
        ))

        # Record the extent for every circle, independent of whether a
        # label exists, so the axis ranges can always be computed.
        L_x_min.append(x0)
        L_x_max.append(x1)
        L_y_min.append(y0)
        L_y_max.append(y1)

        # create set label for current set (skipped when there is none)
        set_label = v.set_labels[i] if v.set_labels is not None else None
        if set_label is None:
            continue
        label_x, label_y = set_label.get_position()[:2]
        L_annotation.append(go.layout.Annotation(
            xref="x",
            yref="y",
            x=label_x,
            y=label_y,
            text=set_label.get_text(),
            showarrow=False
        ))

    # create subset labels (number of elements of each subset); entries
    # without a label object are skipped
    for subset_label in v.subset_labels:
        if subset_label is None:
            continue
        label_x, label_y = subset_label.get_position()[:2]
        L_annotation.append(go.layout.Annotation(
            xref="x",
            yref="y",
            x=label_x,
            y=label_y,
            text=subset_label.get_text(),
            showarrow=False
        ))

    # define off_set for the figure range
    off_set = 0.2

    # get min and max for x and y dimension to set the figure range
    x_max = max(L_x_max) + off_set
    x_min = min(L_x_min) - off_set
    y_max = max(L_y_max) + off_set
    y_min = min(L_y_min) - off_set

    # create plotly figure
    fig = go.Figure()

    # set xaxes range and hide ticks and ticklabels
    fig.update_xaxes(
        range=[x_min, x_max],
        showticklabels=False,
        ticklen=0
    )

    # set yaxes range and hide ticks and ticklabels; the y axis is anchored
    # to x with ratio 1 so the circles are not distorted
    fig.update_yaxes(
        range=[y_min, y_max],
        scaleanchor="x",
        scaleratio=1,
        showticklabels=False,
        ticklen=0
    )

    # set figure properties and add shapes and annotations
    fig.update_layout(
        plot_bgcolor='white',
        margin=dict(b=0, l=10, pad=0, r=10, t=40),
        width=800,
        height=400,
        shapes=L_shapes,
        annotations=L_annotation,
        title=dict(text=plot_options["title"], x=0.5, xanchor='center')
    )
    if WRITE_OUTPUT:
        # SVG file link
        SVG_link = f"/files/{url_base}/images/" \
            f"{username}_ID_venns_Plot.svg"

        # create the file for download
        img_dir = os.path.join(APPFOLDER, "images/")
        Path(img_dir).mkdir(parents=True, exist_ok=True)

        fig.write_image(os.path.join(
            img_dir, f"{username}_ID_venns_Plot.svg"))

    return fig, CSV_link, SVG_link


In [29]:
# ###Volcano plots####
def volcano_plots(data_object,  plot_options, saved_settings, username=None):
    """_Prepare data for creating intensity volcano plots (two groups)_

    Args:
        data_object: data passed through to ``filter_by_name``.
        plot_options (dict): "compare groups" is a list of 0-based group
            indices; when it is missing, empty or not a list the first two
            groups are compared. The whole dict is forwarded to
            ``plot_volcano_colored``.
        saved_settings (dict): ``group<N>_name`` strings and the matching
            ``group<N>_record`` lists of ``{"option": run_name}`` dicts.
        username (str, optional): forwarded to ``plot_volcano_colored``.
            Defaults to None.

    Returns:
        tuple or None: ``(fig, CSV_link, SVG_link)`` from
        ``plot_volcano_colored``; None when either abundance table has
        three or fewer columns (Accession included) or when both selected
        groups carry the same name.

    Raises:
        ValueError: if fewer than two named groups are selected.
    """
    compare_groups = plot_options.get("compare groups")
    if not compare_groups or not isinstance(compare_groups, list):
        # no compare groups is provided, compare first two
        selected = [0, 1]
    else:
        # groups are taken in ascending index order
        selected = [n for n in range(len(GROUP_NAME))
                    if n in compare_groups]

    # filter runs into different groups. Name and record are resolved from
    # the same group index, so an unnamed (skipped) group or an unsorted
    # "compare groups" list cannot pair a name with another group's runs.
    group_names = []
    group_dict = {}
    for n in selected:
        group_name = saved_settings[GROUP_NAME[n]]
        if not group_name:
            continue
        runname_sublist = [d.get('option')
                           for d in saved_settings[f"group{n + 1}_record"]]
        group_names.append(group_name)
        group_dict[group_name] = filter_by_name(
            data_object,
            list(runname_sublist))  # prevent the list from being changed

    if len(group_names) < 2:
        raise ValueError(
            "volcano plot needs two named groups, "
            f"got {len(group_names)}")

    # create a dictionary to store the intensity data
    Intensity_dict = {}
    for eachGroup in group_names:
        current_condition_data = filter_by_missing_values(
            group_dict[eachGroup],
            analysis_program="FragPipe")
        Intensity_dict[eachGroup] = NormalizeToMedian(
            current_condition_data, analysis_program="FragPipe")

    group1 = group_names[0]
    group2 = group_names[1]
    group1_abundances = Intensity_dict[group1]['abundances']
    group2_abundances = Intensity_dict[group2]['abundances']

    # shape[1] counts the Accession column as well
    if not (group1_abundances.shape[1] > 3 and
            group2_abundances.shape[1] > 3 and
            group2 != group1):
        return None

    def _summarize(abundances, prefix):
        """Per-protein mean, stdev and non-null count of the run columns."""
        values = abundances.drop(columns=['Accession'])
        stats = {
            f"{prefix}_Intensity": values.mean(axis=1),
            f"{prefix}_stdev": values.std(axis=1),
            f"{prefix}_num": values.notna().sum(axis=1),
        }
        return abundances.assign(**stats).loc[
            :, list(stats) + ['Accession']]

    # group1Data example:
    #     group1_Intensity  group1_stdev  group1_num   Accession
    # 0       2.824766e+05  1.708060e+05          15  A0A0B4J2D5
    # 1       2.650998e+06  6.259645e+05          15      A2RUR9
    group1Data = _summarize(group1_abundances, "group1")
    group2Data = _summarize(group2_abundances, "group2")

    # only proteins found in both groups are kept
    volcanoData = group2Data.merge(group1Data, on='Accession', how='inner')

    # ratio between the two group medians (over the common proteins),
    # used to bring group 1 onto the scale of group 2
    ratio = volcanoData['group2_Intensity'].median() / \
        volcanoData['group1_Intensity'].median()
    volcanoData = volcanoData.assign(
        group1_Intensity=volcanoData['group1_Intensity'] * ratio)

    # pOriginal: raw p-values from the summary statistics;
    # p: Benjamini-Hochberg adjusted p-values
    volcanoData = volcanoData.assign(
        pOriginal=t_test_from_summary_stats(
            m1=volcanoData['group2_Intensity'],
            m2=volcanoData['group1_Intensity'],
            s1=volcanoData['group2_stdev'],
            s2=volcanoData['group1_stdev'],
            n1=volcanoData['group2_num'],
            n2=volcanoData['group1_num']))
    volcanoData = volcanoData.assign(
        p=multipletests(volcanoData["pOriginal"], method='fdr_bh')[1])

    log2_fold_change = np.log2(
        volcanoData['group2_Intensity'] / volcanoData['group1_Intensity'])
    significant = (log2_fold_change.abs() > 1) & (volcanoData['p'] < 0.05)
    # add significant, upRegulated, downRegulated, and notRegulated columns
    volcanoData = volcanoData.assign(
        significant=significant,
        upRegulated=(log2_fold_change > 1) & significant,
        downRegulated=(log2_fold_change < -1) & significant,
        notRegulated=(log2_fold_change.abs() <= 1) & ~significant)

    return plot_volcano_colored(
        volcanoData,
        label=f"({group2}/{group1})",
        plot_options=plot_options,
        username=username,
    )


def plot_volcano_colored(allData,
                         label,
                         plot_options=None,
                         username=None,):
    """_Draw the volcano plot: log2 fold change against -log10(p)._

    All proteins are drawn first; down- and up-regulated proteins are drawn
    on top in their own colors. Guide lines mark p = 0.05 and a log2 fold
    change of -1 and 1.

    Args:
        allData (pandas.DataFrame): reads the columns "group1_Intensity",
            "group2_Intensity", "p", "Accession", "upRegulated" and
            "downRegulated".
        label (str): appended to the plot title.
        plot_options (dict): reads the keys "width", "height", "all color",
            "down color", "up color", "title", "xlimits", "ylimits", "font",
            "X Title" and "Y Title".
        username (str, optional): prefix of the exported file names.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)``. Both links stay None unless
        WRITE_OUTPUT is set.
    """
    CSV_link = None
    SVG_link = None
    total_labels = []
    left = "group1_Intensity"
    right = "group2_Intensity"
    downData = allData[allData['downRegulated'].eq(True)]
    upData = allData[allData['upRegulated'].eq(True)]

    fig = px.scatter(
        width=plot_options["width"],
        height=plot_options["height"],)

    # later layers are drawn on top of earlier ones
    layers = (
        (allData, "all color"),
        (downData, "down color"),
        (upData, "up color"),
    )
    for frame, color_key in layers:
        if frame.shape[0] == 0:
            continue
        fig.add_scatter(x=np.log2(frame[right]/frame[left]),
                        y=-np.log10(frame["p"]),
                        text=frame["Accession"],
                        mode="markers",
                        marker=dict(color=plot_options[color_key]))

    # Apply the hover template to every trace, not only when up-regulated
    # proteins exist.
    fig.update_traces(
        mode="markers",
        hovertemplate="%{text}<br>x=: %{x}"
        " <br>y=: %{y}")
    fig.add_hline(y=-np.log10(0.05))
    fig.add_vline(x=-1)
    fig.add_vline(x=1)

    # An empty or missing title means "no title" (and avoids None + str).
    if plot_options["title"]:
        plot_title = plot_options["title"] + " " + label
    else:
        plot_title = None

    # only a non-empty list is accepted as axis range
    if not plot_options["xlimits"] or plot_options["xlimits"] == "[]" or \
            not isinstance(plot_options["xlimits"], list):
        xlimits = None
    else:
        xlimits = plot_options["xlimits"]

    if not plot_options["ylimits"] or plot_options["ylimits"] == "[]" or \
            not isinstance(plot_options["ylimits"], list):
        ylimits = None
    else:
        ylimits = plot_options["ylimits"]

    fig.update_layout(
        font=plot_options["font"],
        showlegend=False,
        title=plot_title,
        xaxis=dict(title=dict(
            text=plot_options["X Title"]), range=xlimits),
        yaxis=dict(title=dict(
            text=plot_options["Y Title"]), range=ylimits),
        annotations=total_labels
    )

    if WRITE_OUTPUT:
        # create the SVG file for download
        img_dir = os.path.join(APPFOLDER, "images/")
        Path(img_dir).mkdir(parents=True, exist_ok=True)
        fig.write_image(os.path.join(
            img_dir, f"{username}_abundance_volcano_Plot.svg"))

        # create the download CSV and its link
        data_dir = os.path.join(APPFOLDER, "csv/")
        Path(data_dir).mkdir(parents=True, exist_ok=True)
        allData.to_csv(os.path.join(
            data_dir, f"{username}_up_down_regulated_volcano.csv"),
            index=False)
        CSV_link = f"/files/{url_base}/csv/" \
            f"{username}_up_down_regulated_volcano.csv"

        # download SVG link
        SVG_link = f"/files/{url_base}/images/" \
            f"{username}_abundance_volcano_Plot.svg"
    return fig, CSV_link, SVG_link


In [22]:
# ###PCA plots####
def PCA_plots(data_object, plot_options, saved_settings, username=None):
    """_Prepare data for creating intensity PCA plots (two groups)_

    Args:
        data_object: data passed through to ``filter_by_name``.
        plot_options (dict): "compare groups" is a list of 0-based group
            indices; when it is missing, empty or not a list the first two
            groups are compared. The whole dict is forwarded to
            ``plot_PCA_plotly``.
        saved_settings (dict): ``group<N>_name`` strings and the matching
            ``group<N>_record`` lists of ``{"option": run_name}`` dicts.
        username (str, optional): forwarded to ``plot_PCA_plotly``.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)`` as returned by
        ``plot_PCA_plotly``.

    Raises:
        ValueError: if fewer than two named groups are selected.
    """
    compare_groups = plot_options.get("compare groups")
    if not compare_groups or not isinstance(compare_groups, list):
        # no compare groups is provided, compare first two
        selected = [0, 1]
    else:
        # groups are taken in ascending index order
        selected = [n for n in range(len(GROUP_NAME))
                    if n in compare_groups]

    # filter runs into different groups. Name and record are resolved from
    # the same group index, so an unnamed (skipped) group or an unsorted
    # "compare groups" list cannot pair a name with another group's runs.
    group_names = []
    group_dict = {}
    runname_list = []  # list of run names for each group, same order
    for n in selected:
        group_name = saved_settings[GROUP_NAME[n]]
        if not group_name:
            continue
        runname_sublist = [d.get('option')
                           for d in saved_settings[f"group{n + 1}_record"]]
        group_names.append(group_name)
        group_dict[group_name] = filter_by_name(
            data_object,
            list(runname_sublist))  # prevent the list from being changed
        runname_list.append(runname_sublist)

    if len(group_names) < 2:
        raise ValueError(
            f"PCA plot needs two named groups, got {len(group_names)}")

    # create a dictionary to store the intensity data
    Intensity_dict = {}
    for eachGroup in group_names:
        current_condition_data = filter_by_missing_values(
            group_dict[eachGroup],
            analysis_program="FragPipe")
        Intensity_dict[eachGroup] = NormalizeToMedian(
            current_condition_data, analysis_program="FragPipe")

    group1 = group_names[0]
    group2 = group_names[1]
    group1_data = Intensity_dict[group1]
    group2_data = Intensity_dict[group2]

    def _sample_info(group_data, group_label):
        """One row per abundance column: sample name and its group."""
        return pd.DataFrame({
            "Sample_Groups": group_data["abundances"].drop(
                "Accession", axis=1).columns.str.replace(" Intensity", ""),
            "Type": group_label})

    # sample annotation for both groups with a fresh 0..n-1 index
    combined_infodata = pd.concat(
        [_sample_info(group1_data, group1),
         _sample_info(group2_data, group2)],
        ignore_index=True)

    combined_pcaData = CombineSharedInformation(group1_data, group2_data)

    # row-wise (axis=1) median of the 1st group's runs divided by the
    # row-wise median of the 2nd group's runs gives the scaling factor
    # magicNum, which is applied to every run of the 2nd group
    magicNum = np.nanmedian(
        combined_pcaData["abundances"][runname_list[0]], axis=1) / \
        np.nanmedian(
            combined_pcaData["abundances"][runname_list[1]], axis=1)

    for col in combined_pcaData["abundances"][runname_list[1]].columns:
        combined_pcaData["abundances"][col] = combined_pcaData[
            "abundances"][col]*magicNum

    # fill in missing values (see impute_knn)
    combined_pcaData = impute_knn(combined_pcaData)

    # perform PCA transform
    combined_pcaData, exp_var_pca = CalculatePCAlog2(combined_pcaData,
                                                     combined_infodata)

    return plot_PCA_plotly(combined_pcaData,
                           exp_var_pca,
                           plot_options=plot_options,
                           username=username,
                           )


def plot_PCA_plotly(pca_panda,
                    exp_var_pca,
                    plot_options=None,
                    username=None,):
    """_Scatter plot of PC1 against PC2, colored by sample type._

    Args:
        pca_panda (pandas.DataFrame): reads the columns "PC1", "PC2",
            "Type" and "Sample_Groups".
        exp_var_pca: explained variance ratios; entries 0 and 1 are shown
            as percentages in the axis labels.
        plot_options (dict): reads the keys "xlimits", "ylimits", "color",
            "symbol", "width", "height", "marker_size", "font" and "title".
        username (str, optional): prefix of the exported file names.
            Defaults to None.

    Returns:
        tuple: ``(fig, CSV_link, SVG_link)``. Both links stay None unless
        WRITE_OUTPUT is set.
    """
    def _axis_range(option_key):
        """Return the option when it is a non-empty list, else None."""
        if not plot_options[option_key] or \
                plot_options[option_key] == "[]" or \
                not isinstance(plot_options[option_key], list):
            return None
        return plot_options[option_key]

    x_range = _axis_range("xlimits")
    y_range = _axis_range("ylimits")

    scatter_kwargs = dict(
        x='PC1',
        y='PC2',
        color="Type",
        text="Sample_Groups",
        symbol="Type",
        color_discrete_sequence=plot_options["color"],
        symbol_sequence=plot_options["symbol"],
        size_max=30,
        labels={'PC1': f'PC1 ({round(exp_var_pca[0]*100,2)}%)',
                'PC2': f'PC2 ({round(exp_var_pca[1]*100,2)}%)',
                'Type': 'Sample Type'},
        title='PCA Plot',
        width=plot_options["width"],
        height=plot_options["height"],
    )
    fig = px.scatter(pca_panda, **scatter_kwargs)

    marker_style = dict(size=plot_options["marker_size"],)
    fig.update_traces(
        mode="markers",
        marker=marker_style,
        hovertemplate="%{text}<br>PC1: %{x} <br>PC2: %{y}")

    # both axes: black frame line mirrored to the opposite side, no ticks
    x_axis = dict(linecolor='black',
                  showticklabels=False, mirror=True, range=x_range)
    y_axis = dict(linecolor='black',
                  showticklabels=False, mirror=True, range=y_range)
    fig.update_layout(
        font=plot_options["font"],
        title=plot_options["title"],
        xaxis=x_axis,
        yaxis=y_axis,
    )

    if not WRITE_OUTPUT:
        return fig, None, None

    # write the SVG file for download
    svg_folder = os.path.join(APPFOLDER, "images/")
    if not os.path.exists(svg_folder):
        Path(svg_folder).mkdir(parents=True)
    fig.write_image(os.path.join(svg_folder, f"{username}_PCA_Plot.svg"))

    # write the CSV file for download
    csv_folder = os.path.join(APPFOLDER, "csv/")
    if not os.path.exists(csv_folder):
        Path(csv_folder).mkdir(parents=True)
    pca_panda.to_csv(
        os.path.join(csv_folder, f"{username}_PCA.csv"), index=False)

    # links to the two exported files
    CSV_link = f"/files/{url_base}/csv/{username}_PCA.csv"
    SVG_link = f"/files/{url_base}/images/{username}_PCA_Plot.svg"
    return fig, CSV_link, SVG_link

In [23]:
############################## All function and constant definitions are above this cell ##############################

In [24]:
############################## All cells below are for configuration and plotting ##############################

In [25]:
# Process queue ID used to read the results of a data-manager process task.
process_queue_id = 7272

# Input files and program for customized tasks (left empty when the
# process queue ID is used).
input_file_1 = ""
input_file_2 = ""
process_program = ""


In [26]:
# Read the data for the process queue ID configured above.
data_obj = read_file(process_queue_id)
# Alternative call if the user supplied result files:
# data_obj = read_file(process_queue_id,input_file_1,input_file_2)


In [27]:
# Define the group names and assign runs to the groups. The web app does
# this through its GUI; the structure built here matches that output.
saved_settings = {
    "group1_name": "72_T",
    "group2_name": "72_V",
    "group3_name": "24_T",
    "group4_name": "24_V",
    "group5_name": "10ng",
    "group6_name": "",
    "group1_record": [],
    "group2_record": [],
    "group3_record": [],
    "group4_record": [],
    "group5_record": [],
    "group6_record": [],
}

# A run joins a group when its name contains the group's filter_in text ...
filter_in = {
    "group1_record": "72_T",
    "group2_record": "72_V",
    "group3_record": "24_T",
    "group4_record": "24_V",
    "group5_record": "10ng",
    "group6_record": "",
}

# ... and does not contain the group's filter_out text (empty = no exclusion).
filter_out = {
    "group1_record": "",
    "group2_record": "",
    "group3_record": "",
    "group4_record": "",
    "group5_record": "",
    "group6_record": "",
}

for name_key, record_key in zip(GROUP_NAME, Group_LIST_NAME):
    # both the group name and its include filter must be set
    if saved_settings[name_key] == "" or filter_in[record_key] == "":
        continue
    saved_settings[record_key].extend(
        {"option": run_name}
        for run_name in data_obj["run_name"]["run name"].tolist()
        if filter_in[record_key] in run_name
        and (filter_out[record_key] == ""
             or filter_out[record_key] not in run_name)
    )


In [None]:
# ID plot (protein and peptides)
plot_options = {
    "mean label": "True",
    "error bar": "stdev",
    "X Title": "Conditions",
    "Y Title": "Protein Identification",
    "color": ["blue", "red", "black", "yellow", "green", "purple",
              "orange", "brown", "pink", "gray", "olive", "cyan"],
    "width": 700,
    "height": 450,
    "font": dict(size=16, family="Arial black"),
    "ID mode": "total",
    # the help fragments are separated by spaces so the joined text
    # stays readable
    "help for information only": (
        "Mean label options: True or False. "
        "error bar options: stdev or ci95. "
        "color: the first few colors will be used. "
        "ID mode options: total, MS2, stacked."
    ),
}
plot_options["plot_type"] = "1"  # 1 is protein, 2 is peptide
figure, _, _ = ID_plots(data_obj, plot_options, saved_settings)
figure.show()
# figure.write_image("images/test.svg")


In [None]:
# CV violin plot
plot_options = {
    "mean label": "True",
    "box": "True",
    "X Title": "Conditions",
    "Y Title": "CV of Abundance (%)",
    "color": ["blue", "red", "black", "yellow", "green", "purple",
              "orange", "brown", "pink", "gray", "olive", "cyan"],
    "width": 700,
    "height": 450,
    "font": dict(size=16, family="Arial black"),
    "violinmode": "overlay",
    "ylimits": [0, 100],
    # the help fragments are separated by spaces so the joined text
    # stays readable
    "help for information only": (
        "Mean label options: True or False. "
        "color: the first few colors will be used. "
        "violinmode: group or overlay."
    ),
}
figure, _, _ = CV_plots(data_obj, plot_options, saved_settings)
figure.show()
# figure.write_image("images/test.svg")


In [None]:
# ID Venn plot: compares the groups listed (0-based) in "compare groups"
plot_options = {
    "compare groups": [0, 2, 4],
    "title": "Venn Diagram",
    "opacity": 0.75,
    "color": [
        "#00FF00", "#FFFF00", "#FF0000", "yellow", "red",
        "green", "purple", "orange", "brown", "pink",
        "gray", "olive", "cyan", "blue", "black",
    ],
    "help for information only":
        "color: the first few colors will be used",
}
figure, _, _ = venns_plots(data_obj, plot_options, saved_settings)
figure.show()
# figure.write_image("images/test.svg")



In [None]:
# Abundance PCA plot
plot_options = {
    "compare groups": [2, 3],
    "title": "PCA Analysis",
    "color": ["blue", "red", "black", "yellow", "green", "purple",
              "orange", "brown", "pink", "gray", "olive", "cyan"],
    "symbol": ['star', 'circle'],
    "marker_size": 8,
    "width": 700,
    "height": 450,
    "font": dict(size=16, family="Arial black"),
    "ylimits": [],
    "xlimits": [],
    # the plotting code only honours lists for the limits, so the help
    # text no longer mentions tuples
    "help for information only": (
        "compare groups: list of two group numbers (starting at 0), "
        "default [0,1]. "
        "color: the first few colors will be used. "
        "xlimits/ylimits: list of two numbers, e.g. [0,10]."
    ),
}
figure, _, _ = PCA_plots(data_obj, plot_options, saved_settings)
figure.show()
# figure.write_image("images/test.svg")

In [31]:
# Abundance Volcano plot
plot_options = {
    "compare groups": [0, 2],
    "title": "Volcano Plot",
    "X Title": "Log2 Fold Change",
    "Y Title": "-Log10(p-value)",
    "up color": "green",
    "down color": "red",
    "all color": "blue",
    "width": 700,
    "height": 450,
    "font": dict(size=16, family="Arial black"),
    "ylimits": [],
    "xlimits": [],
    # the plotting code only honours lists for the limits, so the help
    # text no longer mentions tuples
    "help for information only": (
        "compare groups: list of two group numbers (starting at 0), "
        "default [0,1], means compare the second group against "
        "the first group. "
        "xlimits/ylimits: list of two numbers, e.g. [0,10]."
    ),
}
figure, _, _ = volcano_plots(data_obj, plot_options, saved_settings)
figure.show()
# figure.write_image("images/test.svg")