In [None]:
##This file contains definitions of functions to be implemented for this project

In [26]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


In [2]:
#data reading

def get_file_path(sub_dir1,sub_dir2,filename):
    """
    Build the path of a file located two directory levels below the current working directory

    Args:
        sub_dir1 (str): Name of the first-level sub-directory
        sub_dir2 (str): Name of the sub-directory inside sub_dir1
        filename (str): Name of the file

    Returns:
        str: The joined path <current working directory>/<sub_dir1>/<sub_dir2>/<filename>
    """
    # The path is only assembled here; nothing checks that the file exists.
    path_parts=(Path.cwd(), sub_dir1, sub_dir2, filename)
    return os.path.join(*path_parts)

def read_data(filename):
    """
    Load a CSV file into a pandas dataframe and print the shape of the loaded table

    Args:
        filename (str): Path of the CSV file to read, for example the path of "human_cachexia.csv"

    Returns:
        pandas.DataFrame: Dataframe containing the data
    """
    loaded_frame=pd.read_csv(filename)
    frame_shape=loaded_frame.shape
    print("data read successfully, the shape of the dataframe is: ", frame_shape)
    return loaded_frame

In [27]:
###test function

# Locate the example dataset relative to the working directory instead of using a
# machine-specific absolute path, so the notebook can also run on other computers.
# NOTE(review): assumes the notebook is started from the folder that contains
# "resources" - confirm.
data_path=get_file_path("resources","test_dataset","human_cachexia.csv")

# data_file holds the loaded dataframe; later cells use it under this name
data_file=read_data(data_path)

data read successfully, the shape of the dataframe is:  (77, 65)


Unnamed: 0,Patient ID,Muscle loss,"1,6-Anhydro-beta-D-glucose",1-Methylnicotinamide,2-Aminobutyrate,2-Hydroxyisobutyrate,2-Oxoglutarate,3-Aminoisobutyrate,3-Hydroxybutyrate,3-Hydroxyisovalerate,...,Tryptophan,Tyrosine,Uracil,Valine,Xylose,cis-Aconitate,myo-Inositol,trans-Aconitate,pi-Methylhistidine,tau-Methylhistidine
0,PIF_178,cachexic,40.85,65.37,18.73,26.05,71.52,1480.30,56.83,10.07,...,259.82,290.03,111.05,86.49,72.24,237.46,135.64,51.94,157.59,160.77
1,PIF_087,cachexic,62.18,340.36,24.29,41.68,67.36,116.75,43.82,79.84,...,83.10,167.34,46.99,109.95,192.48,333.62,376.15,217.02,307.97,130.32
2,PIF_090,cachexic,270.43,64.72,12.18,65.37,23.81,14.30,5.64,23.34,...,82.27,60.34,31.50,59.15,2164.62,330.30,86.49,58.56,145.47,83.93
3,NETL_005_V1,cachexic,154.47,52.98,172.43,74.44,1199.91,555.57,175.91,25.03,...,235.10,323.76,30.57,102.51,125.21,1863.11,247.15,75.94,249.64,254.68
4,PIF_115,cachexic,22.20,73.70,15.64,83.93,33.12,29.67,76.71,69.41,...,103.54,142.59,44.26,160.77,186.79,101.49,749.95,98.49,84.77,79.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,NETCR_019_V2,control,35.16,52.46,13.87,44.26,99.48,208.51,11.25,6.49,...,46.06,45.15,62.18,33.45,62.80,103.54,78.26,18.17,871.31,84.77
73,NETL_012_V1,control,16.95,15.80,10.49,22.42,62.80,10.91,6.96,3.46,...,21.33,21.33,31.19,13.20,14.30,36.23,11.59,12.30,53.52,44.70
74,NETL_012_V2,control,9.39,14.01,5.16,23.57,46.99,13.33,3.35,2.69,...,14.88,15.18,39.65,13.74,21.76,40.85,30.88,8.50,90.02,28.22
75,NETL_003_V1,control,37.71,18.17,26.05,15.03,23.34,33.45,6.05,5.26,...,17.46,29.96,13.46,14.59,36.97,90.92,17.64,12.43,897.85,90.02


In [39]:
##normalization methods

def normalize_by_sum(input_data, axis=0):
    '''
    Normalize the data by dividing each value by a sum of the measurements.
    The first two columns are copied to the output unchanged; every column from
    the third one onwards is normalized.

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data
        axis (int, optional): Direction of the sums. 0 divides each column by the sum of
            that column, 1 divides each row by the sum of that row. Defaults to 0.
            NOTE(review): if every row is one sample, normalizing by the sample total
            corresponds to axis=1 - confirm which direction the analysis needs.

    Returns:
        pandas.DataFrame: Normalized dataframe, the input dataframe is not modified

    Raises:
        ValueError: If axis is neither 0 nor 1
    '''
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (column sums) or 1 (row sums)")

    data_to_process=input_data.iloc[:,2:]
    sums=data_to_process.sum(axis=axis)
    # Sums taken along one axis are labelled by the other axis, so the division
    # has to be aligned on 1-axis.
    normalized_data=data_to_process.div(sums, axis=1-axis)

    # Assemble the result from the untouched leading columns and the normalized block.
    # Writing the floats into a copy through .iloc would depend on implicit dtype
    # upcasting whenever a measurement column holds integers.
    normed_data=pd.concat([input_data.iloc[:,:2], normalized_data], axis=1)

    return normed_data


In [36]:
# Scratch variable: a separate copy of the loaded dataframe
a=data_file.copy()

In [37]:
# Display the type of the copy made in the previous cell
type(a)

pandas.core.frame.DataFrame

In [62]:
# After normalize_by_sum every measurement column should add up to 1.
# The comparison uses a tolerance because floating point sums are rarely
# exactly equal to their mathematical value.
after_norm=normalize_by_sum(data_file).iloc[:,2:].sum(axis=0)
np.allclose(after_norm, 1.0)

True

In [None]:

def normalize_by_median(input_data, axis=0):
    '''
    Normalize the data by dividing each value by a median of the measurements.
    The first two columns are copied to the output unchanged; every column from
    the third one onwards is normalized.

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data
        axis (int, optional): Direction of the medians. 0 divides each column by the median
            of that column, 1 divides each row by the median of that row. Defaults to 0.
            NOTE(review): if every row is one sample, normalizing by the sample median
            corresponds to axis=1 - confirm which direction the analysis needs.

    Returns:
        pandas.DataFrame: Normalized dataframe, the input dataframe is not modified

    Raises:
        ValueError: If axis is neither 0 nor 1
    '''
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (column medians) or 1 (row medians)")

    data_to_process=input_data.iloc[:,2:]
    medians=data_to_process.median(axis=axis)
    # Medians taken along one axis are labelled by the other axis, so the division
    # has to be aligned on 1-axis.
    normalized_data=data_to_process.div(medians, axis=1-axis)

    # Assemble the result from the untouched leading columns and the normalized block.
    # Writing the floats into a copy through .iloc would depend on implicit dtype
    # upcasting whenever a measurement column holds integers.
    normed_data=pd.concat([input_data.iloc[:,:2], normalized_data], axis=1)

    return normed_data


In [None]:

def normalize_by_reference_sample_PQN(input_data):
    '''
    Normalize the data with a probabilistic quotient normalization (PQN) style procedure.
    The first two columns are copied to the output unchanged; every column from
    the third one onwards is processed in three steps:

    1. each column is divided by the median of that column
    2. each row is divided by the geometric mean of that row
    3. each column is divided by the median of that column again

    The returned values are therefore the scaled ratios from step 3, not the
    original measurements.

    PQN can reduce batch effects and other systematic variation, but it assumes that
    most features do not change between samples, which may not be true in all cases.
    Other normalization methods may be more appropriate depending on the research
    question and the experimental design.

    NOTE(review): the reference (geometric mean) is computed per row and the final
    scaling per column. If every row is one sample, PQN is usually described with the
    reference per feature and the scaling per sample - confirm the intended orientation.

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data

    Returns:
        pandas.DataFrame: Normalized dataframe, the input dataframe is not modified

    '''
    data_to_process=input_data.iloc[:,2:]

    # Step 1: bring every column onto a comparable scale
    column_medians=data_to_process.median(axis=0)
    data_norm=data_to_process.div(column_medians, axis=1)

    # Step 2: geometric mean of each row, computed as exp(mean(log(values)))
    row_geometric_means=np.exp(np.log(data_norm).mean(axis=1))
    data_norm=data_norm.div(row_geometric_means, axis=0)

    # Step 3: rescale every column by the median of its ratios
    ratio_medians=data_norm.median(axis=0)
    data_norm=data_norm.div(ratio_medians, axis=1)

    # Assemble the result from the untouched leading columns and the normalized block.
    # Writing the floats into a copy through .iloc would depend on implicit dtype
    # upcasting whenever a measurement column holds integers.
    normed_data=pd.concat([input_data.iloc[:,:2], data_norm], axis=1)

    return normed_data
    

In [None]:
def data_transformation_log(input_data):
    '''
    Transform the data by taking the log10 of each value.
    The first two columns are copied to the output unchanged; every column from
    the third one onwards is transformed.

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data

    Returns:
        pandas.DataFrame: Transformed dataframe, the input dataframe is not modified
    '''

    data_to_process=input_data.iloc[:,2:]
    data_log10=np.log10(data_to_process)

    # Assemble the result from the untouched leading columns and the transformed block.
    # Writing the floats into a copy through .iloc would depend on implicit dtype
    # upcasting whenever a measurement column holds integers.
    transformed_data=pd.concat([input_data.iloc[:,:2], data_log10], axis=1)

    return transformed_data


In [None]:
def data_scaling_mean_centered(input_data):
    '''
    Scale the data by subtracting the mean of each column from each value in the column.
    The first two columns are copied to the output unchanged; every column from
    the third one onwards is centered.

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data

    Returns:
        pandas.DataFrame: Scaled dataframe, the input dataframe is not modified
    '''

    # Only the columns from the third one onwards are centered; taking the mean of
    # the whole frame would include the two leading columns as well.
    data_to_process=input_data.iloc[:,2:]
    data_scaled=data_to_process.subtract(data_to_process.mean(axis=0), axis=1)

    # Assemble the result from the untouched leading columns and the centered block.
    # Writing the floats into a copy through .iloc would depend on implicit dtype
    # upcasting whenever a measurement column holds integers.
    scaled_data=pd.concat([input_data.iloc[:,:2], data_scaled], axis=1)

    return scaled_data


In [None]:
def PCA_analysis(input_data, number_of_components=2):
    '''
    Perform PCA analysis on the data

    Args:
        input_data (pandas.DataFrame): Dataframe containing the data. It is passed to
            PCA as it is, so it presumably has to contain numeric columns only - TODO confirm.
        number_of_components (int, optional): Number of principal components to return. Defaults to 2.

    Returns:
        pandas.DataFrame: Dataframe containing the principal components in the columns
            'PC1', 'PC2', ... (one column per returned component), with a default integer index
    '''

    # Initialize PCA object with desired number of components
    pca = PCA(n_components=number_of_components)

    # Fit the PCA model and project the data onto the principal components
    data_pca = pca.fit_transform(input_data)

    # Name the columns after the components that were actually returned, so that
    # any number_of_components works and not only the default of 2
    component_names = [f'PC{position}' for position in range(1, data_pca.shape[1] + 1)]

    # Create a new DataFrame with the principal components
    df_pca = pd.DataFrame(data=data_pca, columns=component_names)

    return df_pca

In [None]:
def PCA_plot(df_pca):
    '''
    Show a scatter plot of the first principal component against the second one

    Args:
        df_pca: Object that can be indexed with 'PC1' and 'PC2', for example the
            dataframe returned by PCA_analysis

    Returns:
        matplotlib.pyplot: The pyplot module, returned after the figure has been shown
    '''

    first_component=df_pca['PC1']
    second_component=df_pca['PC2']

    # Draw on the current pyplot figure and label both axes
    plt.scatter(first_component, second_component)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.show()

    return plt

In [None]:
######################

In [None]:
## to be implemented
def MA_plot():
    '''
    Placeholder for the MA plot, which is not implemented yet

    Returns:
        None
    '''
    return

In [None]:
##interface for user to select the normalization method and data transformation method
def user_interface():
    '''
    Placeholder for the user interface, which is not implemented yet

    Returns:
        None
    '''
    return