# Transformation #2

## Two spreadsheets into one union-samples, union-phenotypes spreadsheet.
* The two samples x phenotypes input files must not share any phenotype names.

In [1]:
import os
import numpy as np
import pandas as pd
import knpackage.toolbox as kn

#### Get two dataframes, merge into one dataframe with all sample names and all unique phenotype names.
* Reject overlapping names in phenotype input as unresolved ambiguity.

#### Transformation of two samples x phenotypes spreadsheets into one samples x phenotypes spreadsheet

In [2]:
def merge_unique_phenotypes_for_all_samples(spreadsheet_1_df, spreadsheet_2_df):
    """ Merge two samples x phenotypes spreadsheets into one union spreadsheet.

    The result has one row for every sample name found in either input and one
    column for every phenotype found in either input. Cells for which neither
    input supplies a value are left as NaN.

    Args:
        spreadsheet_1_df: samples x phenotypes dataframe
        spreadsheet_2_df: samples x phenotypes dataframe
    Returns:
        union_df:         samples x phenotypes dataframe with combined samples and phenotypes
    Raises:
        ValueError:       if a phenotype (column) name appears in both inputs; there is no
                          way to decide which spreadsheet's values should be kept.
    """
    # Reject overlapping phenotype names up front: a column-wise concat would
    # otherwise silently return a frame with duplicated column labels.
    spreadsheet_2_phenotypes = set(spreadsheet_2_df.columns)
    shared_phenotypes = [name for name in spreadsheet_1_df.columns
                         if name in spreadsheet_2_phenotypes]
    if shared_phenotypes:
        raise ValueError(
            'phenotype names appear in both spreadsheets: %s' % shared_phenotypes)

    # Column-wise outer join on the sample index yields the union of samples
    # (rows) and, since the names are disjoint, the union of phenotypes (columns).
    union_df = pd.concat([spreadsheet_1_df, spreadsheet_2_df], axis=1, join='outer')

    return union_df

In [3]:
# Directory holding the two example input spreadsheets.
data_source_directory = '../data/transform_data'

# Build each file path first, then read the spreadsheets into dataframes.
spreadsheet_1_path = os.path.join(data_source_directory, 'spreadsheet_One.txt')
spreadsheet_2_path = os.path.join(data_source_directory, 'spreadsheet_Two.txt')
spreadsheet_1_df = kn.get_spreadsheet_df(spreadsheet_1_path)
spreadsheet_2_df = kn.get_spreadsheet_df(spreadsheet_2_path)

# Merge both inputs; the bare name on the last line renders the result as the cell output.
combo_df = merge_unique_phenotypes_for_all_samples(
    spreadsheet_1_df,
    spreadsheet_2_df,
)
combo_df

Unnamed: 0,days_to_death,days_to_last,days_survival,diag_age,race,ethnicity,gender,ICDO3site,stage,stage_simple,stage_ismeta,grade,grade_simple,residual
TCGA-A5-A0G1,3251.0,,3251.0,67.0,white,not hispanic or latino,female,c54.1,stage ia,i,0.0,grade 3,High Grade,r0
TCGA-A5-A0G3,,1079.0,1079.0,61.0,black or african american,,female,c54.1,stage iiic2,iii,0.0,grade 3,High Grade,r0
TCGA-A5-A0G5,,790.0,790.0,73.0,black or african american,,female,c54.1,stage ib,i,0.0,grade 3,High Grade,r0
TCGA-A5-A0GA,543.0,,543.0,67.0,white,not hispanic or latino,female,c54.1,stage iiic2,iii,0.0,grade 3,High Grade,r0
TCGA-A5-A0GB,,275.0,275.0,65.0,white,,female,c54.1,stage ib,i,0.0,grade 3,High Grade,r0
TCGA-A5-A0GD,,,,,,,,c54.1,stage ia,i,0.0,grade 2,"Low grade (1,2)",r0
TCGA-A5-A0GE,,2717.0,2717.0,38.0,asian,not hispanic or latino,female,c54.1,stage ia,i,0.0,grade 2,"Low grade (1,2)",r0
TCGA-A5-A0GG,,2516.0,2516.0,76.0,black or african american,not hispanic or latino,female,c54.1,stage ia,i,0.0,grade 1,"Low grade (1,2)",r0
TCGA-A5-A0GH,,,,,,,,c54.1,stage ia,i,0.0,grade 3,High Grade,r0
TCGA-A5-A0GI,,1750.0,1750.0,63.0,white,not hispanic or latino,female,c54.1,stage ia,i,0.0,grade 2,"Low grade (1,2)",r0


In [None]:
# Show the first input spreadsheet (read from 'spreadsheet_One.txt' in an earlier cell).
spreadsheet_1_df

In [None]:
# Show the second input spreadsheet (read from 'spreadsheet_Two.txt' in an earlier cell).
spreadsheet_2_df