# Transformation #1

## Trim two spreadsheets with overlapping samples into two spreadsheets that contain only the shared samples.
* Samples x Phenotypes (SxP) must have the same phenotypes.
* They must have some sample names in common.

In [1]:
import os
import numpy as np
import pandas as pd
import knpackage.toolbox as kn



#### Get two dataframes, match up the sample names and output two dataframes with those matching sample names.

In [3]:
def get_common_samples_data_frames(sxp_1_df, sxp_2_df):
    """ Trim two samples x phenotypes dataframes down to the sample names they share.

    The shared sample names are taken in the order they first appear in the
    index of sxp_1_df, and both returned dataframes use that same row order,
    so the result is deterministic and the two outputs are row-aligned.

    Args:
        sxp_1_df:      samples x phenotypes dataframe (sxp_1_df = kn.get_spreadsheet_df(sxp_filename_1))
        sxp_2_df:      samples x phenotypes dataframe
    Returns:
        sxp_1_trim_df: samples x phenotypes with only sample names in both input dataframes
        sxp_2_trim_df: samples x phenotypes with only sample names in both input dataframes
    """
    # Build the intersection as an ordered pandas Index instead of a Python set:
    # a set has no defined iteration order, and pandas 2.0+ raises TypeError
    # when a set is used as a .loc indexer.
    sxp_1_sample_names = sxp_1_df.index.unique()
    common_samples = sxp_1_sample_names[sxp_1_sample_names.isin(sxp_2_df.index)]

    return sxp_1_df.loc[common_samples], sxp_2_df.loc[common_samples]

##### Load the test data: two dataframes with overlapping samples, created by splitting a samples x phenotypes dataframe.

In [14]:
# Directory that holds the example spreadsheets read below.
data_source_directory = './transform_data'

# Read both spreadsheets with the same loader call, one file name at a time.
# (Alternative first input that was tried here: 'gene_samples_1.tsv'.)
spreadsheet_1_df, spreadsheet_2_df = (
    kn.get_spreadsheet_df(os.path.join(data_source_directory, spreadsheet_file_name))
    for spreadsheet_file_name in ('spreadsheet_One.tsv', 'spreadsheet_Two.tsv')
)

In [15]:
# Trim both loaded spreadsheets to the sample names they have in common,
# then display the first trimmed dataframe as this cell's output.
df_1, df_2 = get_common_samples_data_frames(spreadsheet_1_df, spreadsheet_2_df)
df_1

Unnamed: 0_level_0,grade,days_to_death,days_to_last,stage_ismeta,histICDO3,stage
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TCGA-A5-A0G1,grade 3,3251.0,,0,8441/3,stage ia
TCGA-A5-A0G2,grade 3,,4053.0,0,8441/3,stage iiib
TCGA-A5-A0GA,grade 3,543.0,,0,8380/3,stage iiic2
TCGA-A5-A0GH,grade 3,,1671.0,0,8380/3,stage ia
TCGA-A5-A0GE,grade 2,,2717.0,0,8380/3,stage ia
TCGA-A5-A0GB,grade 3,,275.0,0,8380/3,stage ib
TCGA-A5-A0G9,grade 3,,2165.0,0,8380/3,stage ib
TCGA-A5-A0GG,grade 1,,2516.0,0,8380/3,stage ia


In [16]:
# Display the second dataframe returned by get_common_samples_data_frames.
df_2

Unnamed: 0_level_0,ICD10_simple,living,ICDO3site,residual,days_survival,stage_simple
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TCGA-A5-A0G1,1.0,deceased,c54.1,r0,3251.0,i
TCGA-A5-A0G2,1.0,living,c54.1,r0,4053.0,iii
TCGA-A5-A0GA,1.0,deceased,c54.1,r0,543.0,iii
TCGA-A5-A0GH,1.0,living,c54.1,r0,1671.0,i
TCGA-A5-A0GE,1.0,living,c54.1,r0,2717.0,i
TCGA-A5-A0GB,1.0,living,c54.1,r0,275.0,i
TCGA-A5-A0G9,1.0,living,c54.1,r0,2165.0,i
TCGA-A5-A0GG,1.0,living,c54.1,r0,2516.0,i
