# Transformation     # 1

## Reduce two spreadsheets with overlapping samples to two spreadsheets that contain only the shared samples.
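* Both spreadsheets are Samples x Phenotypes (SxP); only the sample names (rows) are matched, so the phenotype columns need not be the same (they differ in the example below).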
* They must have some sample names in common.

In [1]:
import os
import numpy as np
import pandas as pd
import knpackage.toolbox as kn



#### Take two dataframes, find the sample names they share, and return both dataframes restricted to those shared samples.

In [2]:
def get_common_samples_data_frames(sxp_1_df, sxp_2_df):
    """ Restrict two samples x phenotypes dataframes to the samples they share.

    Sample names are read from each dataframe's row index. The shared names
    are listed in the order they first appear in sxp_1_df, and that same list
    selects the rows of both outputs, so the result is deterministic and the
    two returned dataframes are aligned row for row.

    Args:
        sxp_1_df:      samples x phenotypes dataframe (sxp_1_df = kn.get_spreadsheet_df(sxp_filename_1))
        sxp_2_df:      samples x phenotypes dataframe
    Returns:
        sxp_1_trim_df: rows of sxp_1_df whose sample names occur in both input dataframes
        sxp_2_trim_df: rows of sxp_2_df whose sample names occur in both input dataframes

    If the inputs share no sample names, both returned dataframes are empty.
    """
    sxp_1_sample_names = sxp_1_df.index
    # Boolean mask over the first index: True where the name also occurs in the second.
    is_shared = sxp_1_sample_names.isin(sxp_2_df.index)
    # unique() keeps first-appearance order and lists each shared name once,
    # so a repeated label is not selected more than once by .loc below.
    common_samples = sxp_1_sample_names[is_shared].unique()

    return sxp_1_df.loc[common_samples], sxp_2_df.loc[common_samples]

##### Load the test data: two samples x phenotypes dataframes with overlapping sample names (split from a single samples x phenotypes dataframe).

In [3]:
data_source_directory = '../data/transform_data'

# Read both input spreadsheets from the data directory, first then second.
# (Alternative first input kept from the original cell: 'gene_samples_1.tsv'.)
spreadsheet_1_df, spreadsheet_2_df = (
    kn.get_spreadsheet_df(os.path.join(data_source_directory, spreadsheet_file_name))
    for spreadsheet_file_name in ('spreadsheet_One.txt', 'spreadsheet_Two.txt')
)

In [4]:
# Trim both loaded spreadsheets to the sample names they have in common.
df_1, df_2 = get_common_samples_data_frames(
    sxp_1_df=spreadsheet_1_df,
    sxp_2_df=spreadsheet_2_df,
)
# Show the first trimmed dataframe.
df_1

Unnamed: 0_level_0,days_to_death,days_to_last,days_survival,diag_age,race,ethnicity,gender
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TCGA-A5-A0GA,543.0,,543,67,white,not hispanic or latino,female
TCGA-A5-A0GU,,495.0,495,58,white,,female
TCGA-A5-A0GV,,485.0,485,67,white,not hispanic or latino,female
TCGA-A5-A0GB,,275.0,275,65,white,,female
TCGA-A5-A0GE,,2717.0,2717,38,asian,not hispanic or latino,female
TCGA-A5-A0R7,,40.0,40,55,white,not hispanic or latino,female
TCGA-A5-A0GJ,,1447.0,1447,44,white,not hispanic or latino,female
TCGA-A5-A0G5,,790.0,790,73,black or african american,,female
TCGA-A5-A0GN,,1477.0,1477,65,white,not hispanic or latino,female
TCGA-A5-A0GG,,2516.0,2516,76,black or african american,not hispanic or latino,female


In [5]:
# Show the second trimmed dataframe; its rows were selected with the same shared sample names as df_1.
df_2

Unnamed: 0_level_0,ICDO3site,stage,stage_simple,stage_ismeta,grade,grade_simple,residual
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TCGA-A5-A0GA,c54.1,stage iiic2,iii,0,grade 3,High Grade,r0
TCGA-A5-A0GU,c54.1,stage ia,i,0,grade 1,"Low grade (1,2)",r0
TCGA-A5-A0GV,c54.1,stage ia,i,0,grade 1,"Low grade (1,2)",r0
TCGA-A5-A0GB,c54.1,stage ib,i,0,grade 3,High Grade,r0
TCGA-A5-A0GE,c54.1,stage ia,i,0,grade 2,"Low grade (1,2)",r0
TCGA-A5-A0R7,c54.1,stage ia,i,0,grade 2,"Low grade (1,2)",r0
TCGA-A5-A0GJ,c54.1,stage ia,i,0,grade 2,"Low grade (1,2)",r0
TCGA-A5-A0G5,c54.1,stage ib,i,0,grade 3,High Grade,r0
TCGA-A5-A0GN,c54.1,stage ib,i,0,grade 1,"Low grade (1,2)",r0
TCGA-A5-A0GG,c54.1,stage ia,i,0,grade 1,"Low grade (1,2)",r0
