In [2]:
import pickle
import pandas as pd
import seaborn as sns
from os.path import exists
import matplotlib.pyplot as plt
import numpy as np

# Load Data

In [2]:
series_matrix = pd.read_csv('data/GSE68086_series_matrix.csv')
print(series_matrix.shape)
print(series_matrix.head())

(285, 46)
  !Sample_geo_accession           !Sample_status !Sample_submission_date  \
0          "GSM1662534"  "Public on Oct 30 2015"           "Apr 21 2015"   
1          "GSM1662535"  "Public on Oct 30 2015"           "Apr 21 2015"   
2          "GSM1662536"  "Public on Oct 30 2015"           "Apr 21 2015"   
3          "GSM1662537"  "Public on Oct 30 2015"           "Apr 21 2015"   
4          "GSM1662538"  "Public on Oct 30 2015"           "Apr 21 2015"   

  !Sample_last_update_date !Sample_type !Sample_channel_count  \
0            "May 15 2019"        "SRA"                   "1"   
1            "May 15 2019"        "SRA"                   "1"   
2            "May 15 2019"        "SRA"                   "1"   
3            "May 15 2019"        "SRA"                   "1"   
4            "May 15 2019"        "SRA"                   "1"   

  !Sample_source_name_ch1 !Sample_organism_ch1 !Sample_characteristics_ch1  \
0    "3-Breast-Her2-ampl"       "Homo sapiens"             "tiss

In [3]:
TEP_data = pd.read_csv('data/GSE68086_TEP_data_matrix.csv')
print(TEP_data.shape)
print(TEP_data.head())

(57736, 286)
        Unnamed: 0  3-Breast-Her2-ampl  8-Breast-WT  10-Breast-Her2-ampl  \
0  ENSG00000000003                   0            0                    0   
1  ENSG00000000005                   0            0                    0   
2  ENSG00000000419                  44           14                   16   
3  ENSG00000000457                  26            1                   14   
4  ENSG00000000460                  81           98                   18   

   Breast-100  15-Breast-Her2-ampl  16-Breast-WT  21-Breast-WT  \
0           0                   17             0             0   
1           0                    0             0             0   
2           8                    9             0           139   
3           0                    4            20             1   
4          17                    0            20           144   

   33-Breast-Her2-ampl  42-Breast-Her2-ampl  ...  MGH-NSCLC-L20-TR500  \
0                    0                    0  ...            

# Clean Data

In [4]:
# Keep only the sample source and cancer type columns
series_matrix_source_cancer = series_matrix[['!Sample_source_name_ch1', '!Sample_characteristics_ch1.3']]
# Rename the columns
series_matrix_source_cancer.columns = ['Source', 'Cancer Type']
# Delete the cancer type in str
series_matrix_source_cancer['Cancer Type'] = series_matrix_source_cancer['Cancer Type'].str.replace('cancer type: ', '')
# One hot encode the cancer type
series_matrix_source_cancer = pd.get_dummies(series_matrix_source_cancer, columns=['Cancer Type'])
print(series_matrix_source_cancer.head())

# Write the data to a csv file
series_matrix_source_cancer.to_csv('data/series_matrix_source_cancer.csv', index=False)

                  Source  Cancer Type_"Breast"  Cancer Type_"CRC"  \
0   "3-Breast-Her2-ampl"                  True              False   
1          "8-Breast-WT"                  True              False   
2  "10-Breast-Her2-ampl"                  True              False   
3           "Breast-100"                  True              False   
4  "15-Breast-Her2-ampl"                  True              False   

   Cancer Type_"GBM"  Cancer Type_"HC"  Cancer Type_"Hepatobiliary"  \
0              False             False                        False   
1              False             False                        False   
2              False             False                        False   
3              False             False                        False   
4              False             False                        False   

   Cancer Type_"Lung"  Cancer Type_"Pancreas"  \
0               False                   False   
1               False                   False   
2          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series_matrix_source_cancer['Cancer Type'] = series_matrix_source_cancer['Cancer Type'].str.replace('cancer type: ', '')


# Visualization

In [5]:
patient_data = pd.read_csv('data/series_matrix_source_cancer.csv')
rna_expression = pd.read_csv('data/GSE68086_TEP_data_matrix.csv', index_col=0)[:3]
rna_expression.columns.values[0] = 'gene_ID'
rna_expression_long = rna_expression.melt(id_vars=['gene_ID'], var_name='samples', value_name='FPKM')
print(rna_expression_long.head())

# Get patients that have breast cancer
breast_cancer_patients = patient_data[patient_data['Cancer Type_"Breast"'] == 1]
# Get the rna expression of the breast cancer patients
breast_cancer_rna_expression = rna_expression_long[rna_expression_long['samples'].isin(breast_cancer_patients['Source'])]
# Get the average rna expression of the breast cancer patients
breast_cancer_rna_expression_avg = breast_cancer_rna_expression.groupby('gene_ID').mean()
# Get the top 10 genes
top_10_genes = breast_cancer_rna_expression_avg.sort_values('FPKM', ascending=False).head(10)
print(top_10_genes)


   gene_ID              samples  FPKM
0        0          8-Breast-WT     0
1        0          8-Breast-WT     0
2       44          8-Breast-WT    14
3        0  10-Breast-Her2-ampl     0
4        0  10-Breast-Her2-ampl     0
Empty DataFrame
Columns: [samples, FPKM]
Index: []
