In [152]:
# Libraries
import pandas as pd
import re
import numpy as np

In [153]:
# Load death statistic data
###########################

# Read every line of the clinical table into memory (line breaks are kept).
with open("./Pawitan_Death_Metrics.sec", "r") as file:
    content = list(file)

# The file is sliced purely by line position:
#   lines 0-7 -> column descriptions plus the table-begin marker
#   line  8   -> whitespace-separated column names
#   lines 9.. -> data rows; the final line of the file is left out
COLUMN_DESCRIBTION, COLUMN_NAMES, DATA_ROWS = (
    content[:8],
    content[8].split(),
    content[9:-1],
)

# Preview what was parsed, one object per output line.
print(COLUMN_DESCRIBTION, COLUMN_NAMES, DATA_ROWS[:5], sep="\n")

['#RELAPSE = breast cancer relapse (1 = relapse, 0 = no relapse or censored)\n', '#SURV_RELAPSE = time until relapse or censoring (in years)\n', '#DEATH = death from any reason (1 = dead, 0 = alive or censored)\n', '#DEATH_BC = death due to breast cancer (1 = dead from breast cancer, 0 = alive or censored)\n', '#SURV_DEATH = time until death or censoring (in years)\n', '#SUBTYPE = tumor subclasses as decribed in PMID 11553815 (No Subtype = sample could not be classified)\n', '#ELSTON = Elston tumor grade (1-3)\n', '!series_table_begin = Clinical Data\n']
['ID_REF', 'RELAPSE', 'SURV_RELAPSE', 'DEATH', 'DEATH_BC', 'SURV_DEATH', 'SUBTYPE', 'ELSTON']
['X027JO  1       3.82    1       1       4.14    No Subtype      2\n', 'X350JO  0       8.15    0       0       8.15    Luminal B       3\n', 'X028JA  0       8.36    1       0       2.22    Luminal A       1\n', 'X126AS  0       8.23    0       0       8.23    No Subtype      2\n', 'X005JO  0       8.07    1       0       5.55    Luminal A  

In [154]:
# Prepare raw data

# Cut each row into cells wherever two or more whitespace characters occur in
# a row, then drop the line break (and anything after it) from the last cell.
data_raw = [
    cells[:-1] + [cells[-1].partition('\n')[0]]
    for cells in (re.split(r'\s{2,}', row) for row in DATA_ROWS)
]

# Build the pandas DataFrame, one column per header name
df_stat = pd.DataFrame(data=data_raw, columns=COLUMN_NAMES)

# Non-null counts, first rows, and dtype overview
print(df_stat.count())
print(df_stat.head())
df_stat.info()

ID_REF          159
RELAPSE         159
SURV_RELAPSE    159
DEATH           159
DEATH_BC        159
SURV_DEATH      159
SUBTYPE         159
ELSTON          159
dtype: int64
   ID_REF RELAPSE SURV_RELAPSE DEATH DEATH_BC SURV_DEATH     SUBTYPE ELSTON
0  X027JO       1         3.82     1        1       4.14  No Subtype      2
1  X350JO       0         8.15     0        0       8.15   Luminal B      3
2  X028JA       0         8.36     1        0       2.22   Luminal A      1
3  X126AS       0         8.23     0        0       8.23  No Subtype      2
4  X005JO       0         8.07     1        0       5.55   Luminal A     NA
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID_REF        159 non-null    object
 1   RELAPSE       159 non-null    object
 2   SURV_RELAPSE  159 non-null    object
 3   DEATH         159 non-null    object
 4   DEATH_BC  

In [155]:
# Transform dataframe

# Columns that hold numbers; each one is cast to float below.
typecast_dict = dict.fromkeys(
    ["RELAPSE", "SURV_RELAPSE", "DEATH", "DEATH_BC", "SURV_DEATH", "ELSTON"],
    "float",
)

# Turn the "NA" placeholder strings into np.nan first, then convert the
# numeric columns in the same chain.
df_stat = df_stat.replace("NA", np.nan).astype(typecast_dict)
df_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID_REF        159 non-null    object 
 1   RELAPSE       159 non-null    float64
 2   SURV_RELAPSE  159 non-null    float64
 3   DEATH         159 non-null    float64
 4   DEATH_BC      159 non-null    float64
 5   SURV_DEATH    159 non-null    float64
 6   SUBTYPE       159 non-null    object 
 7   ELSTON        147 non-null    float64
dtypes: float64(6), object(2)
memory usage: 10.1+ KB


In [156]:
# Display statistics
# Summary statistics (count, mean, std, min, quartiles, max) of df_stat;
# rendered as the cell output because it is the cell's last expression.

df_stat.describe()

Unnamed: 0,RELAPSE,SURV_RELAPSE,DEATH,DEATH_BC,SURV_DEATH,ELSTON
count,159.0,159.0,159.0,159.0,159.0,147.0
mean,0.251572,6.195849,0.251572,0.18239,6.403522,2.22449
std,0.435288,2.300886,0.435288,0.387386,1.910139,0.747562
min,0.0,0.23,0.0,0.0,0.18,1.0
25%,0.0,5.625,0.0,0.0,5.7,2.0
50%,0.0,7.07,0.0,0.0,7.05,2.0
75%,0.5,7.93,0.5,0.0,7.84,3.0
max,1.0,8.49,1.0,1.0,8.49,3.0


In [86]:
# Load gene expression data
###########################

# The first line is split into the sample IDs; every following line is split
# into a gene identifier followed by one expression value per sample ID.
with open(file="./Pawitan_Gene_Expression.sec", mode="r") as file:
    content = file.readlines()


ID_REF = content[0].split()
EXPR_COL_NAME = ["Gene-Identifier"]
EXPR_COL_NAME.extend(ID_REF)
print("Count of IDs: " + str(len(ID_REF)))

# Each row consists of [0]: gene identifier, [1:]: expression value per ID.
# Whitespace-only lines are skipped so they do not turn into all-None rows.
EXPR_DATA = [row.split() for row in content[1:] if row.strip()]


# Transform to dataframe

df_expr = pd.DataFrame(EXPR_DATA, columns = EXPR_COL_NAME)

# str.split() yields strings, so without a cast every column would stay
# object dtype. Map "NA" placeholders to np.nan (same convention as used for
# df_stat) and cast the per-sample columns to float; "Gene-Identifier" is not
# part of the cast and stays a string column.
expr_typecast = {sample_id: "float" for sample_id in ID_REF}
df_expr = df_expr.replace(to_replace="NA", value=np.nan).astype(expr_typecast)

df_expr.head()

Count of IDs: 159


Unnamed: 0,Gene-Identifier,X211WA,X076ER,X164MY,X331HA,X011DA,X349LE,X288BE,X296NY,X215HE,...,X184LO,X125WA,X063PE,X052ZA,X010BJ,X244SO,X071AV,X342BL,X336MO,X026NA
0,219097_x_at,7.822177,7.687284,8.039288,7.8985,7.869872,7.555892,7.637794,7.811304,7.748979,...,7.98671,7.752654,7.749288,7.987051,8.302294,7.816722,7.928655,7.669378,7.883066,7.810159
1,218883_s_at,6.420426,6.935824,6.214857,7.856578,7.447776,7.401657,7.251684,7.520047,7.043208,...,7.115581,7.114517,7.296553,7.875749,6.973822,7.249678,8.132056,7.386043,7.299843,6.398404
2,236623_at,5.300527,6.57275,4.393148,5.005639,6.218209,4.427368,4.515451,4.606987,5.600462,...,4.862075,4.464612,4.613973,4.224854,4.077635,6.297783,4.608932,4.315354,6.147342,4.29826
3,227609_at,7.434736,8.475129,7.609434,8.352944,7.87456,8.974308,8.600711,8.714686,8.176217,...,7.731803,8.34264,8.873129,8.44821,8.55879,9.013164,8.316677,9.101791,8.723541,7.460471
4,221521_s_at,6.66811,5.816826,5.20346,7.33915,6.286962,6.827439,6.580303,8.388103,6.284251,...,6.06219,6.204626,5.224326,7.340663,6.532447,6.458143,6.398205,6.897584,7.66176,5.173594


In [91]:
# How to get gene-expression data for a specific ID_REF?
# (Not the last expression of the cell, so its value is not displayed.)
df_expr.loc[:, "X027JO"]

# Together with Gene-Identifier
df_expr.loc[:, ["Gene-Identifier", "X027JO"]]

Unnamed: 0,Gene-Identifier,X027JO
0,219097_x_at,7.820706
1,218883_s_at,7.278875
2,236623_at,4.781702
3,227609_at,8.765088
4,221521_s_at,6.465088
...,...,...
528,209466_x_at,7.623169
529,218074_at,8.235566
530,222993_at,8.516192
531,224634_at,7.771256


In [164]:
# How to get gene-expression data for a specific gene?
# Pick the matching rows and the two columns of interest in one .loc call,
# instead of indexing a column-subset copy with a mask built on the full frame.
df_expr.loc[df_expr["Gene-Identifier"] == '218883_s_at', ["Gene-Identifier", "X027JO"]]

Unnamed: 0,Gene-Identifier,X027JO
1,218883_s_at,7.278875
