# Survival Prediction of Lung Cancer Patients Based on Different Feature Selection Methods Using CNN-Cox Models

## Import the Libraries

In [27]:
import pandas as pd
import numpy as np

## Prepare the dataset

### Clinical data

#### Load the dataset

In [28]:
# Read the tab-separated clinical patient table into a DataFrame.
clinical_data = pd.read_csv('/content/luad_clinical_patient.txt', delimiter='\t')

#### Select the relevant columns for analysis

In [29]:
# Keep the patient ID plus the two survival columns, and give the survival
# columns short names. `.loc[4:]` is label-based: with the default integer
# index it keeps rows labelled 4 and above, i.e. it skips rows 0-3.
# NOTE(review): rows 0-3 are presumably non-patient metadata rows of the
# export — confirm against the raw file.
clinical_data = (
    clinical_data
    .loc[4:, ['Patient Identifier', 'Overall Survival Status', 'Overall Survival (Months)']]
    .rename(columns={
        'Overall Survival Status': 'OS_STATUS',
        'Overall Survival (Months)': 'OS_MONTHS',
    })
)

print(clinical_data.iloc[:10])

   Patient Identifier   OS_STATUS OS_MONTHS
4        TCGA-05-4244    0:LIVING         0
5        TCGA-05-4245    0:LIVING     23.98
6        TCGA-05-4249    0:LIVING     50.03
7        TCGA-05-4250  1:DECEASED      3.98
8        TCGA-05-4382    0:LIVING     19.94
9        TCGA-05-4384    0:LIVING     13.99
10       TCGA-05-4389    0:LIVING     44.97
11       TCGA-05-4390    0:LIVING     36.99
12       TCGA-05-4395  1:DECEASED         0
13       TCGA-05-4396  1:DECEASED      9.95


#### Convert OS_STATUS values into binary format (1 for deceased, 0 for living)

In [30]:
# Encode the survival status as an event indicator: 1 = deceased, 0 = otherwise.
# The raw labels look like '0:LIVING' / '1:DECEASED', so test for the '1:' code
# as a *prefix* (vectorised via the .str accessor) instead of searching for it
# anywhere inside the label with a row-wise apply.
# NOTE(review): every label that does not start with '1:' is encoded as 0,
# including any missing-value placeholder should one occur — confirm that
# treating such rows as "living" is intended.
clinical_data["OS_STATUS"] = clinical_data["OS_STATUS"].str.startswith('1:').astype(int)

print(clinical_data.head(10))

   Patient Identifier  OS_STATUS OS_MONTHS
4        TCGA-05-4244          0         0
5        TCGA-05-4245          0     23.98
6        TCGA-05-4249          0     50.03
7        TCGA-05-4250          1      3.98
8        TCGA-05-4382          0     19.94
9        TCGA-05-4384          0     13.99
10       TCGA-05-4389          0     44.97
11       TCGA-05-4390          0     36.99
12       TCGA-05-4395          1         0
13       TCGA-05-4396          1      9.95


#### Check for unusable values (zero or missing survival times)

In [31]:
# Show how often each event status occurs, followed by the ten most frequent
# OS_MONTHS values (value_counts sorts by frequency, so placeholders and
# repeated values surface at the top).
print(
    clinical_data['OS_STATUS'].value_counts(),
    clinical_data['OS_MONTHS'].value_counts().iloc[:10],
    sep='\n',
)

OS_STATUS
0    334
1    188
Name: count, dtype: int64
OS_MONTHS
[Not Available]    9
25.99              4
0                  4
15.64              3
14.29              3
12.65              3
18.66              3
20.04              3
29.43              3
18.99              3
Name: count, dtype: int64


#### Drop rows where OS_MONTHS is 0 or '[Not Available]'

In [32]:
# Keep only patients with a usable, strictly positive follow-up time.
# OS_MONTHS holds text, so it is parsed to numbers for the comparison only:
#   * placeholders such as '[Not Available]' parse to NaN, and NaN > 0 is
#     False, so those rows are dropped;
#   * a zero follow-up is dropped however it is spelled ('0', '0.0', '0.00');
#   * negative durations, which are not valid survival times, are dropped too.
# The OS_MONTHS column itself is left exactly as loaded (dtype unchanged).
clinical_data = clinical_data[
    pd.to_numeric(clinical_data['OS_MONTHS'], errors='coerce') > 0
]

print(clinical_data.head(10))

   Patient Identifier  OS_STATUS OS_MONTHS
5        TCGA-05-4245          0     23.98
6        TCGA-05-4249          0     50.03
7        TCGA-05-4250          1      3.98
8        TCGA-05-4382          0     19.94
9        TCGA-05-4384          0     13.99
10       TCGA-05-4389          0     44.97
11       TCGA-05-4390          0     36.99
13       TCGA-05-4396          1      9.95
14       TCGA-05-4397          1     24.01
15       TCGA-05-4398          0     47.01


#### Reset the index to start from 0

In [33]:
# Renumber the remaining rows from 0; drop=True discards the old row labels
# instead of keeping them as an extra column.
clinical_data = (
    clinical_data
    .reset_index(drop=True)
)

print(clinical_data.iloc[:10])

  Patient Identifier  OS_STATUS OS_MONTHS
0       TCGA-05-4245          0     23.98
1       TCGA-05-4249          0     50.03
2       TCGA-05-4250          1      3.98
3       TCGA-05-4382          0     19.94
4       TCGA-05-4384          0     13.99
5       TCGA-05-4389          0     44.97
6       TCGA-05-4390          0     36.99
7       TCGA-05-4396          1      9.95
8       TCGA-05-4397          1     24.01
9       TCGA-05-4398          0     47.01


### Gene data

#### Load the dataset

In [34]:
# Read the tab-separated gene table and transpose it, so that the file's
# columns (Hugo_Symbol, Entrez_Gene_Id and the sample IDs) become rows and
# each gene becomes a column.
gene_data = pd.read_csv('/content/luad_gene_data.txt', delimiter='\t').T

print(gene_data)

                        0          1          2        3       4       5      \
Hugo_Symbol      LOC100130426   UBE2Q2P3   UBE2Q2P3  HMGB1P1  TIMM23   MOXD2   
Entrez_Gene_Id      100130426  100133144  100134869    10357   10431  136542   
TCGA-05-4244-01       -2.2883      0.038     0.0691  -1.9057 -0.0395     NaN   
TCGA-05-4249-01       -2.2883    -0.3514     0.1971   -0.295  0.1945     NaN   
TCGA-05-4250-01       -2.2883    -0.3435    -0.7239  -1.9091  0.7761     NaN   
...                       ...        ...        ...      ...     ...     ...   
TCGA-NJ-A55O-01       -2.2883     0.5729     1.0176  -0.0218  0.0408     NaN   
TCGA-NJ-A55R-01       -2.2883    -0.1679    -0.0462  -0.8099 -0.3206     NaN   
TCGA-NJ-A7XG-01       -2.2883     1.8645     2.7613  -0.4522 -0.6611     NaN   
TCGA-O1-A52J-01       -2.2883     0.4532      1.087  -1.3473  0.7679     NaN   
TCGA-S2-AA1A-01       -2.2883     0.9225    -0.2293  -0.7319  -1.161     NaN   

                     6         7       

#### Use the gene symbols as column names

In [35]:
# Promote the first row (the gene symbols) to column labels.
gene_data.columns = gene_data.iloc[0]

# Remove the two identifier rows so that only sample rows remain, then cast
# the remaining values to float.
gene_data = gene_data.drop(index=['Hugo_Symbol', 'Entrez_Gene_Id']).astype(float)

print(gene_data)

Hugo_Symbol      LOC100130426  UBE2Q2P3  UBE2Q2P3  HMGB1P1  TIMM23  MOXD2  \
TCGA-05-4244-01       -2.2883    0.0380    0.0691  -1.9057 -0.0395    NaN   
TCGA-05-4249-01       -2.2883   -0.3514    0.1971  -0.2950  0.1945    NaN   
TCGA-05-4250-01       -2.2883   -0.3435   -0.7239  -1.9091  0.7761    NaN   
TCGA-05-4382-01       -2.2883    0.1873   -0.4402  -0.5333 -0.1787    NaN   
TCGA-05-4384-01       -2.2883   -1.2251   -1.3555  -0.8895 -1.1778    NaN   
...                       ...       ...       ...      ...     ...    ...   
TCGA-NJ-A55O-01       -2.2883    0.5729    1.0176  -0.0218  0.0408    NaN   
TCGA-NJ-A55R-01       -2.2883   -0.1679   -0.0462  -0.8099 -0.3206    NaN   
TCGA-NJ-A7XG-01       -2.2883    1.8645    2.7613  -0.4522 -0.6611    NaN   
TCGA-O1-A52J-01       -2.2883    0.4532    1.0870  -1.3473  0.7679    NaN   
TCGA-S2-AA1A-01       -2.2883    0.9225   -0.2293  -0.7319 -1.1610    NaN   

Hugo_Symbol      LOC155060  RNU12-2P    SSX9  LOC317712  ...    ZXDA    ZXD

#### Drop columns with any NaN values

In [36]:
# Discard every gene (column) that has a missing value for at least one sample.
gene_data = gene_data.dropna(axis='columns', how='any')

print(gene_data)

Hugo_Symbol      LOC100130426  UBE2Q2P3  UBE2Q2P3  HMGB1P1  TIMM23  LOC155060  \
TCGA-05-4244-01       -2.2883    0.0380    0.0691  -1.9057 -0.0395     1.0624   
TCGA-05-4249-01       -2.2883   -0.3514    0.1971  -0.2950  0.1945    -0.0690   
TCGA-05-4250-01       -2.2883   -0.3435   -0.7239  -1.9091  0.7761    -1.4074   
TCGA-05-4382-01       -2.2883    0.1873   -0.4402  -0.5333 -0.1787     0.5870   
TCGA-05-4384-01       -2.2883   -1.2251   -1.3555  -0.8895 -1.1778     0.7614   
...                       ...       ...       ...      ...     ...        ...   
TCGA-NJ-A55O-01       -2.2883    0.5729    1.0176  -0.0218  0.0408     0.6719   
TCGA-NJ-A55R-01       -2.2883   -0.1679   -0.0462  -0.8099 -0.3206     1.1540   
TCGA-NJ-A7XG-01       -2.2883    1.8645    2.7613  -0.4522 -0.6611     1.0220   
TCGA-O1-A52J-01       -2.2883    0.4532    1.0870  -1.3473  0.7679     0.1428   
TCGA-S2-AA1A-01       -2.2883    0.9225   -0.2293  -0.7319 -1.1610     1.1789   

Hugo_Symbol      RNU12-2P  

#### Remove the trailing '-01' sample suffix from the index

In [37]:
# Make the row labels strings, then strip a trailing "-<digits>" group from
# each one (e.g. 'TCGA-05-4244-01' -> 'TCGA-05-4244').
# NOTE(review): presumably done so the labels match 'Patient Identifier' in the
# clinical table; two samples of the same patient would collapse to the same
# label — confirm that cannot happen in this file.
gene_data.index = gene_data.index.astype(str).str.replace(r'-\d+$', '', regex=True)

print(gene_data)

Hugo_Symbol   LOC100130426  UBE2Q2P3  UBE2Q2P3  HMGB1P1  TIMM23  LOC155060  \
TCGA-05-4244       -2.2883    0.0380    0.0691  -1.9057 -0.0395     1.0624   
TCGA-05-4249       -2.2883   -0.3514    0.1971  -0.2950  0.1945    -0.0690   
TCGA-05-4250       -2.2883   -0.3435   -0.7239  -1.9091  0.7761    -1.4074   
TCGA-05-4382       -2.2883    0.1873   -0.4402  -0.5333 -0.1787     0.5870   
TCGA-05-4384       -2.2883   -1.2251   -1.3555  -0.8895 -1.1778     0.7614   
...                    ...       ...       ...      ...     ...        ...   
TCGA-NJ-A55O       -2.2883    0.5729    1.0176  -0.0218  0.0408     0.6719   
TCGA-NJ-A55R       -2.2883   -0.1679   -0.0462  -0.8099 -0.3206     1.1540   
TCGA-NJ-A7XG       -2.2883    1.8645    2.7613  -0.4522 -0.6611     1.0220   
TCGA-O1-A52J       -2.2883    0.4532    1.0870  -1.3473  0.7679     0.1428   
TCGA-S2-AA1A       -2.2883    0.9225   -0.2293  -0.7319 -1.1610     1.1789   

Hugo_Symbol   RNU12-2P    SSX9   EZHIP  EFCAB8  ...    ZXDA    

#### Check for and resolve duplicated genes

In [38]:
# Collect the gene symbols that label more than one column. With keep='first'
# the first occurrence of a symbol is not flagged, so a symbol is listed once
# for every *additional* copy.
duplicated_genes = gene_data.columns[gene_data.columns.duplicated(keep='first')]

# Show the symbols and how many there are.
print(duplicated_genes, len(duplicated_genes), sep='\n')

Index(['UBE2Q2P3', 'CC2D2B', 'CCDC7', 'CYorf15B', 'C1orf84', 'LINC00875',
       'ELMOD1', 'NBPF16', 'NEBL', 'NKAIN3', 'C5orf23', 'PALM2AKAP2',
       'PLEKHG7', 'QSOX1', 'SH3D20', 'SNAP47', 'NCRNA00185'],
      dtype='object', name='Hugo_Symbol')
17


In [39]:
# Resolve duplicated gene symbols: for each symbol that labels more than one
# column, keep the copy with the largest variance across samples.
#
# * The duplicated symbols are derived from the current columns, so the cell
#   is a no-op when it is re-run after de-duplication.
# * The variance is computed with DataFrame.var(axis=0, ddof=0): the same
#   per-column population variance the previous np.var(DataFrame) call gave,
#   but without relying on the axis=None behaviour that pandas has deprecated
#   (the likely source of the warning this cell used to emit).
# * Any number of copies is handled; on a tie the earliest copy wins.
for gene in gene_data.columns[gene_data.columns.duplicated()].unique():
  copies = gene_data[gene]  # DataFrame with every column labelled `gene`
  copy_variances = copies.var(axis=0, ddof=0).to_numpy()
  best_copy = copies.iloc[:, int(np.argmax(copy_variances))]

  # Assigning a Series to a duplicated label writes it into every copy, so the
  # first occurrence (the one kept below) ends up holding the winning values
  # while the column order stays unchanged.
  gene_data[gene] = best_copy

# Keep only the first column for each symbol.
gene_data = gene_data.iloc[:, ~gene_data.columns.duplicated()]
print(gene_data)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Hugo_Symbol   LOC100130426  UBE2Q2P3  HMGB1P1  TIMM23  LOC155060  RNU12-2P  \
TCGA-05-4244       -2.2883    0.0380  -1.9057 -0.0395     1.0624    0.5387   
TCGA-05-4249       -2.2883   -0.3514  -0.2950  0.1945    -0.0690    1.4599   
TCGA-05-4250       -2.2883   -0.3435  -1.9091  0.7761    -1.4074   -2.1796   
TCGA-05-4382       -2.2883    0.1873  -0.5333 -0.1787     0.5870   -0.6958   
TCGA-05-4384       -2.2883   -1.2251  -0.8895 -1.1778     0.7614   -0.3706   
...                    ...       ...      ...     ...        ...       ...   
TCGA-NJ-A55O       -2.2883    0.5729  -0.0218  0.0408     0.6719    0.7543   
TCGA-NJ-A55R       -2.2883   -0.1679  -0.8099 -0.3206     1.1540   -0.1200   
TCGA-NJ-A7XG       -2.2883    1.8645  -0.4522 -0.6611     1.0220   -2.1796   
TCGA-O1-A52J       -2.2883    0.4532  -1.3473  0.7679     0.1428    2.9555   
TCGA-S2-AA1A       -2.2883    0.9225  -0.7319 -1.1610     1.1789   -0.2677   

Hugo_Symbol     SSX9   EZHIP  EFCAB8  SRP14P1  ...    ZXDA    Z