# Survival Prediction of Lung Cancer Patients Based on Different Feature Selection Methods Using CNN-Cox Models

## Import the Libraries

In [19]:
import pandas as pd

## Prepare the dataset

### Clinical data

#### Load the dataset

In [None]:
# Location of the tab-separated clinical patient file.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists — consider a configurable data directory.
clinical_file = 'C:/Users/kangj/CNN_Cox/luad_clinical_patient.txt'

# Parse the file as tab-delimited text into a DataFrame.
luad_clinical_data = pd.read_csv(clinical_file, sep='\t')

# Render the first 10 rows as a single string before printing.
clinical_preview = luad_clinical_data.head(10).to_string()
print(clinical_preview)

                          #Other Patient ID                         Patient Identifier  Form completion date                                                                 Neoplasm Histologic Type Name                                                                                       Tissue Prospective Collection Indicator                                                                                                            Tissue Retrospective Collection Indicator     Sex  Tumor Site                                   Race Category                                   Ethnicity Category                                                          Primary Tumor Laterality                                                                                                   Prior Cancer Diagnosis Occurence                                                  Patient Primary Tumor Site  Location lung parenchyma                                                                   Year Cancer Initial 

#### Select the relevant columns for analysis

In [21]:
# Keep only the patient ID and the two survival columns, from row label 4 onward,
# then give the survival columns short names.
# NOTE(review): rows 0-3 are presumably extra header/metadata lines carried over
# from the source file — confirm against the raw file.
survival_columns = ['Patient Identifier', 'Overall Survival Status', 'Overall Survival (Months)']
short_names = {
    'Overall Survival Status': 'OS_STATUS',
    'Overall Survival (Months)': 'OS_MONTHS',
}
luad_clinical_data = (
    luad_clinical_data[survival_columns]
    .loc[4:]
    .rename(columns=short_names)
)

print(luad_clinical_data.head(10))

   Patient Identifier   OS_STATUS OS_MONTHS
4        TCGA-05-4244    0:LIVING         0
5        TCGA-05-4245    0:LIVING     23.98
6        TCGA-05-4249    0:LIVING     50.03
7        TCGA-05-4250  1:DECEASED      3.98
8        TCGA-05-4382    0:LIVING     19.94
9        TCGA-05-4384    0:LIVING     13.99
10       TCGA-05-4389    0:LIVING     44.97
11       TCGA-05-4390    0:LIVING     36.99
12       TCGA-05-4395  1:DECEASED         0
13       TCGA-05-4396  1:DECEASED      9.95


#### Convert OS_STATUS values into binary format (1 for deceased, 0 for living)

In [22]:
# Recode the survival status as an integer flag: 1 when the text contains '1:'
# (e.g. '1:DECEASED'), otherwise 0 (e.g. '0:LIVING').
# NOTE: this cell cannot be re-run on its own output — once the column holds
# integers, the values no longer have a `.find` method.
def _status_to_event(status_text):
    """Return 1 if '1:' occurs anywhere in status_text, else 0."""
    return 0 if status_text.find('1:') == -1 else 1


luad_clinical_data["OS_STATUS"] = luad_clinical_data["OS_STATUS"].apply(_status_to_event)

print(luad_clinical_data.head(10))

   Patient Identifier  OS_STATUS OS_MONTHS
4        TCGA-05-4244          0         0
5        TCGA-05-4245          0     23.98
6        TCGA-05-4249          0     50.03
7        TCGA-05-4250          1      3.98
8        TCGA-05-4382          0     19.94
9        TCGA-05-4384          0     13.99
10       TCGA-05-4389          0     44.97
11       TCGA-05-4390          0     36.99
12       TCGA-05-4395          1         0
13       TCGA-05-4396          1      9.95


#### Check for unusable values (placeholders or zero survival time)

In [23]:
# Tally each distinct value so placeholder or otherwise unusable entries stand out.
status_tally = luad_clinical_data['OS_STATUS'].value_counts()
months_tally = luad_clinical_data['OS_MONTHS'].value_counts()

print(status_tally)
# Only the 10 most frequent OS_MONTHS values are shown.
print(months_tally.head(10))

OS_STATUS
0    334
1    188
Name: count, dtype: int64
OS_MONTHS
[Not Available]    9
0                  4
25.99              4
18.99              3
18.66              3
12.65              3
29.43              3
19.81              3
15.64              3
20.57              3
Name: count, dtype: int64


#### Drop rows where OS_MONTHS is 0 or '[Not Available]'

In [24]:
# OS_MONTHS still holds text at this point, so the unwanted entries are matched
# as strings rather than numbers.
unusable_months = ['0', '[Not Available]']
has_usable_months = ~luad_clinical_data['OS_MONTHS'].isin(unusable_months)
luad_clinical_data = luad_clinical_data[has_usable_months]

print(luad_clinical_data.head(10))

   Patient Identifier  OS_STATUS OS_MONTHS
5        TCGA-05-4245          0     23.98
6        TCGA-05-4249          0     50.03
7        TCGA-05-4250          1      3.98
8        TCGA-05-4382          0     19.94
9        TCGA-05-4384          0     13.99
10       TCGA-05-4389          0     44.97
11       TCGA-05-4390          0     36.99
13       TCGA-05-4396          1      9.95
14       TCGA-05-4397          1     24.01
15       TCGA-05-4398          0     47.01


#### Reset the index to start from 0

In [25]:
# Earlier slicing and filtering left gaps in the row labels; renumber the rows
# from 0 and discard the old labels instead of keeping them as a column.
luad_clinical_data = luad_clinical_data.reset_index(drop=True)

first_rows = luad_clinical_data.head(10)
print(first_rows)

  Patient Identifier  OS_STATUS OS_MONTHS
0       TCGA-05-4245          0     23.98
1       TCGA-05-4249          0     50.03
2       TCGA-05-4250          1      3.98
3       TCGA-05-4382          0     19.94
4       TCGA-05-4384          0     13.99
5       TCGA-05-4389          0     44.97
6       TCGA-05-4390          0     36.99
7       TCGA-05-4396          1      9.95
8       TCGA-05-4397          1     24.01
9       TCGA-05-4398          0     47.01


### Gene data