# Glioma Grading Clinical and Mutation Features

In [1]:
from pathlib import Path

import pandas as pd

from src.data_processing import convert_age, encode_dataset

In [2]:
# Root folder holding the CSV splits; both dataset paths are derived from it.
DATASETS_PATH = Path('datasets')
train_dataset_path = DATASETS_PATH.joinpath('train.csv')
test_dataset_path = DATASETS_PATH.joinpath('test.csv')

In [3]:
# Read the training split, using the first CSV column as the row index,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer=train_dataset_path, index_col=0)
data.head(n=5)

Unnamed: 0,Grade,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,PTEN,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,GBM,TCGA-06-A6S0,Male,79 years 183 days,Glioblastoma,black or african american,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
1,LGG,TCGA-HT-8106,Male,53 years 197 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED
2,LGG,TCGA-HT-7607,Female,61 years 305 days,"Astrocytoma, NOS",white,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
3,LGG,TCGA-QH-A6X5,Female,58 years 55 days,Mixed glioma,white,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED
4,LGG,TCGA-P5-A5F1,Male,30 years 113 days,"Astrocytoma, NOS",white,MUTATED,MUTATED,MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED


### Column Analysis

* The `Grade` column is our target.

* The `Primary_Diagnosis` column has 6 unique values; we will encode it with `LabelEncoder`.

* The `Case_ID` column is a unique identifier for each case, so we will remove it from our dataset.

* The following columns represent gene mutations.
    ```text
    IDH1, TP53, ATRX, PTEN, EGFR, CIC, MUC16, PIK3CA,
    NF1, PIK3R1, FUBP1, RB1, NOTCH1, BCOR, CSMD3, SMARCA4,
    GRIN2A, IDH2, FAT4, PDGFRA
    ```
    Each of them can only be `MUTATED` or `NOT_MUTATED`, so we will encode them with `LabelEncoder`.

* The binary type column `Gender` will be encoded with `LabelEncoder` too.

* The `Age_at_diagnosis` column stores the age as a string (e.g. `79 years 183 days`). We will convert it to a numeric age in years.

### Missing Data

We detected that 4 cases have no `Age_at_diagnosis` data. We decided to remove them from the training set.

# Data processing

In [7]:
# Work on a copy of the raw frame without the `Case_ID` identifier column,
# leaving `data` itself untouched.
data_processing = data.copy().drop(columns=['Case_ID'])

# Replace the textual age with whatever `convert_age` returns for it, then
# discard every row that still contains a missing value in any column.
data_processing['Age_at_diagnosis'] = convert_age(data_processing['Age_at_diagnosis'])
data_processing = data_processing.dropna(axis='index', how='any')

# Every remaining object-dtype column (target included) is passed to
# `encode_dataset`; verbose=True is what prints the per-column class listing.
object_columns = data_processing.select_dtypes(include='object').columns.tolist()
data_processing = encode_dataset(data_processing, verbose=True, columns=object_columns)

Grade: ['GBM' 'LGG']
Gender: ['Female' 'Male']
Primary_Diagnosis: ['Astrocytoma, NOS' 'Astrocytoma, anaplastic' 'Glioblastoma'
 'Mixed glioma' 'Oligodendroglioma, NOS' 'Oligodendroglioma, anaplastic']
Race: ['american indian or alaska native' 'asian' 'black or african american'
 'not reported' 'white']
IDH1: ['MUTATED' 'NOT_MUTATED']
TP53: ['MUTATED' 'NOT_MUTATED']
ATRX: ['MUTATED' 'NOT_MUTATED']
PTEN: ['MUTATED' 'NOT_MUTATED']
EGFR: ['MUTATED' 'NOT_MUTATED']
CIC: ['MUTATED' 'NOT_MUTATED']
MUC16: ['MUTATED' 'NOT_MUTATED']
PIK3CA: ['MUTATED' 'NOT_MUTATED']
NF1: ['MUTATED' 'NOT_MUTATED']
PIK3R1: ['MUTATED' 'NOT_MUTATED']
FUBP1: ['MUTATED' 'NOT_MUTATED']
RB1: ['MUTATED' 'NOT_MUTATED']
NOTCH1: ['MUTATED' 'NOT_MUTATED']
BCOR: ['MUTATED' 'NOT_MUTATED']
CSMD3: ['MUTATED' 'NOT_MUTATED']
SMARCA4: ['MUTATED' 'NOT_MUTATED']
GRIN2A: ['MUTATED' 'NOT_MUTATED']
IDH2: ['MUTATED' 'NOT_MUTATED']
FAT4: ['MUTATED' 'NOT_MUTATED']
PDGFRA: ['MUTATED' 'NOT_MUTATED']


In [5]:
# Display the processed frame to inspect the encoded values. Row labels keep
# the original index, so gaps are expected where `dropna` removed rows.
data_processing

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,1,79.501370,2,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,53.539726,1,4,0,0,1,1,1,...,1,1,1,1,1,1,1,1,0,1
2,1,0,61.835616,0,4,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,0,58.150685,3,4,0,1,1,1,1,...,1,1,1,0,1,1,1,1,1,1
4,1,1,30.309589,0,4,0,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,1,0,31.021918,0,2,0,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
599,1,0,27.846575,1,4,0,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
600,1,1,29.087671,3,4,0,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
601,1,1,51.854795,5,4,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [6]:
# Largest encoded `Primary_Diagnosis` label. The displayed frame shows labels
# starting at 0, so the number of classes is this value plus one.
max(data_processing['Primary_Diagnosis'].tolist())

5