# Glioma Grading Clinical and Mutation Features

In [None]:
from pathlib import Path
import shutil
import os

import pandas as pd
import matplotlib.font_manager
import warnings
warnings.filterwarnings("ignore")

from src.data_processing import process_data, normalize_dataset
from src.feature_estimators import get_feature_estimator, select_by_correlation_value, concat_important_features
from src.visualize import make_2d_representation

# Data processing
---

In [None]:
# Directory that holds the dataset CSV files.
DATASETS_PATH = Path('datasets')

# Locations of the training and test splits inside that directory.
train_dataset_path = DATASETS_PATH.joinpath('train.csv')
test_dataset_path = DATASETS_PATH.joinpath('test.csv')

In [None]:
# Read both splits, taking the first CSV column as the row index.
train_dataset, test_dataset = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_dataset_path, test_dataset_path)
)

# Preview the first rows of the training split as the cell output.
train_dataset.head()

### Column Analysis

* The `Grade` column is our target.

* The `Primary_Diagnosis` column has 5 unique values; we will encode it with `LabelEncoder`.

* The `Case_ID` column is a unique identifier of each case, so we will remove it from our dataset.

* The following columns represent gene mutations.
    ```text
    IDH1, TP53, ATRX, PTEN, EGFR, CIC, MUC16, PIK3CA,
    NF1, PIK3R1, FUBP1, RB1, NOTCH1, BCOR, CSMD3, SMARCA4,
    GRIN2A, IDH2, FAT4, PDGFRA
    ```
    Each of them can only be `MUTATED` or `NOT_MUTATED`, so we will encode them with `LabelEncoder`.

* The binary type column `Gender` will be encoded with `LabelEncoder` too.

* The `Age_at_diagnosis` column contains a string representation of a date. We will convert it to a numeric type.

### Missing Data

We detected that 4 cases have no `Age_at_diagnosis` data. We decided to remove them from the training set.

In [None]:
# Encoder name handed to `process_data` for both splits.
encoder = 'Label'

# Only the training call names the target column ('Grade'); the test call
# passes no target.
# NOTE(review): presumably the test split has no 'Grade' column — confirm.
train_dataset = process_data(train_dataset, encoder=encoder, target='Grade')
test_dataset = process_data(test_dataset, encoder=encoder)

A 2D representation of our data is shown in the figure below.

In [None]:
# Build the 2D representation of the processed training data; the projection
# itself is implemented in `src.visualize.make_2d_representation`.
figure = make_2d_representation(train_dataset)

## Feature selection
---

To investigate the most valuable features we decided to calculate some correlation metrics.

In [None]:
# Work from a copy of the processed training set and read the labels from it.
data_for_feature_analysis = train_dataset.copy()
targets = data_for_feature_analysis['Grade']

# Normalize the training features (everything except the 'Grade' target)
# and the test features; each frame goes through `normalize_dataset` separately.
training_data = normalize_dataset(train_dataset.drop('Grade', axis=1))
validation_data = normalize_dataset(test_dataset)

### Mutual information
Mutual information is a lot like correlation in that it measures a relationship between two quantities. The advantage of mutual information is that it can detect any kind of relationship, while correlation only detects linear relationships.


In [None]:
# Build the feature estimator for `training_data` against `targets` using the
# mutual-information (classification) method.
correlation_estimator = get_feature_estimator(training_data, targets, method='mutual_info-classification')
# Select features from the estimator with a minimum score of 0.2.
mutual_important_values = select_by_correlation_value(correlation_estimator, min_score=0.2)
# Bare expression: displayed as the cell output.
mutual_important_values

The most valuable features are:
```text
Primary_Diagnosis, IDH1, Age_at_diagnosis
```

The least valuable features are:
```text
Gender, BCOR, FAT4, PIK3CA, Race, GRIN2A, PIK3R1
```

### Pearson's correlation

The Pearson correlation measures the strength of the linear relationship between two variables.

In [None]:
# Rebind `correlation_estimator` to a Pearson-based estimator for the same
# features and targets.
correlation_estimator = get_feature_estimator(training_data, targets, method='pearson')
# Select features from the estimator with a minimum score of 0.2.
pearson_important_values = select_by_correlation_value(correlation_estimator, min_score=0.2)
# Bare expression: displayed as the cell output.
pearson_important_values

The most valuable features are:
```text
IDH1, Age_at_diagnosis, PTEN, ATRX, CIC
```

The least valuable features are:
```text
BCOR, PIK3CA, FAT4
```

### Selected Features

We decided to automatically select features whose correlation metric is greater than 0.2.

In [None]:
# Combine the features selected by the Pearson and mutual-information steps.
keep_columns = concat_important_features(pearson_important_values, mutual_important_values)
training_data = training_data[keep_columns]
# Slice the already-normalized validation frame rather than the raw test set,
# so validation features go through the same normalization step as the
# training features the models are fitted on.
validation_data = validation_data[keep_columns]
# Bare expression: displayed as the cell output.
training_data

## Experiments with model

In [None]:
from src.models import logreg_classifier, catboost_classifier, rf_classifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 30% of the labelled data for scoring; the fixed seed keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(training_data, targets, test_size=0.3, random_state=42)

# Display name -> factory. Each factory is called as factory(x_train, y_train)
# and the object it returns is used through `.predict()`.
model_map = {
    'Logistic Regression Classifier': logreg_classifier,
    'CatBoost Classifier': catboost_classifier,
    'Random Forest': rf_classifier,
}

# Start from an empty results directory so files from earlier runs do not linger.
results_path = Path('results')
if results_path.exists():
    shutil.rmtree(results_path)
results_path.mkdir(exist_ok=True)

for model_name, model in model_map.items():
    classifier = model(x_train, y_train)

    # F1 score on the held-out part of the training data.
    prediction = classifier.predict(x_test)
    score = f1_score(y_test, prediction)
    print(f'Model {model_name}: {score}')

    model_file_name = model_name.replace(' ', '_').lower()
    # `:.2f` rather than `: .2f`: the space sign flag would put a blank into
    # the file name (e.g. 'model_random_forest_ 0.85.csv').
    model_result = results_path / f'model_{model_file_name}_{score:.2f}.csv'

    val_prediction = classifier.predict(validation_data)
    # NOTE(review): predictions are inverted (0 <-> 1) before export —
    # presumably the expected output uses the opposite label encoding; confirm.
    val_prediction = [int(not value) for value in val_prediction]

    # One row per validation sample: sequential Id plus the exported grade.
    submission = pd.DataFrame({
        'Id': range(len(val_prediction)),
        'Grade': val_prediction,
    })
    submission.to_csv(model_result, index=False)

In [None]:
from src.models import voting_classifier

# Build the voting ensemble from the same training split used for the
# individual models.
ensemble_model = voting_classifier(x_train, y_train)

# Score the ensemble itself. The `classifier` variable left over from the
# model loop refers to the last single model, so predicting with it would
# report that model's score rather than the ensemble's.
prediction = ensemble_model.predict(x_test)
score = f1_score(y_test, prediction)

print(f'voting classifier score: {score}')