# TUMOR TYPE PREDICTION

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import joblib
from joblib import load

import sys
# sys.warnoptions lists the -W options passed to the interpreter. When none
# were given, install a blanket "ignore" filter so that no warning is shown.
# NOTE(review): this presumably also hides pandas warnings (e.g. chained
# assignment, deprecations) that would flag real problems — confirm it is wanted.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# dataset loading 

In [72]:
# Load the dataset from the working directory.
data = pd.read_csv('BCW_dataset.csv')

# Features: every column except the first and the last one, with the "_se"
# columns and the perimeter/area columns removed in a single drop.
X = data.iloc[:, 1:-1].drop(
    columns=[
        # "_se" columns
        'x.radius_se', 'x.texture_se', 'x.perimeter_se', 'x.area_se',
        'x.smoothness_se', 'x.compactness_se', 'x.concavity_se',
        'x.concave_pts_se', 'x.symmetry_se', 'x.fractal_dim_se',
        # perimeter / area columns
        'x.perimeter_mean', 'x.area_mean', 'x.perimeter_worst', 'x.area_worst',
    ]
)

# Target: the 'y' column with 'B' encoded as 0 and 'M' encoded as 1.
y = data[['y']].replace('B', 0).replace('M', 1)

# Plain NumPy copies of the features and the target.
X_array = np.array(X)
y_array = np.array(y)

print(f'target : {y.shape}')
print(f'features : {X.shape}')

target : (569, 1)
features : (569, 16)


# function: load a saved model and run the prediction with it

In [106]:
def model_loading_and_work(filename, model_name, X_test=None):
    """Load a saved model and return its prediction table.

    Parameters
    ----------
    filename : str
        Path of the joblib file holding the fitted model.
    model_name : str
        Prefix used for the output column names.
    X_test : array-like, optional
        Samples to score. Defaults to the notebook-level ``X_array``.

    Returns
    -------
    pandas.DataFrame
        The table built by ``prediction_function``. If loading or predicting
        fails, a single all-zero column ``{model_name}_predict`` with one row
        per sample is returned instead, and the failure is printed.
    """
    features = X_array if X_test is None else X_test
    try:
        # NOTE: joblib.load unpickles the file and can therefore execute
        # arbitrary code; only load model files from a trusted source.
        model = joblib.load(filename)
        probability = prediction_function(model, features, model_name)
    except Exception as error:
        # Best-effort: keep the notebook running, but say why the zeros are
        # there so they are not mistaken for real predictions.
        print(f'{model_name}: prediction with {filename!r} failed ({error!r}); '
              'returning a zero-filled column')
        probability = pd.DataFrame(np.zeros(len(features)),
                                   columns=[model_name + '_predict'])
    return probability

# function for prediction

In [70]:
# creates dataframe with 3 columns: probability_B, probability_M, predicted tumor type (B or M)
# predicted types are marked according to probability (>=0.99*, >=0.999**)

def prediction_function(model, X_test, name, star_thresholds=(0.99, 0.999)):
    """Build a per-sample table of class probabilities and predicted tumor type.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``. The two
        ``predict_proba`` columns are labelled B and M in that order, and the
        predicted labels 0 / 1 are displayed as 'B' / 'M'.
        NOTE(review): this presumes the model's classes are ordered
        [0, 1] = [B, M] — confirm against the training notebook.
    X_test : array-like
        Samples, passed unchanged to the model.
    name : str
        Prefix for the output column names.
    star_thresholds : iterable of float, optional
        One '*' is appended to the predicted label for every threshold that
        the larger of the two class probabilities reaches.

    Returns
    -------
    pandas.DataFrame
        Columns ``{name}_prob_B``, ``{name}_prob_M`` and ``{name}_predict``.
    """
    prob_b_col = f'{name}_prob_B'
    prob_m_col = f'{name}_prob_M'
    predict_col = f'{name}_predict'

    # predicted classes, shown as letters
    labels = pd.DataFrame(model.predict(X_test))[0].replace(0, 'B').replace(1, 'M')

    # prediction probabilities
    probability = pd.DataFrame(model.predict_proba(X_test),
                               columns=[prob_b_col, prob_m_col])

    # stars: "prob_B >= p or prob_M >= p" is the same as "max of both >= p"
    top_probability = probability[[prob_b_col, prob_m_col]].max(axis=1)
    star_counts = [sum(bool(value >= p) for p in star_thresholds)
                   for value in top_probability]

    # Build the whole column first and assign it once. Writing through
    # probability[col].iloc[i] is chained assignment, which pandas warns about
    # and which does not update the frame under Copy-on-Write.
    probability[predict_col] = [
        label + '*' * count if count else label
        for label, count in zip(labels, star_counts)
    ]

    return probability

# function: save a table to a numbered csv file

In [71]:
# saves files filename(1).csv, filename(2).csv...

def save_file_function(content, file_name, file_format='csv'):
    """Write ``content`` to the first unused ``file_name(n).file_format`` path.

    Parameters
    ----------
    content : object with a ``to_csv(path)`` method (e.g. a DataFrame)
        The table to save.
    file_name : str
        Path without the counter and the extension.
    file_format : str, optional
        Extension of the created file. It only affects the file name; the
        content is always written with ``to_csv``.

    The counter ``n`` starts at 1 and is increased until no file with that
    name exists. The outcome is printed; nothing is returned and a failed
    write does not raise.
    """
    import os

    n = 1
    name = f'{file_name}({n}).{file_format}'
    # Skip over names already taken by earlier saves.
    while os.path.isfile(name):
        n += 1
        name = f'{file_name}({n}).{file_format}'

    try:
        content.to_csv(name)
    except Exception as error:
        # Best-effort save: report the reason instead of stopping the notebook.
        print('sorry, object not saved:', error)
        return

    # abspath gives the real location on every OS, including when file_name
    # is already an absolute path or contains a directory.
    print('object saved:', os.path.abspath(name))

# prediction with models 

In [124]:
# Linear Discriminant Analysis: score the samples with the model saved in
# 'model_lda.joblib' and display the resulting table
lda_probability = model_loading_and_work(filename='model_lda.joblib', model_name='LDA')
lda_probability

Unnamed: 0,LDA_prob_B,LDA_prob_M,LDA_predict
0,9.239595e-01,0.076040,B
1,9.988494e-01,0.001151,B*
2,9.999656e-01,0.000034,B**
3,9.994607e-01,0.000539,B**
4,9.999778e-01,0.000022,B**
...,...,...,...
564,4.581967e-05,0.999954,M**
565,4.029288e-05,0.999960,M**
566,5.314704e-04,0.999469,M**
567,1.260222e-01,0.873978,M


In [125]:
# Quadratic Discriminant Analysis: score the samples with the model saved in
# 'model_qda.joblib' and display the resulting table
qda_probability = model_loading_and_work(filename='model_qda.joblib', model_name='QDA')
qda_probability

Unnamed: 0,QDA_prob_B,QDA_prob_M,QDA_predict
0,9.937488e-01,6.251161e-03,B*
1,9.999722e-01,2.783966e-05,B**
2,9.999999e-01,1.375763e-07,B**
3,9.995190e-01,4.810438e-04,B**
4,1.000000e+00,3.955391e-10,B**
...,...,...,...
564,9.519109e-29,1.000000e+00,M**
565,9.682498e-50,1.000000e+00,M**
566,4.094867e-16,1.000000e+00,M**
567,1.513718e-02,9.848628e-01,M


In [126]:
# Logistic Regression: score the samples with the model saved in
# 'log_regression.joblib' and display the resulting table
lgr_probability = model_loading_and_work(filename='log_regression.joblib', model_name='LGR')
lgr_probability

Unnamed: 0,LGR_prob_B,LGR_prob_M,LGR_predict
0,9.746063e-01,0.025394,B
1,9.818794e-01,0.018121,B
2,9.999526e-01,0.000047,B**
3,9.980490e-01,0.001951,B*
4,9.999430e-01,0.000057,B**
...,...,...,...
564,3.041849e-05,0.999970,M**
565,3.469999e-05,0.999965,M**
566,3.842496e-05,0.999962,M**
567,1.095524e-02,0.989045,M


In [127]:
# Put the three per-model tables and the encoded target side by side,
# then write the combined table to the next free BC_prediction(n).csv
result_table = pd.concat(
    objs=[lda_probability, qda_probability, lgr_probability, y],
    axis='columns',
)
save_file_function(content=result_table, file_name='BC_prediction')

object saved: C:\Users\MyWork\PycharmProjects\pythonProject3\BC_prediction(2).csv
