In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import OrderedDict

from sklearn import datasets
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
# Number of decimal places used for pandas' "display.precision" option in a later cell.
DISPLAY_PRECISION = 4

In [6]:
# Limit how many decimals pandas prints for floating-point values.
pd.options.display.precision = DISPLAY_PRECISION

# Load scikit-learn's bundled breast cancer dataset.
dat = datasets.load_breast_cancer()
# To read the detailed dataset description, run: print(dat.DESCR)

In [7]:
# Swap the order of the two class names so that the list index matches the
# re-encoded labels built below (index 0 <- original class 1, index 1 <- original class 0).
li_classes = [dat.target_names[idx] for idx in (1, 0)]

# Invert the dataset's label encoding: original label 0 becomes 1, everything else becomes 0.
li_target = [int(label == 0) for label in list(dat.target)]

# Feature names as a plain Python list, used later as DataFrame column labels.
li_ftrs = [name for name in dat.feature_names]

In [8]:
# Gather the figures first, then print the class summary in one go.
n_targets = len(li_target)
class_counts = pd.Series(li_target).value_counts()
separator = "---"

print("There are 2 target classes:")
print("li_classes", li_classes)
print(separator)
print("Target class distribution from a total of %d target values:" % n_targets)
print(class_counts)
print(separator)


There are 2 target classes:
li_classes ['benign', 'malignant']
---
Target class distribution from a total of 569 target values:
0    357
1    212
dtype: int64
---


In [9]:
# Wrap the raw feature matrix in a DataFrame with one named column per feature.
df_all = pd.DataFrame(dat.data, columns=li_ftrs)

# Summary statistics for the first six columns only, to keep the printed table narrow.
first_six_summary = df_all.iloc[:, :6].describe()
print("Describe dataframe, first 6 columns:")
print(first_six_summary.to_string())
     

Describe dataframe, first 6 columns:
       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness
count     569.0000      569.0000         569.000   569.0000         569.0000          569.0000
mean       14.1273       19.2896          91.969   654.8891           0.0964            0.1043
std         3.5240        4.3010          24.299   351.9141           0.0141            0.0528
min         6.9810        9.7100          43.790   143.5000           0.0526            0.0194
25%        11.7000       16.1700          75.170   420.3000           0.0864            0.0649
50%        13.3700       18.8400          86.240   551.1000           0.0959            0.0926
75%        15.7800       21.8000         104.100   782.7000           0.1053            0.1304
max        28.1100       39.2800         188.500  2501.0000           0.1634            0.3454


In [10]:
# Build a combined frame: every feature column plus the re-encoded label as 'target'.
# assign() returns a new DataFrame, so df_all itself is left without a 'target' column.
train_df = df_all.assign(target=li_target)

Question: (2 min)
1.) Which variable is most strongly correlated with the target?

Question: (8 min)
2.) For the five variables most correlated with the target, what is the quantitative difference, with tolerance, between Benign (Negative) and Malignant (Positive)?

In [15]:
# Pairwise Pearson correlation between the feature columns of df_all
# (the target is not a column of df_all, so it does not appear in this matrix).
corr_matrix = df_all.corr(method="pearson")


In [16]:
# Display the feature-to-feature correlation matrix computed from df_all.
corr_matrix

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.0,0.3238,0.9979,0.9874,0.1706,0.5061,0.6768,0.8225,0.1477,-0.3116,...,0.9695,0.297,0.9651,0.9411,0.1196,0.4135,0.5269,0.7442,0.164,0.0071
mean texture,0.3238,1.0,0.3295,0.3211,-0.0234,0.2367,0.3024,0.2935,0.0714,-0.0764,...,0.3526,0.912,0.358,0.3435,0.0775,0.2778,0.301,0.2953,0.105,0.1192
mean perimeter,0.9979,0.3295,1.0,0.9865,0.2073,0.5569,0.7161,0.851,0.183,-0.2615,...,0.9695,0.303,0.9704,0.9415,0.1505,0.4558,0.5639,0.7712,0.1891,0.051
mean area,0.9874,0.3211,0.9865,1.0,0.177,0.4985,0.686,0.8233,0.1513,-0.2831,...,0.9627,0.2875,0.9591,0.9592,0.1235,0.3904,0.5126,0.722,0.1436,0.0037
mean smoothness,0.1706,-0.0234,0.2073,0.177,1.0,0.6591,0.522,0.5537,0.5578,0.5848,...,0.2131,0.0361,0.2389,0.2067,0.8053,0.4725,0.4349,0.5031,0.3943,0.4993
mean compactness,0.5061,0.2367,0.5569,0.4985,0.6591,1.0,0.8831,0.8311,0.6026,0.5654,...,0.5353,0.2481,0.5902,0.5096,0.5655,0.8658,0.8163,0.8156,0.5102,0.6874
mean concavity,0.6768,0.3024,0.7161,0.686,0.522,0.8831,1.0,0.9214,0.5007,0.3368,...,0.6882,0.2999,0.7296,0.676,0.4488,0.755,0.8841,0.8613,0.4095,0.5149
mean concave points,0.8225,0.2935,0.851,0.8233,0.5537,0.8311,0.9214,1.0,0.4625,0.1669,...,0.8303,0.2928,0.8559,0.8096,0.4528,0.6675,0.7524,0.9102,0.3757,0.3687
mean symmetry,0.1477,0.0714,0.183,0.1513,0.5578,0.6026,0.5007,0.4625,1.0,0.4799,...,0.1857,0.0907,0.2192,0.1772,0.4267,0.4732,0.4337,0.4303,0.6998,0.4384
mean fractal dimension,-0.3116,-0.0764,-0.2615,-0.2831,0.5848,0.5654,0.3368,0.1669,0.4799,1.0,...,-0.2537,-0.0513,-0.2052,-0.2319,0.5049,0.4588,0.3462,0.1753,0.334,0.7673


In [29]:
# Correlate every feature column with the re-encoded target column of train_df;
# the result is one coefficient per feature, in the original column order.
target_col = train_df.loc[:, "target"]
df_all.corrwith(target_col)

mean radius                0.7300
mean texture               0.4152
mean perimeter             0.7426
mean area                  0.7090
mean smoothness            0.3586
mean compactness           0.5965
mean concavity             0.6964
mean concave points        0.7766
mean symmetry              0.3305
mean fractal dimension    -0.0128
radius error               0.5671
texture error             -0.0083
perimeter error            0.5561
area error                 0.5482
smoothness error          -0.0670
compactness error          0.2930
concavity error            0.2537
concave points error       0.4080
symmetry error            -0.0065
fractal dimension error    0.0780
worst radius               0.7765
worst texture              0.4569
worst perimeter            0.7829
worst area                 0.7338
worst smoothness           0.4215
worst compactness          0.5910
worst concavity            0.6596
worst concave points       0.7936
worst symmetry             0.4163
worst fractal 

# Answer to question 1: "worst concave points" has the highest correlation with the target (0.7936).