# Exploration project
# (3) load_breast_cancer : 유방암 여부를 진단해봅시다

In [39]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [4]:
cancer = load_breast_cancer()

In [5]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [12]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## feature, label 지정하고 target_names 출력하기

In [30]:
cancer_data = cancer.data
cancer_feature = cancer.feature_names
cancer_label = cancer.target

In [32]:
print(cancer.target_names)

['malignant' 'benign']


In [28]:
import pandas as pd

cancer_df = pd.DataFrame(data= cancer.data, columns= cancer.feature_names)
cancer_df["label"] = cancer.target

## Data describe 하기

In [29]:
cancer_df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [34]:
x_train, x_test, y_train, y_test = train_test_split(cancer_data,
                                                    cancer_label,
                                                    test_size = 0.2,
                                                    random_state = 12)

## Dicision tree

In [36]:
from sklearn.tree import DecisionTreeClassifier

tree_data = DecisionTreeClassifier(random_state=32)
tree_data.fit(x_train, y_train)
c_pred_t = tree_data.predict(x_test)

accuracy_t = accuracy_score(y_test, c_pred_t)
accuracy_t

0.9035087719298246

In [43]:
confusion_matrix(y_test, c_pred_t) #양성인데 음성이 나온 경우가 6가지나 된다. 매우 큰 오류

array([[42,  6],
       [ 5, 61]])

In [42]:
print(classification_report(y_test, c_pred_t))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88        48
           1       0.91      0.92      0.92        66

    accuracy                           0.90       114
   macro avg       0.90      0.90      0.90       114
weighted avg       0.90      0.90      0.90       114



## Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
ran_data = RandomForestClassifier(random_state=32)
ran_data.fit(x_train, y_train)
c_pred_r = ran_data.predict(x_test)

accuracy_r = accuracy_score(y_test, c_pred_r)
accuracy_r

0.9298245614035088

In [47]:
print(confusion_matrix(y_test, c_pred_r)) #양성인데 못찾아낸 경우가 7가지

[[41  7]
 [ 1 65]]


In [48]:
print(classification_report(y_test, c_pred_r))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91        48
           1       0.90      0.98      0.94        66

    accuracy                           0.93       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.93      0.93      0.93       114



## SVM

In [49]:
from sklearn import svm

In [51]:
svm_data = svm.SVC()

svm_data.fit(x_train, y_train)
c_pred_s = svm_data.predict(x_test)

accuracy_s = accuracy_score(y_test, c_pred_s)
accuracy_s

0.8859649122807017

In [52]:
print(confusion_matrix(y_test, c_pred_s)) # 양성을 13개나 못찾아냈다.

[[35 13]
 [ 0 66]]


In [53]:
print(classification_report(y_test, c_pred_s))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84        48
           1       0.84      1.00      0.91        66

    accuracy                           0.89       114
   macro avg       0.92      0.86      0.88       114
weighted avg       0.90      0.89      0.88       114



## SGD

In [55]:
from sklearn.linear_model import SGDClassifier

In [57]:
sgd_data = SGDClassifier()

sgd_data.fit(x_train, y_train)
c_pred_sg = sgd_data.predict(x_test)

accuracy_sg = accuracy_score(y_test, c_pred_sg)
accuracy_sg

0.6666666666666666

In [58]:
print(confusion_matrix(y_test, c_pred_sg)) #양성을 찾는 기준이 뭔가 잘못된듯. 음성을 양성이라고 너무 많이 판단.

[[48  0]
 [38 28]]


In [59]:
print(classification_report(y_test, c_pred_sg))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        48
           1       1.00      0.42      0.60        66

    accuracy                           0.67       114
   macro avg       0.78      0.71      0.66       114
weighted avg       0.81      0.67      0.65       114



## Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

In [62]:
Lr_data = LogisticRegression(max_iter=3000)

Lr_data.fit(x_train, y_train)
c_pred_Lr = Lr_data.predict(x_test)

accuracy_Lr = accuracy_score(y_test, c_pred_Lr)
accuracy_Lr

0.9385964912280702

In [63]:
print(confusion_matrix(y_test, c_pred_Lr)) # 양성을 7개나 못찾아냈다.

[[41  7]
 [ 0 66]]


In [64]:
print(classification_report(y_test, c_pred_Lr))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        48
           1       0.90      1.00      0.95        66

    accuracy                           0.94       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.94      0.94      0.94       114

