# SVM (Support Vector Machine)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import  precision_score, accuracy_score, recall_score, roc_curve, roc_auc_score

# Load the Social Network Ads data from a CSV path relative to the working directory.
dataset = pd.read_csv('datasets/Social_Network_Ads.csv')

In [2]:
# Preview the first rows to check the column layout before selecting features.
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Features are positional columns 2-3 (Age, EstimatedSalary); the target is
# positional column 4 (Purchased). Both are converted to float32 NumPy arrays.
X = dataset.iloc[:, 2:4].values.astype(np.float32)
y = dataset.iloc[:, 4].values.astype(np.float32)

In [4]:
# Split the dataset into training and test sets: 20% is held out for testing,
# and random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Feature scaling: the scaler's statistics are learned from the training split
# only, then the same transform is applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
print(X_train.shape)
# Show the first ten scaled training rows as the cell output.
X_train[:10]

(320, 2)


array([[ 1.9229547 ,  2.1460156 ],
       [ 2.0201657 ,  0.3787193 ],
       [-1.3822186 , -0.4324987 ],
       [-1.1877967 , -1.0119401 ],
       [ 1.9229547 , -0.9250239 ],
       [ 0.367579  ,  0.2918031 ],
       [ 0.17315707,  0.14694272],
       [ 2.0201657 ,  1.7404066 ],
       [ 0.75642294, -0.8381077 ],
       [ 0.27036804, -0.28763834]], dtype=float32)

In [6]:
# Fit an RBF-kernel support vector classifier (SVC) on the scaled training set.
# gamma is pinned to 'auto' (1 / n_features), the setting the deprecated implicit
# default resolved to, so the fit does not depend on the library's changing default.
classifier = SVC(kernel='rbf', gamma='auto')

classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [7]:
# test set 으로 predict
y_pred = classifier.predict(X_test)
y_pred

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1.], dtype=float32)

In [8]:
# making confusion matrix
# Confusion matrix and per-class report. labels=[1, 0] lists class 1 of the
# Purchased target first; matrix rows are true labels, columns are predictions.
print("confusion matrix\n", confusion_matrix(y_test, y_pred, labels=[1, 0]))
print()
print("classification report\n", classification_report(y_test, y_pred, labels=[1, 0]))

confution matrix
 [[21  1]
 [ 3 55]]

classification report
               precision    recall  f1-score   support

           1       0.88      0.95      0.91        22
           0       0.98      0.95      0.96        58

   micro avg       0.95      0.95      0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80



In [9]:
# Overall accuracy plus precision and recall of the positive class.
# Under the default average='binary' these scorers ignore `labels`, so the
# positive class is stated explicitly through pos_label instead.
print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, y_pred, pos_label=1)))
print("Recall: {:.2f}".format(recall_score(y_test, y_pred, pos_label=1)))

Accuracy: 0.95
Precision: 0.88
Recall: 0.95


# 실습

[UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)) 를 이용한 종양의 양성, 악성 여부 check  

[cell_samples.csv](https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/cell_samples.csv)

1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [10]:
# Load the cell-sample (tumor measurement) data from a CSV path relative to the
# working directory.
cell_df = pd.read_csv("datasets/cell_samples.csv")

In [11]:
# Inspect per-column dtypes to spot any non-numeric columns.
cell_df.dtypes

ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object

### ID column 은 종양의 양성, 악성 판별과 무관한 것이 확실하므로 feature 에서 제외

### BareNuc 역시 numerical data 가 아니므로 drop

In [12]:
# Remove the identifier column (ID) and the object-typed column (BareNuc);
# cell_df itself is left unchanged.
feature_df = cell_df.drop(columns=['ID', 'BareNuc'])

In [13]:
# Preview the columns that remain after the drop.
feature_df.head()

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit,Class
0,5,1,1,1,2,3,1,1,2
1,5,4,4,5,7,3,2,1,2
2,3,1,1,1,2,3,1,1,2
3,6,8,8,1,3,3,7,1,2
4,4,1,1,3,2,3,1,1,2


### dataframe 을 numpy array 로 변환

## Train/Test dataset 분리

In [None]:
# your code below