# 분류 문제 데이터셋
## 아래의 예제 중 원하는 데이터 Cell 만 실행
- 필기체 숫자 데이터 분류 문제 (MNIST)
- 유방암 악성/양성 분류 문제
- 당뇨병 환자 분류 문제
- 와인 종류 분류 문제

In [3]:
### MNIST
from sklearn import datasets
digits=datasets.load_digits()

n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, digits.target, test_size=0.3, shuffle=True)


(1797, 64)


In [None]:
### 유방암
from sklearn import datasets
cancer=datasets.load_breast_cancer()

n_samples = len(cancer.data)
data = cancer.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, cancer.target, test_size=0.3, shuffle=True)


(569, 30)


In [None]:
### 당뇨병
from sklearn import datasets
diabetes=datasets.load_diabetes()

n_samples = len(diabetes.data)
data = diabetes.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, diabetes.target, test_size=0.3, shuffle=True)


(442, 10)


In [48]:
### 와인
from sklearn import datasets
wine=datasets.load_wine()

n_samples = len(wine.data)
data = wine.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, wine.target, test_size=0.3, shuffle=True)


(178, 13)


# 분류 알고리즘 별 성능 평가
## [1] 데이터 전처리 본인 스스로 공부!!
## [2] 각 알고리즘 마다 설정 가능한 파라미터는 본인 스스로 공부!
- Decision Tree Classifier
- DA Classifier
- Logistic Regression Classifier
- KNN Classifier

### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=1) # gini, max_depth
dtc.fit(X_train, y_train)
y_train_pred = dtc.predict(X_train)
y_test_pred = dtc.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.9919354838709677
0.9259259259259259
[[21  0  0]
 [ 3 16  1]
 [ 0  0 13]]


### LDA

In [49]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

cld = LinearDiscriminantAnalysis()
cld.fit(X_train,y_train)
y_train_pred = cld.predict(X_train)
y_test_pred = cld.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.9919354838709677
1.0
[[19  0  0]
 [ 0 16  0]
 [ 0  0 19]]


### QDA

In [50]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

cld = QuadraticDiscriminantAnalysis()
cld.fit(X_train,y_train)
y_train_pred = cld.predict(X_train)
y_test_pred = cld.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.9919354838709677
0.9629629629629629
[[17  2  0]
 [ 0 16  0]
 [ 0  0 19]]


### Logistic Regression

In [47]:
# 표준화
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1, penalty='l2', C=1.0)
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)


from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.9838709677419355
0.9629629629629629
[[19  1  0]
 [ 0 18  1]
 [ 0  0 15]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### KNN

In [34]:
# 표준화
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, p=1)
knn.fit(X_train,y_train)

y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.8629032258064516
0.7777777777777778
[[19  2  2]
 [ 1 18  1]
 [ 1  5  5]]
