# 모델 생성

In [1]:
import pandas as pd
import joblib
import sklearn.datasets as sd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 유방암 데이터

In [2]:
df_train = pd.read_csv('../static/data/cancer_train.csv')
df_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,...,31.64,143.7,1226.0,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019,0
1,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,...,31.68,129.7,1175.0,0.1395,0.3055,0.2992,0.1312,0.348,0.07619,0
2,22.27,19.67,152.8,1509.0,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,...,28.01,206.8,2360.0,0.1701,0.6997,0.9608,0.291,0.4055,0.09789,0
3,11.76,18.14,75.0,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,...,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915,0
4,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,...,34.69,135.1,1320.0,0.1315,0.1806,0.208,0.1136,0.2504,0.07948,0


### 결정 트리

In [3]:
X_train = df_train.drop(columns='target', axis=1)
y_train = df_train.target.values
X_train.shape, y_train.shape

((426, 30), (426,))

In [4]:
df_test = pd.read_csv('../static/data/cancer_test.csv')
X_test = df_test.drop(columns='target', axis=1)
y_test = df_test.target.values

In [5]:
dtc = DecisionTreeClassifier()
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [6]:
params = {
    'max_depth': [2, 3, 4, 5, 7],
    'min_samples_split': [2, 3, 4, 5]
}

In [7]:
grid_cv = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9272
최적 파라미터: {'max_depth': 3, 'min_samples_split': 4}


In [8]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9440559440559441

In [9]:
joblib.dump(best_dt, '../static/model/cancer_dt.pkl')

['../static/model/cancer_dt.pkl']

### 정규화

In [10]:
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(df_train.drop(columns='target', axis=1))
X_test_sc = scaler.fit_transform(df_test.drop(columns='target', axis=1))

### SVM

In [11]:
svc = SVC()
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [24]:
params = {'C': [6, 7, 8]}

In [25]:
grid_cv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_sc, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9766
최적 파라미터: {'C': 8}


In [26]:
best_svc = grid_cv.best_estimator_
pred = best_svc.predict(X_test_sc)
accuracy_score(y_test, pred)

0.8881118881118881

In [27]:
joblib.dump(best_svc, '../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

### Logistic Regression

In [28]:
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [33]:
params = {'C': [5, 6, 7, 8]}

In [34]:
grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_sc, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9718
최적 파라미터: {'C': 6}


In [35]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_sc)
accuracy_score(y_test, pred)

0.9300699300699301

In [36]:
joblib.dump(best_lr, '../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']