# 데이터 처리 모델 만들기

In [1]:
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
import joblib

### 유방암 데이터

In [6]:
breast_train = pd.read_csv('./static/data/breast_train.csv')
breast_test = pd.read_csv('./static/data/breast_test.csv')
X_train = breast_train.iloc[:,:-1]
X_test = breast_test.iloc[:,:-1]
y_train = breast_train['target']
y_test = breast_test['target']

In [7]:
breast_dtclf = DecisionTreeClassifier()
breast_svc = SVC()
breast_lr = LogisticRegression()
breast_knn = KNeighborsClassifier()
breast_rfclf = RandomForestClassifier()

In [8]:
breast_dtclf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [9]:
params = {
    'ccp_alpha': [0 ,1, 10],
    'min_samples_leaf': [3,4,5,6,7],
    'min_samples_split': [3, 4, 5, 8, 10]
}

In [10]:
grid_cv = GridSearchCV(breast_dtclf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)

최고 평균 정확도: 0.9343
최적 파라미터:  {'ccp_alpha': 0, 'min_samples_leaf': 3, 'min_samples_split': 3}


In [11]:
best_breast_dtclf = grid_cv.best_estimator_
pred = best_breast_dtclf.predict(X_test)
accuracy_score(y_test, pred)

0.951048951048951

In [12]:
joblib.dump(best_breast_dtclf, 'static/model/best_breast_dtclf.pkl')

['static/model/best_breast_dtclf.pkl']

In [13]:
breast_svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [14]:
params = {
    'C': [0 ,1, 10, 100],
    'cache_size': [150,200,250],
    'degree': [1,2,3,4,5]
}
grid_cv = GridSearchCV(breast_svc, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
best_breast_svc = grid_cv.best_estimator_
pred = best_breast_svc.predict(X_test)
accuracy_score(y_test, pred)

0.9300699300699301

In [15]:
joblib.dump(best_breast_svc, 'static/model/best_breast_svc.pkl')

['static/model/best_breast_svc.pkl']

In [16]:
breast_lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [17]:
params = {
    'C': [0,1,2,3,4,5,6,7,8,9,10],
    'max_iter': [50,100,150,200,250],
}
grid_cv = GridSearchCV(breast_lr, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
best_breast_lr = grid_cv.best_estimator_
pred = best_breast_lr.predict(X_test)
accuracy_score(y_test, pred)

0.9440559440559441

In [18]:
joblib.dump(best_breast_lr, 'static/model/best_breast_lr.pkl')

['static/model/best_breast_lr.pkl']

In [19]:
breast_knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [20]:
params = {
    'leaf_size': [20,25,30,35,40],
    'n_neighbors': [3,4,5,6,7],
    'p': [1,2,3]
}
grid_cv = GridSearchCV(breast_knn, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
best_breast_knn = grid_cv.best_estimator_
pred = best_breast_knn.predict(X_test)
accuracy_score(y_test, pred)

0.9440559440559441

In [21]:
joblib.dump(best_breast_knn, 'static/model/best_breast_knn.pkl')

['static/model/best_breast_knn.pkl']

In [22]:
breast_rfclf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [23]:
params = {
    'ccp_alpha': [0,1,2,3,4,5],
    'min_samples_leaf': [1,2,3,4,5],
    'min_samples_split': [1,2,3,4,5],
    'n_estimators':[50,75,100,125,150]
}
grid_cv = GridSearchCV(breast_rfclf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
best_breast_rfclf = grid_cv.best_estimator_
pred = best_breast_rfclf.predict(X_test)
accuracy_score(y_test, pred)

0.965034965034965

In [24]:
joblib.dump(best_breast_rfclf, 'static/model/best_breast_rfclf.pkl')

['static/model/best_breast_rfclf.pkl']

In [25]:
y_test[142]

1

### 당뇨병 데이터

In [56]:
diabetes_train = pd.read_csv('./static/data/diabetes_train.csv')
diabetes_test = pd.read_csv('./static/data/diabetes_test.csv')
X_train = diabetes_train.iloc[:,:-1]
X_test = diabetes_test.iloc[:,:-1]
y_train = diabetes_train.iloc[:,-1]
y_test = diabetes_test.iloc[:,-1]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test= scaler.fit_transform(X_test)

In [58]:
diabetes_dtclf = DecisionTreeClassifier()
diabetes_svc = SVC()
diabetes_lr = LogisticRegression()
diabetes_knn = KNeighborsClassifier()
diabetes_rfclf = RandomForestClassifier()

In [59]:
params = {
    'ccp_alpha': [0 ,1, 2,3, 4, 5,6,7,8,9, 10],
    'min_samples_leaf': [1,2,3,4,5,6,7,8,9],
    'min_samples_split': [1,2, 3, 4, 5,6,7,8,9]
}
grid_cv = GridSearchCV(diabetes_dtclf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)
best_diabetes_dtclf = grid_cv.best_estimator_
pred = best_diabetes_dtclf.predict(X_test)
print(accuracy_score(y_test, pred))

최고 평균 정확도: 0.7014
최적 파라미터:  {'ccp_alpha': 0, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.7135416666666666


In [65]:
joblib.dump(best_diabetes_dtclf, 'static/model/best_diabetes_dtclf.pkl')

['static/model/best_diabetes_dtclf.pkl']

In [60]:
params = {
    'C': [7],
    'cache_size': [1,2,3,4,5,6,7,8,9,10],
    'degree': [1]
}
grid_cv = GridSearchCV(diabetes_svc, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)
best_diabetes_svc = grid_cv.best_estimator_
pred = best_diabetes_svc.predict(X_test)
accuracy_score(y_test, pred)

최고 평균 정확도: 0.7153
최적 파라미터:  {'C': 7, 'cache_size': 1, 'degree': 1}


0.7708333333333334

In [66]:
joblib.dump(best_diabetes_svc, 'static/model/best_diabetes_svc.pkl')

['static/model/best_diabetes_svc.pkl']

In [61]:
params = {
    'C': [5],
    'max_iter': [95,100,105],
}
grid_cv = GridSearchCV(diabetes_lr, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)
best_diabetes_lr = grid_cv.best_estimator_
pred = best_diabetes_lr.predict(X_test)
accuracy_score(y_test, pred)

최고 평균 정확도: 0.7535
최적 파라미터:  {'C': 5, 'max_iter': 95}


0.8125

In [67]:
joblib.dump(best_diabetes_lr, 'static/model/best_diabetes_lr.pkl')

['static/model/best_diabetes_lr.pkl']

In [62]:
params = {
    'leaf_size': [1,2,3,4,5,6,7,8,9,10],
    'n_neighbors': [8],
    'p': [5,6,7,8,9]
}
grid_cv = GridSearchCV(diabetes_knn, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)
best_diabetes_knn = grid_cv.best_estimator_
pred = best_diabetes_knn.predict(X_test)
accuracy_score(y_test, pred)

최고 평균 정확도: 0.7153
최적 파라미터:  {'leaf_size': 1, 'n_neighbors': 8, 'p': 7}


0.7708333333333334

In [68]:
joblib.dump(best_diabetes_knn, 'static/model/best_diabetes_knn.pkl')

['static/model/best_diabetes_knn.pkl']

In [64]:
params = {
    'ccp_alpha': [0],
    'min_samples_leaf': [1],
    'min_samples_split': [3],
    'n_estimators':[150,175,200]
}
grid_cv = GridSearchCV(diabetes_rfclf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)
best_diabetes_rfclf = grid_cv.best_estimator_
pred = best_diabetes_rfclf.predict(X_test)
accuracy_score(y_test, pred)

최고 평균 정확도: 0.7413
최적 파라미터:  {'ccp_alpha': 0, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}


0.7864583333333334

In [69]:
joblib.dump(best_diabetes_rfclf, 'static/model/best_diabetes_rfclf.pkl')

['static/model/best_diabetes_rfclf.pkl']