# 모델 생성

In [1]:
import pandas as pd
import joblib
import sklearn.datasets as sd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 유방암 데이터

In [2]:
df_train = pd.read_csv('../static/data/cancer_train.csv')
df_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,...,31.64,143.7,1226.0,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019,0
1,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,...,31.68,129.7,1175.0,0.1395,0.3055,0.2992,0.1312,0.348,0.07619,0
2,22.27,19.67,152.8,1509.0,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,...,28.01,206.8,2360.0,0.1701,0.6997,0.9608,0.291,0.4055,0.09789,0
3,11.76,18.14,75.0,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,...,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915,0
4,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,...,34.69,135.1,1320.0,0.1315,0.1806,0.208,0.1136,0.2504,0.07948,0


### 결정 트리

In [3]:
X_train = df_train.drop(columns='target', axis=1)
y_train = df_train.target.values
X_train.shape, y_train.shape

((426, 30), (426,))

In [4]:
df_test = pd.read_csv('../static/data/cancer_test.csv')
X_test = df_test.drop(columns='target', axis=1)
y_test = df_test.target.values

In [5]:
dtc = DecisionTreeClassifier()
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [6]:
params = {
    'max_depth': [2, 3, 4, 5, 7],
    'min_samples_split': [2, 3, 4, 5]
}

In [7]:
grid_cv = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9272
최적 파라미터: {'max_depth': 3, 'min_samples_split': 4}


In [8]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9440559440559441

In [9]:
joblib.dump(best_dt, '../static/model/cancer_dt.pkl')

['../static/model/cancer_dt.pkl']

### 정규화

In [10]:
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(df_train.drop(columns='target', axis=1))
X_test_sc = scaler.fit_transform(df_test.drop(columns='target', axis=1))

### SVM

In [11]:
svc = SVC()
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [24]:
params = {'C': [6, 7, 8]}

In [25]:
grid_cv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_sc, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9766
최적 파라미터: {'C': 8}


In [26]:
best_svc = grid_cv.best_estimator_
pred = best_svc.predict(X_test_sc)
accuracy_score(y_test, pred)

0.8881118881118881

In [27]:
joblib.dump(best_svc, '../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

### Logistic Regression

In [28]:
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [33]:
params = {'C': [5, 6, 7, 8]}

In [34]:
grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train_sc, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9718
최적 파라미터: {'C': 6}


In [35]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_sc)
accuracy_score(y_test, pred)

0.9300699300699301

In [36]:
joblib.dump(best_lr, '../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']

## 피마 원주민

In [2]:
import pandas as pd
import numpy as np

pima = pd.read_csv('../static/data/pima/diabetes.csv')
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
pima[zero_features] = pima[zero_features].replace(0, pima[zero_features].mean())

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(pima, pima.Outcome, test_size=0.25, random_state=2021)

In [6]:
X_train_df = pd.DataFrame(X_train, columns=pima.columns)
X_test_df = pd.DataFrame(X_test, columns=pima.columns)
X_train_df.to_csv('../static/data/pima_train.csv')
X_test_df.to_csv('../static/data/pima_test.csv')

In [12]:
df_train = pd.read_csv('../static/data/pima_train.csv')
y_train = df_train.Outcome.values
X_train = scaler.fit_transform(df_train.drop('Outcome', axis=1))
df_test = pd.read_csv('../static/data/pima_test.csv')
y_test = df_test.Outcome.values
X_test = scaler.fit_transform(df_test.drop('Outcome', axis=1))
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((576, 9), (576,), (192, 9), (192,))

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
sv = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

In [14]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [15]:
params = {'C': [0.4, 0.5, 0.6]}

grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'C': 0.5}
최고 정확도: 0.7725


In [16]:
from sklearn.metrics import accuracy_score
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test)
accuracy_score(y_test, pred)

0.7708333333333334

In [17]:
joblib.dump(best_lr, '../static/model/pima_lr.pkl')

['../static/model/pima_lr.pkl']

In [18]:
sv.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [21]:
params = {'C': [0.1, 0.2, 0.3, 0.4]}

grid_cv = GridSearchCV(sv, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'C': 0.3}
최고 정확도: 0.7691


In [22]:
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test)
accuracy_score(y_test, pred)

0.734375

In [23]:
joblib.dump(best_sv, '../static/model/pima_sv.pkl')

['../static/model/pima_sv.pkl']

In [24]:
dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [25]:
params = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2, 3, 4]
}

grid_cv = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 3}
최고 정확도: 0.7638


In [26]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.609375

In [27]:
joblib.dump(best_dt, '../static/model/pima_dt.pkl')

['../static/model/pima_dt.pkl']

In [28]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [29]:
params = {
    'max_depth': [8, 10, 12],
    'min_samples_leaf': [2, 4, 6],
    'min_samples_split': [5, 7, 9]
}

grid_cv = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 7}
최고 정확도: 0.7778


In [30]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.7604166666666666

In [31]:
joblib.dump(best_rf, '../static/model/pima_rf.pkl')

['../static/model/pima_rf.pkl']

## 타이타닉

In [1]:
import pandas as pd
import numpy as np
titanic_train = pd.read_csv('../static/data/titanic/train.csv')

In [2]:
titanic_train['sex'] = np.nan
for i in titanic_train.index:
    titanic_train['sex'][i] = 0 if titanic_train['Sex'][i] == 'female' else 1

In [3]:
train_df = pd.DataFrame({
    'Pclass': titanic_train.Pclass,
    'Sex': titanic_train.sex,
    'Age': titanic_train.Age,
    'SibSp': titanic_train.SibSp,
    'Parch': titanic_train.Parch,
    'Fare': titanic_train.Fare,
    'Embarked': titanic_train.Embarked,
    'Survived': titanic_train.Survived
})
train_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,1.0,22.0,1,0,7.25,S,0
1,1,0.0,38.0,1,0,71.2833,C,1
2,3,0.0,26.0,0,0,7.925,S,1
3,1,0.0,35.0,1,0,53.1,S,1
4,3,1.0,35.0,0,0,8.05,S,0


In [4]:
train_grouped = train_df[['Pclass', 'Fare']].groupby(['Pclass'], as_index=False).mean()
train_grouped

Unnamed: 0,Pclass,Fare
0,1,84.154687
1,2,20.662183
2,3,13.67555


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    float64
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
 7   Survived  891 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 55.8+ KB


In [6]:
for i in train_df.index:
    train_df['Age'][i] = train_df['Age'][i] * 100 if train_df['Age'][i] < 1 else train_df['Age'][i]
train_df['Age'] = train_df['Age'].fillna(round(train_df['Age'].mean(), 0))
train_df.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
886,2,1.0,27.0,0,0,13.0,S,0
887,1,0.0,19.0,0,0,30.0,S,1
888,3,0.0,30.0,1,2,23.45,S,0
889,1,1.0,26.0,0,0,30.0,C,1
890,3,1.0,32.0,0,0,7.75,Q,0


In [7]:
train_df['Embarked'] = train_df['Embarked'].fillna('S')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    float64
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
 7   Survived  891 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 55.8+ KB


In [8]:
train_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,30.333333,0.523008,0.381594,32.204208,0.383838
std,0.836071,0.47799,13.385112,1.102743,0.806057,49.693429,0.486592
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,3.0,1.0,30.0,0.0,0.0,14.4542,0.0
75%,3.0,1.0,36.0,1.0,0.0,31.0,1.0
max,3.0,1.0,92.0,8.0,6.0,512.3292,1.0


In [9]:
for i in train_df.index:
    if (train_df.Pclass[i] == 1) & (train_df.Fare[i] == 0):
        train_df.Fare[i] = train_grouped['Fare'][0]
    elif (train_df.Pclass[i] == 2) & (train_df.Fare[i] == 0):
        train_df.Fare[i] = train_grouped['Fare'][1]
    elif (train_df.Pclass[i] == 3) & (train_df.Fare[i] == 0):
        train_df.Fare[i] = train_grouped['Fare'][2]

In [10]:
train_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,30.333333,0.523008,0.381594,32.87699,0.383838
std,0.836071,0.47799,13.385112,1.102743,0.806057,49.690114,0.486592
min,1.0,0.0,1.0,0.0,0.0,4.0125,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.925,0.0
50%,3.0,1.0,30.0,0.0,0.0,14.5,0.0
75%,3.0,1.0,36.0,1.0,0.0,31.275,1.0
max,3.0,1.0,92.0,8.0,6.0,512.3292,1.0


In [11]:
for i in train_df.index:
    if train_df.Embarked[i] == 'S':
        train_df.Embarked[i] = 0
    elif train_df.Embarked[i] == 'C':
        train_df.Embarked[i] = 1
    else:
        train_df.Embarked[i] = 2

In [12]:
train_df.to_csv('../static/data/titanic_train.csv')

### 학습

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
sv = SVC()

X_train, X_test, y_train, y_test = train_test_split(train_df, train_df.Survived, test_size=0.25, random_state=2021)

In [15]:
X_train_df = pd.DataFrame(X_train, columns=train_df.columns)
X_test_df = pd.DataFrame(X_test, columns=train_df.columns)
X_train_df.to_csv('../static/data/titanic_train.csv')
X_test_df.to_csv('../static/data/titanic_test.csv')

In [17]:
df_train = pd.read_csv('../static/data/titanic_train.csv')
y_train = df_train.Survived.values
X_train = df_train.drop('Survived', axis=1)
df_test = pd.read_csv('../static/data/titanic_test.csv')
y_test = df_test.Survived.values
X_test = df_test.drop('Survived', axis=1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((668, 8), (668,), (223, 8), (223,))

In [18]:
print(lr.get_params())
print(dt.get_params())
print(rf.get_params())
print(sv.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': 'deprecated', 'random_state': None, 'splitter': 'best'}
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verb

In [19]:
params = {'C': [0.1, 0.5, 0.7]}

grid_cv = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'C': 0.1}
최고 정확도: 0.7995


In [20]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test)
accuracy_score(y_test, pred)

0.7623318385650224

In [21]:
import joblib
joblib.dump(best_lr, '../static/model/titanic_lr.pkl')

['../static/model/titanic_lr.pkl']

In [22]:
params = {'C': [90, 100, 110, 120]}

grid_cv = GridSearchCV(sv, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'C': 100}
최고 정확도: 0.6752


In [23]:
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test)
accuracy_score(y_test, pred)

0.6636771300448431

In [24]:
joblib.dump(best_sv, '../static/model/titanic_sv.pkl')

['../static/model/titanic_sv.pkl']

In [26]:
params = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [1, 2, 3, 4]
}

grid_cv = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
최고 정확도: 0.8144


In [27]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.7802690582959642

In [28]:
joblib.dump(best_dt, '../static/model/titanic_dt.pkl')

['../static/model/titanic_dt.pkl']

In [29]:
params = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [1, 2, 3, 4]
}

grid_cv = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3}
최고 정확도: 0.8353


In [30]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.7982062780269058

In [31]:
joblib.dump(best_rf, '../static/model/titanic_rf.pkl')

['../static/model/titanic_rf.pkl']

## 붓꽃

In [32]:
iris_train = pd.read_csv('../static/data/iris_train.csv')
iris_test = pd.read_csv('../static/data/iris_test.csv')

In [33]:
X_train = iris_train.drop('target', axis=1)
y_train = iris_train.target.values
X_train.shape, y_train.shape

((112, 4), (112,))

In [34]:
X_test = iris_test.drop('target', axis=1)
y_test = iris_test.target.values
X_test.shape, y_test.shape

((38, 4), (38,))

In [35]:
from sklearn.neighbors import KNeighborsClassifier

dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
kn = KNeighborsClassifier()

In [36]:
dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [37]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [38]:
kn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [39]:
params = {
    'max_depth': range(2, 10),
    'min_samples_leaf': range(1, 5),
    'min_samples_split': range(1, 5)
}

grid_cv = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 3}
최고 정확도: 0.9553


In [40]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9473684210526315

In [41]:
joblib.dump(best_dt, '../static/model/iris_dt.pkl')

['../static/model/iris_dt.pkl']

In [43]:
params = {
    'max_depth': range(1, 5),
    'min_samples_leaf': range(1, 5),
    'min_samples_split': range(2, 11)
}

grid_cv = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 4}
최고 정확도: 0.9640


In [44]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.9473684210526315

In [45]:
joblib.dump(best_rf, '../static/model/iris_rf.pkl')

['../static/model/iris_rf.pkl']

In [46]:
params = {'n_neighbors': range(3, 10)}

grid_cv = GridSearchCV(kn, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'n_neighbors': 4}
최고 정확도: 0.9731


In [47]:
best_kn = grid_cv.best_estimator_
pred = best_kn.predict(X_test)
accuracy_score(y_test, pred)

0.9736842105263158

In [48]:
joblib.dump(best_kn, '../static/model/iris_kn.pkl')

['../static/model/iris_kn.pkl']

## 와인

In [49]:
wine_train = pd.read_csv('../static/data/wine_train.csv')
wine_test = pd.read_csv('../static/data/wine_test.csv')

In [50]:
X_train = wine_train.drop('target', axis=1)
y_train = wine_train.target.values
X_test = wine_test.drop('target', axis=1)
y_test = wine_test.target.values
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((133, 13), (45, 13), (133,), (45,))

In [51]:
params = {
    'max_depth': range(2, 10),
    'min_samples_leaf': range(1, 5),
    'min_samples_split': range(1, 5)
}

grid_cv = GridSearchCV(dt, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4}
최고 정확도: 0.9402


In [52]:
best_dt = grid_cv.best_estimator_
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9111111111111111

In [53]:
joblib.dump(best_dt, '../static/model/wine_dt.pkl')

['../static/model/wine_dt.pkl']

In [54]:
params = {
    'max_depth': range(1, 5),
    'min_samples_leaf': range(1, 5),
    'min_samples_split': range(2, 11)
}

grid_cv = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 5}
최고 정확도: 0.9852


In [55]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.9555555555555556

In [56]:
joblib.dump(best_rf, '../static/model/wine_rf.pkl')

['../static/model/wine_rf.pkl']

In [57]:
params = {'n_neighbors': range(3, 10)}

grid_cv = GridSearchCV(kn, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'n_neighbors': 8}
최고 정확도: 0.7063


In [58]:
best_kn = grid_cv.best_estimator_
pred = best_kn.predict(X_test)
accuracy_score(y_test, pred)

0.7111111111111111

In [59]:
joblib.dump(best_kn, '../static/model/wine_kn.pkl')

['../static/model/wine_kn.pkl']