<a href="https://colab.research.google.com/github/Super-rookie-Py/_MachineLearning_Class/blob/main/%EB%B0%95%EA%B1%B4%EC%9A%B0_%EC%99%80%EC%9D%B8%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%A1%9C%EC%A7%80%EC%8A%A4%ED%8B%B1_%ED%9A%8C%EA%B7%80_%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 와인데이터 로지스틱 회귀모델

In [63]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import multiprocessing

### 정규화 과정을 거친 로지스틱회귀모델

In [108]:
X, y = load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 20)

model = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [109]:
cross_validate(
    estimator=model,
    X= X, y= y,
    cv= 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose=True
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.1s finished


{'fit_time': array([0.02232957, 0.0306201 , 0.01530313, 0.01574683, 0.01232195]),
 'score_time': array([0.00083375, 0.00087547, 0.00083876, 0.00090504, 0.00059032]),
 'test_score': array([0.97222222, 0.97222222, 1.        , 0.97142857, 1.        ])}

In [110]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 1.0
평가 데이터 점수: 0.9777777777777777


In [111]:
scores = cross_val_score(model, X, y, scoring="accuracy", cv = 10)
print("CV 평균 점수: {}".format(scores.mean()))

CV 평균 점수: 0.9833333333333332


### GridSearchCV 이용해 최적의 파라미터찾기

- 데이터 전처리

In [112]:
wine = load_wine()
wine_df = pd.DataFrame(data=wine.data, columns = wine.feature_names)
wine_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [113]:
scaler = StandardScaler()
wine_scaled = scaler.fit_transform(wine_df)
wine_df_scaled = pd.DataFrame(data=wine_scaled, columns = wine.feature_names)
wine_df_scaled.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,7.841418e-15,2.444986e-16,-4.059175e-15,-7.110417e-17,-2.4948830000000002e-17,-1.955365e-16,9.443133e-16,-4.178929e-16,-1.54059e-15,-4.129032e-16,1.398382e-15,2.126888e-15,-6.985673e-17
std,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821,1.002821
min,-2.434235,-1.432983,-3.679162,-2.671018,-2.088255,-2.107246,-1.695971,-1.868234,-2.069034,-1.634288,-2.094732,-1.895054,-1.493188
25%,-0.7882448,-0.6587486,-0.5721225,-0.6891372,-0.8244151,-0.8854682,-0.8275393,-0.7401412,-0.5972835,-0.7951025,-0.7675624,-0.9522483,-0.7846378
50%,0.06099988,-0.423112,-0.02382132,0.001518295,-0.1222817,0.09595986,0.1061497,-0.1760948,-0.06289785,-0.1592246,0.03312687,0.2377348,-0.2337204
75%,0.8361286,0.6697929,0.6981085,0.6020883,0.5096384,0.8089974,0.8490851,0.6095413,0.6291754,0.493956,0.7131644,0.7885875,0.7582494
max,2.259772,3.109192,3.156325,3.154511,4.371372,2.539515,3.062832,2.402403,3.485073,3.435432,3.301694,1.960915,2.971473


In [114]:
X_train, X_test, y_train, y_test = train_test_split(wine_df_scaled, wine.target, test_size = 0.2, random_state= 20)

In [115]:
param_grid = [{"penalty": ["l1", 'l2'],
               "C" : [0.01, 0.1, 0.15 ,0.2, 0.3, 0.4]}]

gs = GridSearchCV(estimator = LogisticRegression(), param_grid = param_grid,
                  cv = 10, scoring ="accuracy", n_jobs=multiprocessing.cpu_count())
gs.fit(wine_df_scaled, wine.target)

GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=2,
             param_grid=[{'C': [0.01, 0.1, 0.15, 0.2, 0.3, 0.4],
                          'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [116]:
print(gs.best_estimator_)
print("최적 점수: {}".format(gs.best_score_))
print("최적 파라미터: {}".format(gs.best_params_))
pd.DataFrame(gs.cv_results_)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
최적 점수: 0.9888888888888889
최적 파라미터: {'C': 0.1, 'penalty': 'l2'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001273,0.00093,0.0,0.0,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",,,,,,,,,,,,,7
1,0.012263,0.00163,0.002105,0.000468,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",1.0,1.0,0.944444,0.944444,1.0,0.944444,1.0,1.0,1.0,1.0,0.983333,0.025459,6
2,0.001032,0.000226,0.0,0.0,0.1,l1,"{'C': 0.1, 'penalty': 'l1'}",,,,,,,,,,,,,8
3,0.014433,0.002492,0.002065,0.00022,0.1,l2,"{'C': 0.1, 'penalty': 'l2'}",1.0,1.0,0.944444,0.944444,1.0,1.0,1.0,1.0,1.0,1.0,0.988889,0.022222,1
4,0.001714,0.002938,0.0,0.0,0.15,l1,"{'C': 0.15, 'penalty': 'l1'}",,,,,,,,,,,,,9
5,0.014703,0.005163,0.0019,0.000154,0.15,l2,"{'C': 0.15, 'penalty': 'l2'}",1.0,1.0,0.944444,0.944444,1.0,1.0,1.0,1.0,1.0,1.0,0.988889,0.022222,1
6,0.000918,0.000862,0.0,0.0,0.2,l1,"{'C': 0.2, 'penalty': 'l1'}",,,,,,,,,,,,,10
7,0.01549,0.003045,0.001947,0.000353,0.2,l2,"{'C': 0.2, 'penalty': 'l2'}",1.0,1.0,0.944444,0.944444,1.0,1.0,1.0,1.0,1.0,1.0,0.988889,0.022222,1
8,0.000587,7e-05,0.0,0.0,0.3,l1,"{'C': 0.3, 'penalty': 'l1'}",,,,,,,,,,,,,11
9,0.015809,0.002487,0.001998,0.001109,0.3,l2,"{'C': 0.3, 'penalty': 'l2'}",1.0,1.0,0.944444,0.944444,1.0,1.0,1.0,1.0,1.0,1.0,0.988889,0.022222,1


In [117]:
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty="l2", C = 0.1)
)
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [118]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 0.9929577464788732
평가 데이터 점수: 1.0


In [119]:
cross_validate(
    estimator=model,
    X= X, y= y,
    cv= 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose=True
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.1s finished


{'fit_time': array([0.01600838, 0.01966238, 0.01165557, 0.01234651, 0.01227832]),
 'score_time': array([0.0009737 , 0.00099826, 0.00087333, 0.00098467, 0.00086474]),
 'test_score': array([0.97222222, 0.94444444, 1.        , 1.        , 1.        ])}

In [120]:
scores = cross_val_score(model, X, y, scoring="accuracy", cv = 10)
print("CV 평균 점수: {}".format(scores.mean()))

CV 평균 점수: 0.9888888888888889


### 경사하강법을 이용한 로지스틱회귀모델

In [121]:
from sklearn.linear_model import SGDClassifier

In [122]:
X, y = load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 20)

In [123]:
model = make_pipeline(StandardScaler(), SGDClassifier(loss = "log"))
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='log',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False)

In [124]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 1.0
평가 데이터 점수: 0.9777777777777777


In [125]:
cross_validate(
    estimator=model,
    X= X, y= y,
    cv= 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose=True
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished


{'fit_time': array([0.01115727, 0.01003289, 0.00570416, 0.00526619, 0.00320697]),
 'score_time': array([0.00070858, 0.00102949, 0.00062609, 0.00065303, 0.0004487 ]),
 'test_score': array([0.97222222, 1.        , 0.97222222, 0.94285714, 1.        ])}

In [127]:
scores = cross_val_score(model, X, y, scoring="accuracy", cv = 10)
print("CV 평균 점수: {}".format(scores.mean()))

CV 평균 점수: 0.9833333333333334


### SVC모델

In [129]:
from sklearn.svm import SVC

In [130]:
X, y = load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 20)

In [131]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [132]:
model = SVC(kernel='rbf')
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [133]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 0.9924812030075187
평가 데이터 점수: 1.0


In [134]:
estimator = make_pipeline(StandardScaler(), SVC(kernel="rbf"))

cross_validate(
    estimator=estimator,
    X= X, y= y,
    cv= 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose=True
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


{'fit_time': array([0.00413537, 0.00390434, 0.00339437, 0.00303912, 0.00287104]),
 'score_time': array([0.00110459, 0.00104451, 0.0009346 , 0.00091362, 0.00078464]),
 'test_score': array([1.        , 0.97222222, 0.94444444, 1.        , 1.        ])}

### 하이퍼 파라미터 값 변경

In [139]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("model", SVC(kernel='rbf'))])

param_grid = [{'model__gamma': ['scale', 'auto'],
               "model__C": [1.0, 0.1, 0.01],
              "model__degree": [1, 2, 3]}]

gs = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    n_jobs = multiprocessing.cpu_count(),
    cv = 5,
    verbose=True
)

gs.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  90 out of  90 | elapsed:    0.4s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('model',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.0

In [140]:
print(gs.best_estimator_)
print("최적 점수: {}".format(gs.best_score_))
print("최적 파라미터: {}".format(gs.best_params_))

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=1,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
최적 점수: 0.9833333333333334
최적 파라미터: {'model__C': 1.0, 'model__degree': 1, 'model__gamma': 'scale'}


In [141]:
model = gs.best_estimator_
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=1,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [143]:
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 0.9924812030075187
평가 데이터 점수: 1.0


In [145]:
cross_validate(
    estimator=model,
    X= X, y= y,
    cv= 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose=True
)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished


{'fit_time': array([0.01114297, 0.00875664, 0.00800133, 0.00310898, 0.0024147 ]),
 'score_time': array([0.00114775, 0.00100422, 0.00089145, 0.00094485, 0.00063348]),
 'test_score': array([1.        , 0.97222222, 0.94444444, 1.        , 1.        ])}