In [None]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [None]:
# 1. Load the iris dataset with scikit-learn and store it in a pandas DataFrame
#    (capture the result and upload it).

iris = load_iris()
# Four feature columns named after `iris.feature_names`, plus the integer class
# label appended as `target`.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)
iris_df.head()

In [None]:
# 2. Draw the four features as box plots in a single Plotly Express figure
#    (capture the result and upload it).

# Select the features by name instead of by position so the plot does not
# depend on where the `target` column sits in the frame.
fig = px.box(
    iris_df[iris.feature_names],
    title='Iris features (raw measurements)',
    labels={'variable': 'feature', 'value': 'measurement (cm)'},
)
fig.show()

In [None]:
# 3. Apply StandardScaler and store the scaled data in a separate pandas DataFrame
#    (capture the result and upload it).

# The raw frame is rebuilt here WITHOUT the `target` column. The later cell
# that does `x = iris_df` uses the frame as a pure feature matrix and relies
# on this.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

ss = StandardScaler()
scaled_values = ss.fit_transform(iris_df)

# Scaled features under their original names, with the class label appended.
iris_ss = pd.DataFrame(scaled_values, columns=iris.feature_names).assign(target=iris.target)
iris_ss.head()


In [None]:
# 4. Draw the four standardized features as box plots in a single Plotly Express
#    figure (capture the result and upload it).

# Select the features by name; the title and axis labels make this figure
# distinguishable from the raw-data box plot when the notebook is skimmed.
fig = px.box(
    iris_ss[iris.feature_names],
    title='Iris features after StandardScaler',
    labels={'variable': 'feature', 'value': 'standardized value'},
)
fig.show()


In [None]:
# 5. Split the standardized data 8:2 into train and test sets
#    (capture the result and upload it).

y = iris_ss['target']
x = iris_ss.drop(columns='target')  # the four scaled feature columns

# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
    stratify=y,
)

# Sizes of x_train, x_test, y_train, y_test, in that order.
print(*map(len, (x_train, x_test, y_train, y_test)), sep=' , ')

In [None]:
# 6. Fit RandomForestClassifier, DecisionTreeClassifier, LogisticRegression and
#    kNN on the train data and report each model's accuracy on the test data.
#    Hyperparameters are chosen freely. (Capture the result and upload it.)

x = iris_ss.drop(columns='target')
y = iris_ss['target']

# Note: the data is re-split here with random_state=50, so this is not the
# exact split printed in step 5 (which used random_state=5).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=50, stratify=y
)

# Display label -> unfitted estimator. Labels are reused verbatim in the output.
# NOTE(review): recent scikit-learn releases reportedly deprecate
# solver='liblinear' for multiclass targets — confirm against the installed version.
models = {
    'RandomForest': RandomForestClassifier(random_state=50, n_estimators=100),
    'DecisionTree': DecisionTreeClassifier(max_depth=2, random_state=50),
    'LogisticRegression': LogisticRegression(solver='liblinear', random_state=50),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
}

for name, model in models.items():
    model.fit(x_train, y_train)
    # accuracy_score's signature is (y_true, y_pred); use that order for every model.
    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    # Pad the label to 25 characters so the colons line up.
    print(f"{name + ' Score':<25}:", test_accuracy)

In [None]:
# 7. For the setup of step 6, collect the train and test accuracy of every model
#    in a pandas DataFrame (capture the result and upload it).

x = iris_ss.drop(columns='target')
y = iris_ss['target']

# Use exactly the split of step 6 (stratified, random_state=50) and the same
# model seeds, so the 'Test' row matches the accuracies printed there.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=50, stratify=y
)

models = {
    'RandomForest': RandomForestClassifier(random_state=50, n_estimators=100),
    'DecisionTree': DecisionTreeClassifier(max_depth=2, random_state=50),
    'LogisticRegression': LogisticRegression(solver='liblinear', random_state=50),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
}

# model label -> {'Train': accuracy on x_train, 'Test': accuracy on x_test}
accuracy_by_model = {}
for name, model in models.items():
    model.fit(x_train, y_train)
    accuracy_by_model[name] = {
        'Train': accuracy_score(y_train, model.predict(x_train)),
        'Test': accuracy_score(y_test, model.predict(x_test)),
    }

# Outer keys become columns (models), inner keys become the index (Train/Test).
scores_df = pd.DataFrame(accuracy_by_model)

scores_df

In [None]:
# 8. For the setup of step 6, run 5-fold KFold cross-validation and report the
#    mean and standard deviation of the scores per model
#    (capture the result and upload it).

x = iris_ss.drop(columns='target')
y = iris_ss['target']

# Same estimators and seeds as step 6.
models = {
    'Random Forest': RandomForestClassifier(random_state=50, n_estimators=100),
    'Decision Tree': DecisionTreeClassifier(max_depth=2, random_state=50),
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=50),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
}

# The iris rows come grouped by class, so contiguous (unshuffled) folds would
# test on blocks dominated by one species. Shuffle with a fixed seed so the
# folds are mixed and every model is scored on the same reproducible partition.
kfold = KFold(n_splits=5, shuffle=True, random_state=50)

# With no `scoring` argument cross_val_score uses the estimator's own score
# method, which is accuracy for these classifiers.
fold_scores = {
    name: cross_val_score(model, x, y, cv=kfold)
    for name, model in models.items()
}

fold_labels = [f'Fold {i + 1}' for i in range(kfold.get_n_splits())]
scores_df = pd.DataFrame.from_dict(fold_scores, orient='index', columns=fold_labels)

# Compute both summaries from the fold columns only, so 'Mean' never feeds
# into 'Std'. pandas' std() is the sample standard deviation (ddof=1).
scores_df['Mean'] = scores_df[fold_labels].mean(axis=1)
scores_df['Std'] = scores_df[fold_labels].std(axis=1)
scores_df

In [None]:
# 9. Compare the per-model CV scores from step 8 with box plots. Which model do
#    you think is best? Write a brief answer.
# TODO: the short written answer requested above is still missing; add it once
#       the figure has been inspected.

# Rows = folds, columns = models. Pick the fold columns by label so the
# 'Mean'/'Std' summary columns can never end up in the box plots, whatever the
# number of folds.
transpose_df = scores_df.filter(like='Fold').T
fig = px.box(
    transpose_df,
    title='Cross-validation accuracy by model',
    labels={'variable': 'model', 'value': 'accuracy'},
)
fig.show()

In [None]:
# 10. Show the per-fold scores from step 9, organised by model, as a pandas
#     DataFrame (capture the result and upload it).
scores_df

In [None]:
# 11. Starting again from the original data, split it into test and train sets,
#     then build a pipeline that chains StandardScaler and a decision tree.
#     Cross-validate that pipeline and show the scores (capture the result).
# NOTE(review): the task statement asks for a train/test split first; this cell
# cross-validates on the full data set without splitting.

# Build the feature matrix directly from the loaded dataset. `iris_df` gains or
# loses a `target` column depending on which cell touched it last, and using it
# as-is would leak the label into the features whenever that column is present.
x = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(max_depth=2, random_state=50)),
]
pipe = Pipeline(estimators)

# The scaler is refit inside every fold because it is part of the pipeline.
cv_results_tree = cross_validate(pipe, x, y, scoring='accuracy', cv=5)

# One row per fold: fit_time, score_time, test_score.
pd.DataFrame(cv_results_tree)


In [None]:
# 12. In the setup of step 11, put each of the models from step 6 into its own
#     pipeline, cross-validate every pipeline and show the scores
#     (capture the result).

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

x = iris_df.drop(columns='target')
y = iris_df['target']
# NOTE(review): the hold-out split is created as the task statement asks, but
# the cross-validation below runs on the full x / y, not on x_train / y_train.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

classifiers = {
    'Random Forest': RandomForestClassifier(random_state=50, n_estimators=100),
    'Decision Tree': DecisionTreeClassifier(max_depth=2, random_state=50),
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=50),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
}

# Every classifier gets its own scaler so that scaling is refit per CV fold.
pipelines = {
    name: Pipeline([('scaler', StandardScaler()), ('clf', clf)])
    for name, clf in classifiers.items()
}

# cv=5 is passed as an integer; per the scikit-learn docs this selects
# stratified 5-fold splitting for classifiers. The unused, unshuffled KFold
# object the cell used to create was removed because it never took part in the scoring.
cv_results = {
    name: cross_validate(pipe, x, y, scoring='accuracy', cv=5)
    for name, pipe in pipelines.items()
}

# Rows = models, columns = folds.
scores_df = pd.DataFrame.from_dict(
    {name: result['test_score'] for name, result in cv_results.items()},
    orient='index',
)
scores_df.columns = [f'Fold {i + 1}' for i in range(scores_df.shape[1])]
scores_df

In [None]:
# 13. Put DecisionTree, RandomForest and kNN classifiers into pipelines and use
#     GridSearchCV to find the best model and parameters
#     (capture the result and upload it).

# Build x / y without mutating `iris_df`, so the cell gives the same result
# whether or not an earlier cell left a `target` column on the frame.
x = iris_df[iris.feature_names]
y = pd.Series(iris.target, index=iris_df.index, name='target')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

# Seed the stochastic estimators; without a random_state the selected
# parameters and the reported accuracies can change from run to run.
pipe_rf = Pipeline([('scaler', StandardScaler()), ('clf', RandomForestClassifier(random_state=50))])
pipe_tree = Pipeline([('scaler', StandardScaler()), ('clf', DecisionTreeClassifier(random_state=50))])
pipe_knn = Pipeline([('scaler', StandardScaler()), ('clf', KNeighborsClassifier())])

# Grid keys use the '<step name>__<parameter>' form to reach the 'clf' step.
param_grid_rf = {
    'clf__max_depth': [2, 5, 10, 15, 20],
    'clf__n_estimators': [50, 100, 200],
    'clf__min_samples_split': [6, 8, 12],
    'clf__min_samples_leaf': [6, 8, 12],
}
param_grid_tree = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [2, 5, 10, 15, 20],
    'clf__min_samples_split': [6, 8, 12],
    'clf__min_samples_leaf': [6, 8, 12],
}
param_grid_knn = {
    'clf__n_neighbors': [3, 5, 7, 9],
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# The searches only see the training split; x_test / y_test stay untouched
# until the final evaluation below. n_jobs=-1 spreads the candidate fits over
# all available cores (the random-forest grid alone has 135 candidates x 5 folds).
grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, scoring='accuracy', cv=5,
                       return_train_score=True, n_jobs=-1)
grid_rf.fit(x_train, y_train)
grid_tree = GridSearchCV(pipe_tree, param_grid=param_grid_tree, scoring='accuracy', cv=5,
                         return_train_score=True, n_jobs=-1)
grid_tree.fit(x_train, y_train)
grid_knn = GridSearchCV(pipe_knn, param_grid=param_grid_knn, scoring='accuracy', cv=5,
                        return_train_score=True, n_jobs=-1)
grid_knn.fit(x_train, y_train)

# best_estimator_ is the winning pipeline refit on the whole training split.
best_rf_estimator = grid_rf.best_estimator_
best_tree_estimator = grid_tree.best_estimator_
best_knn_estimator = grid_knn.best_estimator_

# Hold-out accuracy of each winning pipeline.
accuracy_grid_rf = accuracy_score(y_test, best_rf_estimator.predict(x_test))
accuracy_grid_tree = accuracy_score(y_test, best_tree_estimator.predict(x_test))
accuracy_grid_knn = accuracy_score(y_test, best_knn_estimator.predict(x_test))

In [None]:
# Print each tuned pipeline followed by its accuracy on the held-out test split,
# in the order: random forest, decision tree, kNN.
tuned_results = (
    (best_rf_estimator, accuracy_grid_rf),
    (best_tree_estimator, accuracy_grid_tree),
    (best_knn_estimator, accuracy_grid_knn),
)
for tuned_pipeline, holdout_accuracy in tuned_results:
    print(tuned_pipeline, holdout_accuracy)