Logistic Regression

분류 문제는 0 또는 1로 예측해야 하지만, Linear Regression은 예측 함수 h(x)가 0보다 작거나 1보다 큰 값을 가질 수 있다.\
따라서 h(x)가 항상 0과 1 사이의 값을 갖도록 Hypothesis 함수를 수정하고 모델을 비교해 본다.

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [None]:
# Source CSVs for the red and white wine-quality tables (semicolon-separated).
red_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-white.csv'

# Tag each table with a numeric colour flag (red = 1.0, white = 0.0)
# before stacking them into a single frame.
red_wine = pd.read_csv(red_url, sep=';').assign(color=1.)
white_wine = pd.read_csv(white_url, sep=';').assign(color=0.)
wine = pd.concat([red_wine, white_wine])

# Binary target: 1.0 when quality is above 5, otherwise 0.0.
wine['taste'] = (wine['quality'] > 5).astype(float).to_numpy()

# 'quality' is dropped together with 'taste' because the label is derived from it.
x = wine.drop(columns=['taste', 'quality'])
y = wine['taste']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5
)

# Baseline: logistic regression fitted directly on the unscaled features.
wine_lr = LogisticRegression(solver='liblinear', random_state=14)
wine_lr.fit(x_train, y_train)

y_pred_tr = wine_lr.predict(x_train)
y_pred_test = wine_lr.predict(x_test)
print('Train Acc    : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc     : ', accuracy_score(y_test, y_pred_test))

In [None]:
# Same classifier as the baseline, but with the features standardised first
# so both steps are fitted together on the training split.
estimators = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', random_state=14)),
]
pipe = Pipeline(estimators)
pipe.fit(x_train, y_train)

# Accuracy on the training split and on the held-out split.
y_pred_tr = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)
print('Train Acc    : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc     : ', accuracy_score(y_test, y_pred_test))

In [None]:
# Depth-2 decision tree trained on the same split, used as a comparison model.
# fit() returns the estimator itself, so construction and fitting can be chained.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=5).fit(x_train, y_train)

# Display name -> fitted model, consumed when the ROC curves are drawn.
models = {
    'logistic regression': pipe,
    'decision tree': wine_tree,
}

In [None]:
# Draw the ROC curve of every model in `models` on one figure.
plt.figure(figsize=(10, 8))

# Diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1])

for model_name, model in models.items():
    # Column 1 of predict_proba is the probability of the positive class.
    pred = model.predict_proba(x_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_test, pred)
    plt.plot(fpr, tpr, label=model_name)

plt.grid()
plt.legend()
plt.show()

In [None]:
# Load the diabetes CSV and cast every column to float.
PIMA_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/diabetes.csv'

PIMA = pd.read_csv(PIMA_url).astype('float')

# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(12, 10))
sns.heatmap(
    PIMA.corr(),
    annot=True,
    fmt='.4f',
    cmap="YlGnBu",
)
plt.show()

In [None]:
# Columns in which a value of 0 is replaced by the column mean.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']

# NOTE: the means are taken before the replacement, so any zero entries
# still contribute to the value that is substituted for them.
zero_feature_means = PIMA[zero_features].mean()
PIMA[zero_features] = PIMA[zero_features].replace(0, zero_feature_means)

# Per-column count of the zeros that remain (displayed as the cell output).
PIMA.eq(0).astype(int).sum()


In [None]:

# Features are every column except 'Outcome'; 'Outcome' is the target.
x = PIMA.drop(['Outcome'], axis=1)
y = PIMA['Outcome']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

# Standardise the features, then fit logistic regression.
estimators = [('scaler', StandardScaler()), ('clf', LogisticRegression(solver='liblinear', random_state=14))]
pipe = Pipeline(estimators)
pipe.fit(x_train, y_train)
y_pred_tr = pipe.predict(x_train)
y_pred_test = pipe.predict(x_test)

# ROC AUC measures how well the model ranks samples, so it must be computed
# from a continuous score. Feeding it the hard 0/1 predictions evaluates only
# the single 0.5 cut-off instead of the whole curve.
y_score_test = pipe.predict_proba(x_test)[:, 1]

# Threshold-based metrics use the predicted labels; AUC uses the probabilities.
print('Accuracy     : ', accuracy_score(y_test, y_pred_test))
print('Recall       : ', recall_score(y_test, y_pred_test))
print('Precision    : ', precision_score(y_test, y_pred_test))
print('AUC score    : ', roc_auc_score(y_test, y_score_test))
print('F1 score     : ', f1_score(y_test, y_pred_test))

In [None]:
# Coefficients of the fitted logistic-regression step (first row of coef_),
# paired below with the training column names in the same order.
fitted_clf = pipe.named_steps['clf']
coeff = list(fitted_clf.coef_[0])
labels = x_train.columns.tolist()

# Bare expression so the notebook displays the feature names.
labels

In [None]:
# One row per feature with its coefficient, ordered from the most negative to
# the most positive, flagged by sign and indexed by feature name.
features = (
    pd.DataFrame({'Features': labels, 'importance': coeff})
    .sort_values(by=['importance'], ascending=True)
    .assign(positive=lambda frame: frame['importance'] > 0)
    .set_index('Features')
)

# Horizontal bars: blue for positive coefficients, red otherwise.
bar_colors = features['positive'].map({True: 'blue', False: 'red'})
features['importance'].plot(kind='barh', figsize=(11, 6), color=bar_colors)
plt.xlabel('Importance')
plt.show()