# 피마 인디언 모델 만들기

In [1]:
import pandas as pd 
import joblib

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### 데이터셋

In [3]:
# Read the training split from CSV and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='../static/data/pima_train.csv')
df_train.head(n=5)

FileNotFoundError: [Errno 2] File ../static/data/pima_train.csv does not exist: '../static/data/pima_train.csv'

In [4]:
# Every column except the last is a feature; the last column is the target.
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
(X_train.shape, y_train.shape)

((576, 8), (576,))

In [5]:
# Read the held-out test split from CSV and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='../static/data/pima_test.csv')
df_test.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,7,133.0,84.0,20.536458,79.799479,40.2,0.696,37,0
1,2,90.0,70.0,17.0,79.799479,27.3,0.085,22,0
2,8,194.0,80.0,20.536458,79.799479,26.1,0.551,67,0
3,1,83.0,68.0,20.536458,79.799479,18.2,0.624,27,0
4,6,125.0,68.0,30.0,120.0,30.0,0.464,32,0


In [6]:
# Same layout as the training split: features first, target in the last column.
X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1].values
(X_test.shape, y_test.shape)

((192, 8), (192,))

In [7]:
# Standardize features using statistics learned from the training split only.
scaler = StandardScaler()
# Fit on the NumPy feature matrix rather than a DataFrame slice: every later
# transform() call in this notebook passes plain arrays, and a scaler fitted
# on a DataFrame records column names and warns on array input
# (scikit-learn >= 1.0). The learned mean/scale values are the same.
scaler.fit(X_train)

StandardScaler()

In [8]:
# Apply the already-fitted scaler to both splits (train first, then test).
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
(X_train_scaled.shape, X_test_scaled.shape)

((576, 8), (192, 8))

In [9]:
# Persist the fitted scaler so the same transformation can be reloaded later.
joblib.dump(value=scaler, filename='../static/model/pima_scaler.pkl')

['../static/model/pima_scaler.pkl']

### 1. 로지스틱 회귀

In [10]:
# Baseline logistic regression with library defaults; list its
# hyperparameters to see what can be tuned.
lr_clf = LogisticRegression()
lr_clf.get_params(deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [11]:
# Search grid for the regularization parameter C: the integers 3..7.
# An alternative grid kept for reference (presumably an earlier, coarser
# search): [0.1, 1, 5, 10].
params = {'C': list(range(3, 8))}

In [12]:
# 5-fold cross-validated grid search over `params`, scored by accuracy.
# fit() returns the fitted search object, so it can be chained.
grid_cv = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
# Report the best mean CV accuracy and the parameters that achieved it.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.7605
최적 파라미터: {'C': 3}


In [13]:
# Take the best estimator found by the search and measure its accuracy
# on the scaled test split.
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.8020833333333334

In [14]:
# Persist the tuned logistic-regression model.
joblib.dump(value=best_lr, filename='../static/model/pima_lr.pkl')

['../static/model/pima_lr.pkl']

### 2. SVM

In [15]:
# Baseline support-vector classifier with library defaults; list its
# hyperparameters to see what can be tuned.
sv_clf = SVC()
sv_clf.get_params(deep=True)

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
# Search grid for the SVM penalty parameter C.
# Alternative grids kept for reference (presumably earlier search passes):
#   [0.1, 1, 5, 7, 10] and [0.5, 0.8, 1, 2, 3].
params = dict(C=[2.5, 3, 3.5, 4])

In [17]:
# 5-fold cross-validated grid search over `params` for the SVM, scored by
# accuracy. fit() returns the fitted search object, so it can be chained.
grid_cv = GridSearchCV(
    estimator=sv_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
# Report the best mean CV accuracy and the parameters that achieved it.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.7484
최적 파라미터: {'C': 3}


In [18]:
# Take the best SVM found by the search and measure its accuracy on the
# scaled test split.
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.7447916666666666

In [19]:
# Persist the tuned SVM. The object written must be best_sv (not best_lr),
# so that pima_sv.pkl really contains the support-vector model.
joblib.dump(best_sv, '../static/model/pima_sv.pkl')

['../static/model/pima_sv.pkl']

### 3. Random Forest

In [20]:
# Baseline random forest with library defaults; list its hyperparameters
# to see what can be tuned.
rf_clf = RandomForestClassifier()
rf_clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [21]:
# Search grid for the random forest: tree depth 7..9 and minimum samples
# required to split a node 3..6.
# Alternative grids kept for reference (presumably earlier search passes):
#   max_depth [4, 6, 8, 10], min_samples_split [2, 3, 4].
params = dict(
    max_depth=list(range(7, 10)),
    min_samples_split=list(range(3, 7)),
)

In [22]:
# 5-fold cross-validated grid search over `params` for the random forest,
# scored by accuracy. fit() returns the fitted search object, so it can be
# chained.
grid_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)
# Report the best mean CV accuracy and the parameters that achieved it.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.7605
최적 파라미터: {'max_depth': 8, 'min_samples_split': 4}


In [23]:
# Take the best random forest found by the search and measure its accuracy
# on the scaled test split.
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

0.7916666666666666

In [24]:
# Persist the tuned random-forest model.
joblib.dump(value=best_rf, filename='../static/model/pima_rf.pkl')

['../static/model/pima_rf.pkl']

### Test

In [25]:
# Pick one row of the test split for an end-to-end prediction check.
index = 100
# Features of that row as a 2-D array with a single sample (one row, all
# feature columns), which is the layout the scaler and models expect.
test_data = df_test.iloc[index, :-1].values.reshape((1, -1))
# True target value of the same row (last column).
label = df_test.iloc[index, -1]
(test_data, label)

(array([[ 3.   , 99.   , 80.   , 11.   , 64.   , 19.3  ,  0.284, 30.   ]]), 0)

In [26]:
# Reload the scaler from disk to confirm the saved artifact is usable.
new_scaler = joblib.load(filename='../static/model/pima_scaler.pkl')

In [27]:
# Scale the single sample with the reloaded scaler.
test_scaled = new_scaler.transform(X=test_data)

In [28]:
# Run the scaled sample through each tuned model, in the order
# logistic regression, SVM, random forest.
pred_lr, pred_sv, pred_rf = (
    model.predict(test_scaled) for model in (best_lr, best_sv, best_rf)
)

In [29]:
# Show the true label followed by each model's prediction for the sample.
(label, *(prediction[0] for prediction in (pred_lr, pred_sv, pred_rf)))

(0, 0, 0, 0)

In [30]:
# Map each feature column name to the sample row's value for that column.
a = {
    column: value
    for column, value in zip(df_test.columns[:-1], df_test.iloc[index, :-1])
}
a

{'Pregnancies': 3.0,
 'Glucose': 99.0,
 'BloodPressure': 80.0,
 'SkinThickness': 11.0,
 'Insulin': 64.0,
 'BMI': 19.3,
 'DiabetesPedigreeFunction': 0.284,
 'Age': 30.0}