<a href="https://colab.research.google.com/github/Redwoods/Py/blob/master/pdm2020/my-note/py-tensorflow/DL4-autoML/diabetes_autoML_df_df2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes
- [dataset from kaggle](https://www.kaggle.com/himanshu86503/dibetes33hi)
## AutoML
- raw data
- imputed data

## 1. Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## 2. Data Collection

In [None]:
# Download the diabetes CSV from the course GitHub repository.
url = "https://github.com/Redwoods/Py/raw/master/pdm2020/my-note/py-pandas/data/diabetes.csv"
# `df` is the frame that the cells below inspect, clean and model.
df = pd.read_csv(url)

In [None]:
# Size of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Inspect each column's dtype (int or float?) and its non-null count.
df.info()

### Cleaning data
- Check the NaN or missing values
- Clean the null data

In [None]:
# Null check: one overall "anything missing?" flag plus the per-column counts.
missing_mask = df.isna()
missing_mask.to_numpy().any(), missing_mask.sum()

In [None]:
# Drop rows that contain any missing value (not always necessary).
# All columns are kept, so no column selection is needed before dropna().
print(df.shape)
column_names = df.columns  # not named `vars`: that would shadow the Python builtin
print(column_names)
df = df.dropna()
df.shape

## 3. Explore Data

### 위의 데이터에서 문제점을 찾으시오.
- 0이 허용되지 않는 특징이 있는가?
- 값 0을 어떤 값으로 변경해야하는가?

In [None]:
# Count the zero entries in each of the first eight columns (Outcome is left out).
df.iloc[:, :8].eq(0).sum()

In [None]:
# Re-check dtypes and non-null counts after the cleaning step above.
df.info()

### Check the balance of the data through plot

In [None]:
# Check the class balance of the target with a count plot.
classes = df.Outcome
# Pass the data by keyword: recent seaborn releases reject positional data arguments.
ax = sns.countplot(x=classes)
ax.set_title('Outcome class balance')
plt.show()
# Look the counts up by label. value_counts() orders by frequency, so positional
# unpacking would swap the two classes if 1 outnumbered 0, and would raise if only
# one class were present.
class_counts = classes.value_counts()
nDB = class_counts.get(0, 0)
DB = class_counts.get(1, 0)
print('False: non-diabetes',nDB)
print('True: diabetes',DB)

In [None]:
# Show the per-class counts together with the container type of `classes`.
classes.value_counts(), type(classes) # noDM: 500, DM: 268

## correlation plot (상관도표)

In [None]:
# Pairwise correlations between all columns of df, drawn as an annotated heatmap.
cormat = df.corr()
plt.figure(figsize=(12, 10))
g = sns.heatmap(
    cormat,
    vmin=-1, vmax=1,    # pin the colour scale to the full correlation range
    cmap='coolwarm',    # alternative palette: "RdYlGn"
    annot=True,
)

### 상관성 분석 결과
* Age vs. Pregnancies : 0.54
* Glucose vs. Outcome : 0.47
* SkinThickness vs. Insulin : 0.44
* SkinThickness vs. BMI : 0.39

> 좀 더 자세한 시각화가 필요하다.

---

## 각 특징의 내부 정보를 고려한 zero 처리 후 상관성 조사.

In [None]:
# Zero-handling helper: zero2median()
# 1. Find the columns that contain zeros, leaving out the excluded ones
#    (by default Pregnancies and the Outcome label).
# 2. Treat those zeros as missing values (NaN).
# 3. Fill each NaN with the median of the remaining values in the same column.
def zero2median(df, exclude=("Pregnancies", "Outcome")):
    """Return a copy of ``df`` in which placeholder zeros are replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    exclude : collection of str, optional
        Column names whose zeros are kept as they are. Columns are excluded by
        name rather than by position, so the result does not depend on column
        order or on which columns happen to contain zeros.

    Returns
    -------
    pandas.DataFrame
        A new frame. Every non-excluded column that contained at least one zero
        has its zeros replaced by the median of that column's non-zero values.
        Such columns become float because they pass through NaN.
    """
    cleaned = df.copy()  # never write into the caller's frame
    contains_zero = (cleaned == 0).any()
    columns_with_zero = [
        col for col in cleaned.columns[contains_zero] if col not in exclude
    ]
    # For the diabetes data this selects:
    # ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    for feature in columns_with_zero:
        without_zeros = cleaned[feature].replace(0, np.nan)
        # The median is taken after the zeros are removed, so they cannot skew it.
        # Assigning the column back avoids chained in-place fillna, which is
        # unreliable under pandas copy-on-write.
        cleaned[feature] = without_zeros.fillna(without_zeros.median())  # median() -> mean() to use the mean instead

    return cleaned

# Start with a cleaned dataframe.
# Pass a copy so the raw `df` is guaranteed to stay untouched and `df2` is an
# independent, imputed frame; the later df-vs-df2 comparison relies on this.
df2 = zero2median(df.copy())
df2.shape

In [None]:
# Preview the zero-imputed frame.
df2.head()

In [None]:
# Correlation heatmap of the zero-imputed frame df2, on the same colour scale as the df plot.
plt.figure(figsize=(12, 10))
g3 = sns.heatmap(
    df2.corr(),
    vmin=-1, vmax=1,
    cmap='coolwarm',    # alternative palette: "RdYlGn"
    annot=True,
)

## 데이터프레임 df, df2의 상관성이 달라짐을 확인하시오.
### 상관성 분석 결과 
* Age vs. Pregnancies : 0.54 ->  0.54
* Glucose vs. Outcome : 0.47 ->  0.49
* SkinThickness vs. Insulin : 0.44 -> 0.16
* SkinThickness vs. BMI : 0.39 -> 0.54

> 상관성 최종 개선

---

### 그러면 다음 autoML에서는 어떤 데이터로 ML 모델을 만들어야하나요?

# AutoML
- ## pycaret
- ## data : df, df2

# targets
- ## 'noDM', 'DM'


In [None]:
# Uncomment to install PyCaret when it is not already available (e.g. on Colab).
# NOTE(review): the cells below call setup(silent=..., ignore_low_variance=...) and read
# the 'Label' prediction column, which looks like the PyCaret 2.x API. Confirm and pin
# the version accordingly.
# !pip install pycaret

In [None]:
# Report the installed PyCaret version, since the API used below is version-sensitive.
from pycaret.utils import version
version()

In [None]:
from pycaret.classification import *

## autoML using df

In [None]:
# Preview the frame used for the first AutoML run.
df.head()

### Make data for pycaret autoML

In [None]:
# 80/20 split: a seeded random 80% sample for modelling; the remaining rows are held out.
modeling_rows = df.sample(frac=0.8, random_state=786)
data_unseen = df.drop(index=modeling_rows.index).reset_index(drop=True)  # test data : 20%
data = modeling_rows.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions {data_unseen.shape}')

In [None]:
# Initialise the PyCaret classification experiment on the modelling split.
clf = setup(
    data=data,
    target='Outcome',
    session_id=1104,            # fixed id so repeated runs use the same seed
    normalize=True,
    transformation=True,
    ignore_low_variance=True,
    # remove_multicollinearity=True, multicollinearity_threshold=0.95,
    silent=True,
)

In [None]:
# Compare the candidate models and keep the one ranked first by accuracy.
best_model = compare_models(sort='Accuracy')
# Alternative ranking metric:
# compare_models(sort='AUC')

## Meaning of Prec., Recall,  F1, AUC
- Precision (Prec.) is the fraction of positive predictions that are actually correct.
- Recall is the fraction of actual positive examples that are correctly identified.
> There is usually a trade-off between precision and recall, so the F1 score is used to summarize both in a single number.
- The F1 score is the harmonic mean of precision and recall, which balances the two.
- AUC is the area under the receiver operating characteristic (ROC) curve.

In [None]:
# Build one named model for closer inspection, using 10 cross-validation folds.
current_model = 'rf'  # lightgbm
model = create_model(current_model, fold =10)
# Show the default plot for this model.
plot_model(model)

In [None]:
# Confusion matrix of the model created above.
plot_model(model, plot = 'confusion_matrix')

In [None]:
# Feature-importance plot, followed by the raw importance values of the fitted estimator.
plot_model(model, plot = 'feature')
print(model.feature_importances_)

## SHAP
- https://eair.tistory.com/30?category=0

In [None]:
# SHAP-based interpretation of the model. Uncomment the install line if `shap` is missing.
# !pip install shap
interpret_model(model)  # Interpretability of the model

# Best model

In [None]:
# Default plot for the model selected by compare_models().
plot_model(best_model)

In [None]:
# Confusion matrix of the best model. The intended class names are kept below for reference.
# LABELS = ['noDM', 'DM']
plot_model(best_model, plot = 'confusion_matrix')

In [None]:
# Feature-importance plot of the best model.
plot_model(best_model, plot = 'feature')
# Left disabled, presumably because the selected model may not expose feature_importances_.
# print(best_model.feature_importances_)

## SHAP - xAI
- https://eair.tistory.com/30?category=0
- tree를 이용한 ML 모델에 적용.

In [None]:
# Left disabled: per the note above, SHAP interpretation applies to tree-based models,
# and best_model is presumably not guaranteed to be one.
# interpret_model(best_model)

## Pycaret - score
- 테스트 데이터에 적용하여 성능 평가.
- https://towardsdatascience.com/predict-lead-score-the-right-way-using-pycaret-332faa780cfc

In [None]:
# Inspect the held-out split: container type, shape, and first three rows.
type(data_unseen),data_unseen.shape,data_unseen.head(3)

In [None]:
# Score the held-out rows with the best model and display the resulting frame.
unseen_best_predictions = predict_model(best_model, data=data_unseen)
unseen_best_predictions

In [None]:
# Accuracy of the best model on the held-out rows: true 'Outcome' vs. predicted 'Label'.
from sklearn.metrics import accuracy_score
y_true = list(unseen_best_predictions['Outcome'].to_numpy())
y_pred = list(unseen_best_predictions['Label'].to_numpy())
best_accuracy = accuracy_score(y_true, y_pred)
print("Accuracy of the best model: {}".format(best_accuracy))


### 트레이닝 accuracy 76.6% -> **No overfitting**

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out rows.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)

## Summary of the best model

In [None]:
# Open PyCaret's evaluation view for the best model.
evaluate_model(best_model)

---
---

## autoML using df2
- df2 : zero-preprocessed data

In [None]:
# Preview the zero-imputed frame used for the second AutoML run.
df2.head()

### 위에서 Pregnancies, Age가 categorical(정수) feature. -> 해결 방법은?

In [None]:
# Cast the integer feature columns to float so PyCaret treats them as numeric features.
# The cast reads from df2 itself (not df), so this cell depends on a single frame.
# It rebinds df2 to a new frame instead of writing columns in place, so re-running is harmless.
df2 = df2.astype({'Pregnancies': float, 'Age': float})
df2.head()

In [None]:
# Same seeded 80/20 split as before, this time on the zero-imputed frame df2.
modeling_rows = df2.sample(frac=0.8, random_state=786)
data_unseen = df2.drop(index=modeling_rows.index).reset_index(drop=True)
data = modeling_rows.reset_index(drop=True)

print('Using df2')
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions {data_unseen.shape}')

In [None]:
# Re-initialise the PyCaret experiment on the df2 split, with the same settings as the df run.
clf = setup(
    data=data,
    target='Outcome',
    session_id=1104,            # same id as the df run, for a like-for-like comparison
    normalize=True,
    transformation=True,
    ignore_low_variance=True,
    # remove_multicollinearity=True, multicollinearity_threshold=0.95,
    silent=True,
)

In [None]:
# Compare the candidate models on the df2 split and keep the one ranked first by accuracy.
best_model2 = compare_models(sort='Accuracy')
# Alternative ranking metric:
# compare_models(sort='AUC')

### Accuracy가 약간 높아졌지만 큰 차이는 없다.

In [None]:
# Default plot for the best model trained on df2.
plot_model(best_model2)

In [None]:
# Confusion matrix of the best df2 model; the plot type is passed by keyword like the other plot cells.
plot_model(best_model2, plot='confusion_matrix')

In [None]:
# Feature-importance plot of the best df2 model.
plot_model(best_model2, plot = 'feature')

---
---

## [도전하기 - DIY]
- 최적의 모델을 찾아서 저장
    - save_model()
- 저장된 모델을 불러와서 테스트 데이터에 적용
    - load_model()
- 모델별로 accuracy를 표로 정리하시오.

---
---