In [None]:
!pip install pycaret==3.0.4  # 書籍執筆時点のバージョンをインストール


# 最新版をインストール場合はこちら
# !pip install pycaret

In [None]:
import pycaret

# Display the installed PyCaret version as the cell output.
pycaret.__version__

# 使用するデータ


In [None]:
# Fetch the "employee" sample dataset through PyCaret's dataset helper.
from pycaret.datasets import get_data

employee = get_data(dataset='employee')

# データの準備

## エンコーディング

In [None]:
# Count the rows per distinct value of the 'department' column.
employee['department'].value_counts()

In [None]:
# Count the rows per distinct value of the 'salary' column.
employee['salary'].value_counts()

In [None]:
# Count the rows per distinct value of the 'time_spend_company' column.
employee['time_spend_company'].value_counts()

### one-hotエンコーディング


In [None]:
from pycaret.classification import *

# Run setup with default preprocessing: only the data and the target column
# ('left') are specified.
clf1 = setup(
    data=employee,
    target='left',
)

In [None]:
# Show the feature matrix after the preprocessing configured in setup().
clf1.X_transformed

In [None]:
# Same transformed features, ordered by index label so rows can be compared
# with the raw frame.
clf1.X_transformed.sort_index()

### Ordinal（序数）エンコーディング

In [None]:
from pycaret.classification import *

# Encode 'salary' as an ordered category (low < medium < high) and declare
# 'department' as a categorical feature explicitly.
clf2 = setup(
    data=employee,
    target='left',
    categorical_features=['department'],
    ordinal_features={'salary': ['low', 'medium', 'high']},
)

In [None]:
# Show the features produced by the ordinal-encoding setup, ordered by index.
clf2.X_transformed.sort_index()

### カテゴリ数が多い場合のエンコーディング

In [None]:
from pycaret.classification import *

# Cap one-hot encoding at 3 categories (max_encoding_ohe=3), then display the
# transformed features ordered by index.
clf3 = setup(
    data=employee,
    target='left',
    max_encoding_ohe=3,
)
clf3.X_transformed.sort_index()

In [None]:
# Show the preprocessing pipeline object held by the clf3 experiment.
clf3.pipeline

## データ型の指定

In [None]:
from pycaret.datasets import get_data
from pycaret.classification import *

# Start again from a freshly loaded copy of the dataset.
employee = get_data(dataset='employee')

# Explicitly mark 'time_spend_company' as a categorical feature.
clf1 = setup(
    data=employee,
    target='left',
    categorical_features=['time_spend_company'],
)

In [None]:
# Show the transformed features (ordered by index) with 'time_spend_company'
# declared as categorical.
clf1.X_transformed.sort_index()

In [None]:
# Exclude 'department' and 'salary' from the feature set, then display the
# transformed features ordered by index.
clf2 = setup(
    data=employee,
    target='left',
    ignore_features=['department', 'salary'],
)
clf2.X_transformed.sort_index()

In [None]:
# Show clf1's transformed features again, ordered by index.
clf1.X_transformed.sort_index()

## 欠損値置換

In [None]:
import numpy as np
from pycaret.datasets import get_data


employee = get_data('employee')

# Inject missing values into rows 1 and 3 of 'satisfaction_level'.
# A single .loc assignment writes directly into the frame; the chained form
# employee[col][row] = value goes through an intermediate Series, which raises
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
employee.loc[[1, 3], 'satisfaction_level'] = np.nan

In [None]:
# Display the frame to check the current 'satisfaction_level' values.
employee

In [None]:
from pycaret.classification import *

# Run setup with default settings on the frame containing missing values.
clf1 = setup(
    data=employee,
    target='left',
)

In [None]:
# Show the transformed features ordered by index, so rows 1 and 3 can be
# checked after the default setup.
clf1.X_transformed.sort_index()

In [None]:
# Request median imputation for numeric features, then display the
# transformed features ordered by index.
clf2 = setup(
    data=employee,
    target='left',
    numeric_imputation='median',
)
clf2.X_transformed.sort_index()

In [None]:
# Show clf2's transformed features ordered by index.
# NOTE(review): this repeats the output of the previous cell — confirm it is intentional.
clf2.X_transformed.sort_index()

## 不均衡データ

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Enable class-imbalance handling during setup.
clf1 = setup(
    data=employee,
    target='left',
    fix_imbalance=True,
)

In [None]:
# Class counts of the raw target column 'left'.
employee['left'].value_counts()


In [None]:
# Class counts of the target after the setup pipeline; compare with the raw
# counts. Presumably reflects the fix_imbalance resampling — verify on output.
clf1.y_transformed.value_counts()


# スケーリング

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Turn on feature scaling with the 'zscore' method.
clf1 = setup(
    data=employee,
    target='left',
    normalize=True,
    normalize_method='zscore',
)

In [None]:
# Show the scaled features ordered by index.
clf1.X_transformed.sort_index()

In [None]:
# Histogram of 'average_montly_hours' after scaling (the column name is
# spelled this way in the dataset).
clf1.X_transformed['average_montly_hours'].hist()

## ターゲット変数変換

In [None]:
from pycaret.regression import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Regression setup on 'average_montly_hours' with target transformation on.
reg1 = setup(
    data=employee,
    target='average_montly_hours',
    transform_target=True,
)

In [None]:
# Show the transformed target values ordered by index.
reg1.y_transformed.sort_index()

In [None]:
# Histogram of the transformed target values.
reg1.y_transformed.hist()

# 特徴量エンジニアリング

## 多項式特徴量

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Generate polynomial features of degree 2 during setup.
clf1 = setup(
    data=employee,
    target='left',
    polynomial_features=True,
    polynomial_degree=2,
)

In [None]:
# Show the features, including the generated polynomial terms, ordered by index.
clf1.X_transformed.sort_index()

## Group特徴量

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Treat 'satisfaction_level' and 'last_evaluation' as one feature group.
clf1 = setup(
    data=employee,
    target='left',
    group_features=['satisfaction_level', 'last_evaluation'],
)

In [None]:
# Show the features produced by the group_features setup, ordered by index.
clf1.X_transformed.sort_index()

## binning

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Bin the numeric column 'average_montly_hours' during setup.
clf1 = setup(
    data=employee,
    target='left',
    bin_numeric_features=['average_montly_hours'],
)

In [None]:
# Show the features after binning, ordered by index.
clf1.X_transformed.sort_index()

In [None]:
# Summary statistics of the raw (unbinned) 'average_montly_hours' column.
employee['average_montly_hours'].describe()

In [None]:
# Histogram of 'average_montly_hours' after binning.
clf1.X_transformed['average_montly_hours'].hist()

## combine rare level

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data(dataset='employee')

# Combine rare categories (rare_to_value=0.1) under the label 'other'.
clf1 = setup(
    data=employee,
    target='left',
    rare_to_value=0.1,
    rare_value='other',
)

In [None]:
# Show the features after rare-category handling, ordered by index.
clf1.X_transformed.sort_index()

In [None]:
# Row counts per raw 'department' value, to see which categories are rare.
employee['department'].value_counts()

# 特徴量選択

## 特徴量選択

In [None]:
# 'left' is the class label used as the classification target throughout this
# notebook (and the results are named clf*), so use the classification module
# here; the original cell imported pycaret.regression.
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data('employee')

# Baseline setup without feature selection, for comparison with the next setup.
clf1 = setup(data=employee, target='left')

In [None]:
# Show the baseline features (no feature selection), ordered by index.
clf1.X_transformed.sort_index()

In [None]:
# Enable feature selection with n_features_to_select=0.9, then display the
# resulting features ordered by index.
clf2 = setup(
    data=employee,
    target='left',
    feature_selection=True,
    n_features_to_select=0.9,
)
clf2.X_transformed.sort_index()

## 多重共線性

In [None]:
# 'left' is the class label used as the classification target throughout this
# notebook (and the result is named clf1), so use the classification module
# here; the original cell imported pycaret.regression.
from pycaret.classification import *
from pycaret.datasets import get_data

# Reload the employee dataset.
employee = get_data('employee')

# Remove multicollinear features, using a threshold of 0.4.
clf1 = setup(data=employee, target='left',
             remove_multicollinearity=True, multicollinearity_threshold=0.4)

In [None]:
# Show the features left after multicollinearity removal, ordered by index.
clf1.X_transformed.sort_index()

## 主成分分析

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Load the "income" sample dataset.
income = get_data(dataset='income')

# Apply PCA with 10 components during setup.
clf1 = setup(
    data=income,
    target='income >50K',
    pca=True,
    pca_components=10,
)

## 分散の低い特徴量の削除

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data

# Load the "mice" sample dataset and keep only rows whose Genotype is 'Control'.
mice = get_data(dataset='mice')
mice = mice.loc[mice['Genotype'] == 'Control']

# Drop low-variance features using a threshold of 0.1.
clf1 = setup(
    data=mice,
    target='class',
    low_variance_threshold=0.1,
)