## 退会顧客を予測する 10 本ノック

データの読み込みと整形

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

customer = pd.read_csv('data/customer_join.csv')
uselog_months = pd.read_csv('data/use_log_months.csv')

In [2]:
year_months = list(uselog_months['年月'].unique())
uselog = pd.DataFrame()

for i in range(1, len(year_months)):
    tmp = uselog_months.loc[uselog_months['年月'] == year_months[i]]
    tmp.rename(columns={'count':'count_0'}, inplace=True)
    tmp_before = uselog_months.loc[uselog_months['年月'] == year_months[i-1]]
    del tmp_before['年月']

    tmp_before.rename(columns={'count':'count_1'}, inplace=True)
    tmp = pd.merge(tmp, tmp_before, on='customer_id', how='left')
    uselog = pd.concat([uselog, tmp], ignore_index=True)

print(len(uselog))
uselog.head()

33851


Unnamed: 0,年月,customer_id,count_0,count_1
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0


退会前月の退会顧客データを作成

In [3]:
from dateutil.relativedelta import relativedelta
exit_customer = customer.loc[customer['is_deleted'] == 1]
exit_customer['exit_date'] = None
exit_customer['end_date'] = pd.to_datetime(exit_customer['end_date'])

for i in range(len(exit_customer)):
    exit_customer["exit_date"].iloc[i] = exit_customer["end_date"].iloc[i] -  relativedelta(months=1)

exit_customer["年月"] = exit_customer["exit_date"].dt.strftime("%Y%m")
uselog["年月"] = uselog["年月"].astype(str)

exit_uselog = pd.merge(uselog, exit_customer, on=["customer_id", "年月"], how="left")
print(len(uselog))
exit_uselog.head()

33851


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,


In [4]:
exit_uselog = exit_uselog.dropna(subset=["name"])
print(len(exit_uselog))
print(len(exit_uselog["customer_id"].unique()))
exit_uselog.head()

1104
1104


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30 00:00:00
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,通常,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30 00:00:00
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30 00:00:00
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,通常,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30 00:00:00
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,通常,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30 00:00:00


継続顧客のデータも作成

In [5]:
conti_customer = customer.loc[customer['is_deleted'] == 0]
conti_uselog = pd.merge(uselog, conti_customer, on=['customer_id'], how='left')

print(len(conti_uselog))

conti_uselog = conti_uselog.dropna(subset=['name'])
print(len(conti_uselog))

33851
27422


とりあえず件数がすごい。  
継続客の方が実質多いので、このままだと継続客ばかり過学習する。

ここでは１顧客１レコードに絞り込む。

sample は [ランダムサンプリング](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html)。  
flac は Flaction(割合)の意味。

ここではこうすることで、順序をランダム化している。

`drop_duplicates` は [重複排除](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html) で subset はカラムラベル。  
これで重複行…というか

In [6]:
conti_uselog = conti_uselog.sample(frac=1).reset_index(drop=True)
conti_uselog = conti_uselog.drop_duplicates(subset="customer_id")

print(len(conti_uselog))
conti_uselog.head()

2842


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201809,GD619319,7,10.0,XXXXX,C01,M,2018-03-01,,CA1,...,オールタイム,10500.0,通常,8.333333,8.0,11.0,6.0,1.0,2019-04-30,13.0
1,201812,PL487301,1,5.0,XXXX,C01,M,2015-06-01,,CA1,...,オールタイム,10500.0,通常,4.583333,5.0,6.0,1.0,1.0,2019-04-30,46.0
2,201811,TS772424,9,8.0,XXXXX,C03,M,2018-09-01,,CA1,...,ナイト,6000.0,通常,8.0,8.0,10.0,5.0,1.0,2019-04-30,7.0
3,201809,GD437531,1,5.0,XXXXX,C01,M,2015-06-01,,CA1,...,オールタイム,10500.0,通常,4.416667,4.5,7.0,1.0,1.0,2019-04-30,46.0
4,201807,GD413362,5,6.0,XXXXX,C03,M,2016-12-01,,CA1,...,ナイト,6000.0,通常,5.0,6.0,7.0,2.0,1.0,2019-04-30,28.0


継続顧客データと退会顧客データの結合

In [7]:
predict_data = pd.concat([conti_uselog, exit_uselog], ignore_index=True)

print(len(predict_data))
predict_data.head()

3946


Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201809,GD619319,7,10.0,XXXXX,C01,M,2018-03-01,,CA1,...,10500.0,通常,8.333333,8.0,11.0,6.0,1.0,2019-04-30,13.0,
1,201812,PL487301,1,5.0,XXXX,C01,M,2015-06-01,,CA1,...,10500.0,通常,4.583333,5.0,6.0,1.0,1.0,2019-04-30,46.0,
2,201811,TS772424,9,8.0,XXXXX,C03,M,2018-09-01,,CA1,...,6000.0,通常,8.0,8.0,10.0,5.0,1.0,2019-04-30,7.0,
3,201809,GD437531,1,5.0,XXXXX,C01,M,2015-06-01,,CA1,...,10500.0,通常,4.416667,4.5,7.0,1.0,1.0,2019-04-30,46.0,
4,201807,GD413362,5,6.0,XXXXX,C03,M,2016-12-01,,CA1,...,6000.0,通常,5.0,6.0,7.0,2.0,1.0,2019-04-30,28.0,


予測する月の在籍期間を作成する



In [8]:
predict_data["period"] = 0
predict_data["now_date"] = pd.to_datetime(predict_data["年月"], format="%Y%m")
predict_data["start_date"] = pd.to_datetime(predict_data["start_date"])

for i in range(len(predict_data)):
    delta = relativedelta(predict_data["now_date"][i], predict_data["start_date"][i])
    predict_data["period"][i] = int(delta.years*12 + delta.months)

predict_data.head()

Unnamed: 0,年月,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,now_date
0,201809,GD619319,7,10.0,XXXXX,C01,M,2018-03-01,,CA1,...,8.333333,8.0,11.0,6.0,1.0,2019-04-30,13.0,,6,2018-09-01
1,201812,PL487301,1,5.0,XXXX,C01,M,2015-06-01,,CA1,...,4.583333,5.0,6.0,1.0,1.0,2019-04-30,46.0,,42,2018-12-01
2,201811,TS772424,9,8.0,XXXXX,C03,M,2018-09-01,,CA1,...,8.0,8.0,10.0,5.0,1.0,2019-04-30,7.0,,2,2018-11-01
3,201809,GD437531,1,5.0,XXXXX,C01,M,2015-06-01,,CA1,...,4.416667,4.5,7.0,1.0,1.0,2019-04-30,46.0,,39,2018-09-01
4,201807,GD413362,5,6.0,XXXXX,C03,M,2016-12-01,,CA1,...,5.0,6.0,7.0,2.0,1.0,2019-04-30,28.0,,19,2018-07-01


In [9]:
predict_data.isna().sum() # 欠損確認

年月                      0
customer_id             0
count_0                 0
count_1               263
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2842
period                  0
now_date                0
dtype: int64

`count_1` は全体比較で見ると量的に軽微なので除去してしまう

In [10]:
predict_data = predict_data.dropna(subset=["count_1"])
predict_data.isna().sum()

年月                      0
customer_id             0
count_0                 0
count_1                 0
name                    0
class                   0
gender                  0
start_date              0
end_date             2631
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2631
period                  0
now_date                0
dtype: int64

文字列型というかラベルは機械学習上は扱えないデータ型なので、数字ないしはカラムとして独立させる

In [11]:
target_col = ["campaign_name", "class_name", "gender", "count_1", "routine_flg", "period", "is_deleted"]
predict_data = predict_data[target_col]
predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,通常,オールタイム,M,10.0,1.0,6,0.0
1,通常,オールタイム,M,5.0,1.0,42,0.0
2,通常,ナイト,M,8.0,1.0,2,0.0
3,通常,オールタイム,M,5.0,1.0,39,0.0
4,通常,ナイト,M,6.0,1.0,19,0.0


[get_dummies](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html) を利用すると、select 型として分離してしまう。  
便利になったもんだ。

In [12]:
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,campaign_name_通常,class_name_オールタイム,class_name_デイタイム,class_name_ナイト,gender_F,gender_M
0,10.0,1.0,6,0.0,0,0,1,1,0,0,0,1
1,5.0,1.0,42,0.0,0,0,1,1,0,0,0,1
2,8.0,1.0,2,0.0,0,0,1,0,0,1,0,1
3,5.0,1.0,39,0.0,0,0,1,1,0,0,0,1
4,6.0,1.0,19,0.0,0,0,1,0,0,1,0,1


このとき作成されるカラムで、よくよく考えると

* 入会費半額ではない & 入会費無料ではない = 通常 じゃね？

もう少しわかりやすいところだと、

* 男ではない = 女じゃね？

ってことだから、各１列は削除しても問題ない（というか無駄に計算させる）

In [13]:
del predict_data["campaign_name_通常"]
del predict_data["class_name_ナイト"]
del predict_data["gender_M"]
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_入会費半額,campaign_name_入会費無料,class_name_オールタイム,class_name_デイタイム,gender_F
0,10.0,1.0,6,0.0,0,0,1,0,0
1,5.0,1.0,42,0.0,0,0,1,0,0
2,8.0,1.0,2,0.0,0,0,0,0,0
3,5.0,1.0,39,0.0,0,0,1,0,0
4,6.0,1.0,19,0.0,0,0,0,0,0


決定木を用いて退会予測モデルを作る

個人的にはランダムフォレストの方が多くの場合で精度が良いのだけど…ここはまずは教科書に従う。

In [14]:
# 辞めた人データ
exit = predict_data.loc[predict_data["is_deleted"]==1]
# 継続してる人データ(件数を辞めた人データに合わせる)
conti = predict_data.loc[predict_data["is_deleted"]==0].sample(len(exit))

X = pd.concat([exit, conti], ignore_index=True)
y = X["is_deleted"]
del X["is_deleted"]
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print(y_test_pred)

[0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0.
 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0.
 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0.
 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1.
 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0.
 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1.
 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1.
 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1.

In [15]:
# つーか正答率は？
print(model.score(X_test, y_test))

0.9163498098859315


8 割出るので良いほうかな？

実際の値と、予測の比較 dataframe 作成

In [16]:
results_test = pd.DataFrame({"y_test":y_test ,"y_pred":y_test_pred })
results_test.head()

Unnamed: 0,y_test,y_pred
1475,0.0,0.0
1566,0.0,1.0
1848,0.0,0.0
1637,0.0,0.0
1046,1.0,1.0


In [17]:
# 教科書ではコレだけど、実際には既にやってしまったし…
# correct = len(results_test.loc[results_test["y_test"]==results_test["y_pred"]])
# data_count = len(results_test)
# score_test = correct / data_count
# print(score_test)

# 検証データと、学習データをそれぞれモデルでスコアしてみると
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9163498098859315
0.9740177439797212


まぁ学習データ食わせて高精度なのは当たり前か…  
同じデータセットでも検証用ともとデータで結果が乖離してるのは過学習状態に陥ってるから。

教科書では決定木の深さを調整しているのだけど、そもそも論それが正しいのかわからん。

ので、パラメータの有効度を算出する [こんへん](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) 見ながら確認してみる。

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2104 entries, 0 to 2103
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   count_1              2104 non-null   float64
 1   routine_flg          2104 non-null   float64
 2   period               2104 non-null   int64  
 3   campaign_name_入会費半額  2104 non-null   uint8  
 4   campaign_name_入会費無料  2104 non-null   uint8  
 5   class_name_オールタイム    2104 non-null   uint8  
 6   class_name_デイタイム     2104 non-null   uint8  
 7   gender_F             2104 non-null   uint8  
dtypes: float64(2), int64(1), uint8(5)
memory usage: 59.7 KB


In [19]:
print('重量度: {0}'.format(model.feature_importances_))
print('判定要素の最大数: {0}'.format(model.max_features_))
print('結果の種類数: {0}'.format(model.n_classes_))
print('分岐数: {0}'.format(model.n_features_))
print('1データに対する出力数: {0}'.format(model.n_outputs_))

重量度: [0.32794845 0.10332179 0.48587001 0.01490257 0.00883519 0.02710315
 0.00823638 0.02378245]
判定要素の最大数: 8
結果の種類数: 2
分岐数: 8
1データに対する出力数: 1


なるほど、重要度を見てみると、 8 パラメータの中でも重要性の低いデータ(0.01 台)が 3 つある。

In [20]:
expect_df = pd.DataFrame({ 'feature_names':X.columns, 'importance':model.feature_importances_ })
expect_df

Unnamed: 0,feature_names,importance
0,count_1,0.327948
1,routine_flg,0.103322
2,period,0.48587
3,campaign_name_入会費半額,0.014903
4,campaign_name_入会費無料,0.008835
5,class_name_オールタイム,0.027103
6,class_name_デイタイム,0.008236
7,gender_F,0.023782


In [21]:
X = pd.concat([exit, conti], ignore_index=True)
y = X["is_deleted"]
del X["is_deleted"]
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y)

model = DecisionTreeClassifier(random_state=0, max_depth=5)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9049429657794676
0.9283903675538656


In [22]:
expect_df = pd.DataFrame({ 'feature_names':X.columns, 'importance':model.feature_importances_ })
expect_df

Unnamed: 0,feature_names,importance
0,count_1,0.349701
1,routine_flg,0.113588
2,period,0.535951
3,campaign_name_入会費半額,0.0
4,campaign_name_入会費無料,0.0
5,class_name_オールタイム,0.000652
6,class_name_デイタイム,0.0
7,gender_F,0.000109


この比率見ると 3 パラメータだけで判断して良くないか？

In [23]:
model = DecisionTreeClassifier(random_state=0, max_depth=3)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9220532319391636
0.9239543726235742


だよね。

モデルに寄与してるデータを探る…て…教科書先取りしちまった

In [24]:
importance = pd.DataFrame({ 'feature_names':X.columns, 'importance':model.feature_importances_ })
importance

Unnamed: 0,feature_names,importance
0,count_1,0.34442
1,routine_flg,0.112703
2,period,0.542877
3,campaign_name_入会費半額,0.0
4,campaign_name_入会費無料,0.0
5,class_name_オールタイム,0.0
6,class_name_デイタイム,0.0
7,gender_F,0.0


ノック50 顧客の退会予測

それっぽいデータ作成して

In [25]:
count_1 = 3
routing_flg = 1
period = 10
campaign_name = "入会費無料"
class_name = "オールタイム"
gender = "M"

if campaign_name == "入会費半額":
    campaign_name_list = [1, 0]
elif campaign_name == "入会費無料":
    campaign_name_list = [0, 1]
elif campaign_name == "通常":
    campaign_name_list = [0, 0]

if class_name == "オールタイム":
    class_name_list = [1, 0]
elif class_name == "デイタイム":
    class_name_list = [0, 1]
elif class_name == "ナイト":
    class_name_list = [0, 0]

if gender == "F":
    gender_list = [1]
elif gender == "M":
    gender_list = [0]

input_data = [count_1, routing_flg, period]
input_data.extend(campaign_name_list)
input_data.extend(class_name_list)
input_data.extend(gender_list)

実行してみる。

In [26]:
print(model.predict([input_data]))
print(model.predict_proba([input_data]))

[1.]
[[0.01167315 0.98832685]]


予測: このユーザは退会する (1)  
0 の確率 1.2% 1 の確率 98.8%