데이터 다운로드

https://www.kaggle.com/c/kkbox-churn-prediction-challenge/data

분석예시

https://github.com/TAEJIN-AHN/Churn-Prediction/blob/main/eda.ipynb

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### train 데이터 병합 전 처리

In [32]:
# Directory holding every input CSV. The trailing separator is required because
# file names are appended to this string directly.
data_dir = 'C:\\KIMUJUNG\\team_project\\data\\'

# Member table and the two versions of the churn labels.
df_members = pd.read_csv(f"{data_dir}members_v3.csv")
train_v1 = pd.read_csv(f"{data_dir}train.csv")
train_v2 = pd.read_csv(f"{data_dir}train_v2.csv")

# Both versions of the transaction records.
transactions_v1 = pd.read_csv(f"{data_dir}transactions.csv")
transactions_v2 = pd.read_csv(f"{data_dir}transactions_v2.csv")

# msno lookup table and the user-log summary.
unique_label = pd.read_csv(f"{data_dir}members_encoded.csv")
log_data = pd.read_csv(f"{data_dir}user_logs_summary.csv")

In [15]:
# Tag every label row with the file it came from.
train_v1['version'] = 'v1'
train_v2['version'] = 'v2'

# Stack both label sets and order them by user; 'version' is sorted in
# descending order so that, for a given msno, the v2 row comes before the v1 row.
train = (
    pd.concat([train_v1, train_v2])
    .sort_values(by=['msno', 'version'], ascending=[True, False])
    .reset_index(drop=True)
)
train.head()

Unnamed: 0,msno,is_churn,version
0,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,v2
1,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,v1
2,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,v2
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,v1
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,v2


In [16]:
# Flag users whose v1 label is churn (1) but whose v2 label is not churn (0).
not_churned_in_v2 = (train['version'] == 'v2') & (train['is_churn'] == 0)
churned_in_v1 = (train['version'] == 'v1') & (train['is_churn'] == 1)

# Users that satisfy both conditions (each on its respective row).
returned_users = set(train.loc[not_churned_in_v2, 'msno']).intersection(
    train.loc[churned_in_v1, 'msno']
)

# Every row of such a user gets is_back = 1, all other rows get 0.
train['is_back'] = train['msno'].isin(returned_users).astype(int)

In [17]:
# Keep a single row per user. With keep='first' the surviving row is the first
# one in the current row order; this relies on the rows having been sorted so
# that the preferred label version comes first for each msno.
train = train.drop_duplicates(subset='msno', keep='first')

# The 'version' helper column is only needed to choose between duplicates.
train = train.drop(columns='version')

In [18]:
# Combine both transaction files into one frame, ordered by user and then by
# transaction_date, with a fresh 0..n-1 index.
transactions = (
    pd.concat([transactions_v1, transactions_v2], ignore_index=True)
    .sort_values(by=['msno', 'transaction_date'])
    .reset_index(drop=True)
)

### transaction 데이터 처리

In [19]:
# Per-transaction discount rate: 1 - (amount paid / list price).
# Rows whose list price is 0 are set to NaN instead of a division result, so
# they are left out of the per-user mean computed below.
list_price = transactions["plan_list_price"]
paid_ratio = transactions["actual_amount_paid"] / list_price
transactions["discount_rate"] = (1 - paid_ratio).where(list_price != 0)

# Output column -> (source column, aggregation) for the per-user summary.
per_user_spec = {
    "payment_plan_sum": ("payment_plan_days", "sum"),
    "plan_list_price": ("plan_list_price", "sum"),
    "actual_amount_paid": ("actual_amount_paid", "sum"),
    "discount_rate": ("discount_rate", "mean"),  # mean of the per-transaction rates
    "is_auto_renew": ("is_auto_renew", "mean"),
    "membership_expire_date": ("membership_expire_date", "max"),
    "is_cancel": ("is_cancel", "mean"),
    "transaction_count": ("msno", "count"),
}

# One row per user (msno), with msno restored as a regular column.
df_transaction = transactions.groupby("msno").agg(**per_user_spec).reset_index()

In [20]:
# Inspect the column names of the per-user transaction summary.
df_transaction.columns

Index(['msno', 'payment_plan_sum', 'plan_list_price', 'actual_amount_paid',
       'discount_rate', 'is_auto_renew', 'membership_expire_date', 'is_cancel',
       'transaction_count'],
      dtype='object')

In [21]:
# Preview the first two rows of the msno lookup table.
unique_label.head(2)

Unnamed: 0,msno,msno_encoded
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,3132477
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,94959


In [24]:
# Swap the string msno for its msno_encoded value in each table: inner-join on
# msno against the lookup table, then drop the original msno column. Rows whose
# msno is absent from the lookup table are dropped by the inner join.
train_en, members_en, df_transaction_en = (
    frame.merge(unique_label, on="msno", how="inner").drop(columns="msno")
    for frame in (train, df_members, df_transaction)
)

In [27]:
# Preview the first two rows of the encoded label table.
train_en.head(2)

Unnamed: 0,is_churn,is_back,msno_encoded
0,0,0,12
1,0,0,13


In [25]:
# Compare the (rows, columns) of the encoded label and member tables.
train_en.shape, members_en.shape

((961431, 3), (6769473, 6))

In [42]:
# Join labels with the per-user transaction summary, then attach member
# attributes. Both joins are inner joins on the encoded user id.
kkbox_transaction_merge = train_en.merge(df_transaction_en, on='msno_encoded', how='inner')
kkbox_merge = members_en.merge(kkbox_transaction_merge, on='msno_encoded', how='inner')

# Expose the encoded id under the name 'msno' and fix the output column order.
output_columns = [
    'msno', 'city', 'bd', 'gender', 'registered_via', 'registration_init_time',
    'is_churn', 'is_back', 'payment_plan_sum',
    'plan_list_price', 'actual_amount_paid', 'discount_rate',
    'is_auto_renew', 'membership_expire_date', 'is_cancel', 'transaction_count',
]
kkbox_merge = kkbox_merge.rename(columns={'msno_encoded': 'msno'})[output_columns]

# Save the merged table (without log features) next to the input files.
kkbox_merge.to_csv(data_dir + "kkbox_data_not_log.csv", index=False)

In [43]:
# Check the shape and the column names of the merged table.
kkbox_merge.shape, kkbox_merge.columns

((961431, 16),
 Index(['msno', 'city', 'bd', 'gender', 'registered_via',
        'registration_init_time', 'is_churn', 'is_back', 'payment_plan_sum',
        'plan_list_price', 'actual_amount_paid', 'discount_rate',
        'is_auto_renew', 'membership_expire_date', 'is_cancel',
        'transaction_count'],
       dtype='object'))

In [46]:
# Preview the first two rows of the merged table.
kkbox_merge.head(2)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,is_churn,is_back,payment_plan_sum,plan_list_price,actual_amount_paid,discount_rate,is_auto_renew,membership_expire_date,is_cancel,transaction_count
0,94959,1,0,,7,20110914,0,0,870,3981,4130,0.0,1.0,20170428,0.033333,30
1,6597367,4,30,male,9,20110916,0,0,603,2980,3129,0.0,1.0,20170517,0.0,21


In [36]:
# Preview the first two rows of the user-log summary table.
log_data.head(2)

Unnamed: 0,msno,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,log_start,log_end
0,6.0,903,260,237,200,59826,54134,14782750.0,20150101,20170331
1,12.0,763,379,359,648,2820,3690,912410.6,20161116,20170331


In [48]:
# msno is loaded as a float here (e.g. 6.0). Cast it to an explicit 64-bit
# integer rather than 'int', whose width depends on the platform and NumPy
# version, so the key's dtype is the same wherever this runs.
log_data['msno'] = log_data['msno'].astype('int64')

In [None]:
# Attach the log summary to the merged table; only users present in both
# tables are kept (inner join on msno).
kkbox_merge_final = kkbox_merge.merge(log_data, on='msno', how='inner')

# Save the final table next to the input files.
kkbox_merge_final.to_csv(data_dir + "kkbox_data_total.csv", index=False)

In [None]:
# # msno 고유값 추출
# unique_msno = pd.concat([
#     df_members[['msno']],
#     train[['msno']],
#     df_transaction[['msno']]
# ]).drop_duplicates().reset_index(drop=True)

# # msno에 대한 고유 번호 할당
# unique_msno['msno_num'] = range(1, len(unique_msno) + 1)

# # 기존 데이터프레임에 msno_num 추가
# train = train.merge(unique_msno, on='msno', how='left').drop(columns=['msno'])
# members = df_members.merge(unique_msno, on='msno', how='left').drop(columns=['msno'])
# df_transaction = df_transaction.merge(unique_msno, on='msno', how='left').drop(columns=['msno'])

# # 필요한 컬럼만 선택
# train = train[['msno_num', 'is_churn', 'is_back']]
# members = members[['msno_num', 'city', 'bd', 'gender', 'registered_via', 'registration_init_time']]
# df_transaction = df_transaction[['msno_num', 'payment_plan_sum', 'plan_list_price', 'actual_amount_paid',
#        'discount_rate', 'is_auto_renew', 'membership_expire_date', 'is_cancel',
#        'transaction_count']]

# # 변환된 데이터 확인
# print(train.head())
# print(members.head())
# print(df_transaction.head())
# # print(user_logs.head())

   msno_num  is_churn  is_back
0   1243772         0        0
1   5813287         0        0
2   3574275         0        0
3   3176456         0        0
4    397420         0        0
   msno_num  city  bd  gender  registered_via  registration_init_time
0         1     1   0     NaN              11                20110911
1         2     1   0     NaN               7                20110914
2         3     1   0     NaN              11                20110915
3         4     1   0     NaN              11                20110915
4         5     6  32  female               9                20110915
   msno_num  payment_plan_sum  plan_list_price  actual_amount_paid  \
0    681048                 7                0                   0   
1   3323218               805             3387                3387   
2   1243772               150              495                 495   
3   5813287               603             2980                3129   
4   3574275               780             38

In [None]:
# kkbox_transaction_merge = pd.merge(train, df_transaction, on='msno_num', how='inner',)
# kkbox_merge = pd.merge(members, kkbox_transaction_merge, on='msno_num', how='inner')
# kkbox_merge.to_csv(data_dir + "kkbox_data_not_log.csv", index=False)