In [170]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Hide warning messages.
# NOTE(review): this filter ignores every warning for the rest of the
# notebook, which also hides pandas diagnostics such as chained-assignment
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [171]:
# Load the combined dataset.
# NOTE(review): the 'Unnamed: 0' column visible in the info() output is
# presumably an index that was saved into the CSV; it is dropped again right
# before the final export at the end of this notebook.
df = pd.read_csv("./data/data_total.csv")

### 날짜 타입 변경 및 년/월/일 분리

In [172]:
# Inspect dtypes and non-null counts of the raw frame before any conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856566 entries, 0 to 856565
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              856566 non-null  int64  
 1   msno                    856566 non-null  int64  
 2   city                    856566 non-null  int64  
 3   bd                      856566 non-null  int64  
 4   gender                  368210 non-null  object 
 5   registered_via          856566 non-null  int64  
 6   registration_init_time  856566 non-null  object 
 7   is_churn                856566 non-null  int64  
 8   is_back                 856566 non-null  int64  
 9   payment_plan_sum        856566 non-null  int64  
 10  plan_list_price         856566 non-null  int64  
 11  actual_amount_paid      856566 non-null  int64  
 12  discount_rate           856566 non-null  float64
 13  is_auto_renew           856566 non-null  float64
 14  membership_expire_da

In [173]:
def int_to_date(df):
    """Return a copy of ``df`` with its four date columns parsed as datetimes.

    ``membership_expire_date`` and ``registration_init_time`` are parsed with
    pandas' default date parsing (the data is expected to hold 'YYYY-MM-DD'
    values). ``log_start`` and ``log_end`` are converted to strings and parsed
    with the explicit 'YYYYMMDD' format. Values that cannot be parsed become
    ``NaT`` because ``errors='coerce'`` is used; nothing is raised for them.

    The frame passed in is not modified, so re-running the calling cell on the
    same input always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``membership_expire_date``,
        ``registration_init_time``, ``log_start`` and ``log_end``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which those four columns hold datetime values.

    Raises
    ------
    KeyError
        If any of the four columns is missing.
    """
    converted = df.copy()

    # Date strings: let pandas infer the format.
    for column in ('membership_expire_date', 'registration_init_time'):
        converted[column] = pd.to_datetime(converted[column], errors='coerce')

    # Compact YYYYMMDD values: stringify first so integer columns parse too.
    # Note that a float column would stringify as e.g. '20170101.0' and be
    # coerced to NaT by the strict format.
    for column in ('log_start', 'log_end'):
        converted[column] = pd.to_datetime(
            converted[column].astype(str), format='%Y%m%d', errors='coerce'
        )

    return converted

# Run the conversion and keep working with the returned frame.
df = int_to_date(df)

In [174]:
# Check that the date columns now show a datetime64 dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856566 entries, 0 to 856565
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Unnamed: 0              856566 non-null  int64         
 1   msno                    856566 non-null  int64         
 2   city                    856566 non-null  int64         
 3   bd                      856566 non-null  int64         
 4   gender                  368210 non-null  object        
 5   registered_via          856566 non-null  int64         
 6   registration_init_time  856566 non-null  datetime64[ns]
 7   is_churn                856566 non-null  int64         
 8   is_back                 856566 non-null  int64         
 9   payment_plan_sum        856566 non-null  int64         
 10  plan_list_price         856566 non-null  int64         
 11  actual_amount_paid      856566 non-null  int64         
 12  discount_rate           856566

* 년,월,일 분리

In [175]:

def split_date_columns(df, column_name):
    """Replace a date column with separate year, month and day columns.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so
    values that cannot be parsed become ``NaT`` and yield missing
    year/month/day parts. Three columns named ``<column_name>_year``,
    ``<column_name>_month`` and ``<column_name>_day`` are appended and the
    original column is removed.

    The frame passed in is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Name of the date column to split.

    Returns
    -------
    pandas.DataFrame
        New frame without ``column_name`` and with the three part columns
        appended in year, month, day order.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # Parse into a standalone Series instead of writing back into `df`, so the
    # caller's frame is never left half-transformed.
    parsed = pd.to_datetime(df[column_name], errors='coerce')

    # drop() returns a new frame; assign() appends the parts in this order.
    return df.drop(columns=[column_name]).assign(**{
        f'{column_name}_year': parsed.dt.year,
        f'{column_name}_month': parsed.dt.month,
        f'{column_name}_day': parsed.dt.day,
    })

In [176]:
# Expand each date column into <name>_year / _month / _day, one column at a
# time and in the same order as before so the resulting column order is kept.
for date_column in (
    'registration_init_time',
    'membership_expire_date',
    'log_end',
    'log_start',
):
    df = split_date_columns(df, date_column)

In [177]:
# Confirm the date columns were replaced by their year/month/day parts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856566 entries, 0 to 856565
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    856566 non-null  int64  
 1   msno                          856566 non-null  int64  
 2   city                          856566 non-null  int64  
 3   bd                            856566 non-null  int64  
 4   gender                        368210 non-null  object 
 5   registered_via                856566 non-null  int64  
 6   is_churn                      856566 non-null  int64  
 7   is_back                       856566 non-null  int64  
 8   payment_plan_sum              856566 non-null  int64  
 9   plan_list_price               856566 non-null  int64  
 10  actual_amount_paid            856566 non-null  int64  
 11  discount_rate                 856566 non-null  float64
 12  is_auto_renew                 856566 non-nul

### 성별 NaN 값 제거 후 인코딩

In [178]:
# Keep only the rows that have a gender value.
df_filtered = df.loc[df['gender'].notna()]

In [179]:
def encode_gender(df):
    """Replace the ``gender`` column with a numeric ``gender_encoded`` column.

    ``'male'`` is encoded as 1 and ``'female'`` as 0. Any other value,
    including a missing one, has no entry in the mapping and comes through as
    NaN (it is not turned into -1), so filter or fill such rows beforehand if
    an integer column is required.

    The frame passed in is not modified. This matters because the notebook
    calls this function with a filtered subset of another frame, where
    assigning a new column in place is a chained-assignment hazard.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``gender`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``gender`` and with ``gender_encoded`` appended.

    Raises
    ------
    KeyError
        If ``df`` has no ``gender`` column.
    """
    encoded = df['gender'].map({'male': 1, 'female': 0})

    # drop() + assign() build a fresh frame, leaving the caller's untouched.
    return df.drop(columns=['gender']).assign(gender_encoded=encoded)

In [180]:
# df['gender'].isnull().sum()

In [181]:
# Encode gender on the subset that has a gender value; from here on `df`
# refers to that encoded subset rather than the full dataset.
df = encode_gender(df_filtered)

In [182]:
# Spot-check the encoded column.
df["gender_encoded"]

0         1
1         1
2         1
4         0
5         0
         ..
856540    1
856547    1
856548    0
856559    1
856564    0
Name: gender_encoded, Length: 368210, dtype: int64

### 나이 전처리

In [183]:
# Fold negative ages onto their positive counterparts.
df["bd"] = abs(df["bd"])

In [184]:
# So many rows have bd == 0 that the plain median of the column comes out as 0.
# Take the median over plausible ages only. The range 11..100 (inclusive) is
# the same one the replacement step below treats as valid, so age 11 is
# counted and out-of-range outliers above 100 cannot influence the result.
median_df_bd = df.loc[df["bd"].between(11, 100), "bd"].median()
median_df_bd

28.0

In [185]:
# Replace implausible non-zero ages (below 11 or above 100) with the median.
# Zeros are skipped by the `x != 0` test and stay 0 - presumably 0 marks an
# unknown age; confirm that leaving them in is intended.
# Mixing the float median with the integer ages is what makes the column
# float-valued in the unique() output below.
df["bd"] = df["bd"].apply(lambda x: median_df_bd if (x != 0 and (x < 11 or x > 100)) else x)

In [186]:
# Rebind df_filtered to the age-cleaned frame; the summary and export cells
# below read from df_filtered.
# NOTE(review): bd shows no nulls in the recorded info() output, so this
# filter likely removes no rows - confirm it is still needed.
df_filtered = df[df['bd'].notna()]

In [187]:
# List the distinct ages that remain after cleaning.
df['bd'].unique()

array([ 30.,  34.,  63.,  28.,  38.,  26.,  58.,  21.,  39.,  27.,  22.,
        20.,  17.,  40.,  24.,  23.,  25.,  18.,  29.,  33.,  31.,  41.,
        19.,  35.,  37.,  47.,  36.,  32.,  53.,  43.,  62.,  44.,  42.,
        45.,  48.,   0.,  56.,  52.,  49.,  51.,  16.,  50.,  46.,  54.,
        68.,  72.,  14.,  66.,  55.,  60.,  70.,  15.,  57.,  82.,  59.,
        64.,  61.,  94.,  65.,  67.,  71.,  79.,  96.,  74.,  77.,  97.,
        13.,  95.,  75.,  84.,  83.,  69.,  78., 100.,  93.,  11.,  73.,
        81.,  12.,  89.,  86.,  92.,  80.,  76.,  91.,  85.,  98.,  87.,
        88.,  90.,  99.])

### 정규화

In [188]:
# X_train, X_test, y_train, y_test = train_test_split(train_data.drop('gender',axis=1), train_data['gender'], test_size=0.3, random_state=42)

# std_scaler = StandardScaler()
# train_scaled = std_scaler.fit_transform(X_train)
# test_scaled = std_scaler.transform(X_test)

### 데이터 저장

In [None]:
# df_filtered.to_csv("./data/data_total_split_datetime.csv")

In [192]:
# Summary statistics of the cleaned frame.
df_filtered.describe()

Unnamed: 0.1,Unnamed: 0,msno,city,bd,registered_via,is_churn,is_back,payment_plan_sum,plan_list_price,actual_amount_paid,...,membership_expire_date_year,membership_expire_date_month,membership_expire_date_day,log_end_year,log_end_month,log_end_day,log_start_year,log_start_month,log_start_day,gender_encoded
count,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,...,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0,368210.0
mean,460350.246196,3382892.0,10.703778,28.867334,6.761813,0.175413,0.008343,520.178806,2495.207124,2628.273477,...,2017.017922,4.131276,16.859401,2016.956079,3.089139,27.110668,2015.299766,3.38157,9.094229,0.525067
std,277471.87728,1953541.0,5.964901,10.277352,2.558991,0.380321,0.090959,242.780554,1167.049674,1233.30414,...,0.132668,1.066317,9.371334,0.243621,1.185756,6.905647,0.537661,3.552977,9.205526,0.499372
min,1.0,13.0,1.0,0.0,3.0,0.0,0.0,7.0,1.0,0.0,...,2017.0,1.0,1.0,2015.0,1.0,1.0,2015.0,1.0,1.0,0.0
25%,215965.25,1690717.0,5.0,23.0,4.0,0.0,0.0,330.0,1584.0,1560.0,...,2017.0,4.0,9.0,2017.0,3.0,27.0,2015.0,1.0,1.0,0.0
50%,458556.0,3385528.0,13.0,28.0,7.0,0.0,0.0,514.0,2533.0,3129.0,...,2017.0,4.0,17.0,2017.0,3.0,30.0,2015.0,1.0,5.0,1.0
75%,696970.75,5070692.0,14.0,34.0,9.0,0.0,0.0,740.0,3427.0,3558.0,...,2017.0,4.0,25.0,2017.0,3.0,31.0,2016.0,5.0,16.0,1.0
max,955124.0,6769459.0,22.0,100.0,13.0,1.0,1.0,2032.0,7390.0,7426.0,...,2018.0,12.0,31.0,2017.0,12.0,31.0,2017.0,12.0,31.0,1.0


In [None]:
# Final dtype / non-null check before exporting.
df_filtered.info()

In [194]:
# Drop the leftover 'Unnamed: 0' column and write the result without an index
# so that column is not recreated when the file is read back.
# NOTE(review): "totla" in the output filename looks like a typo for "total".
# It is left unchanged here because other code may read this exact path -
# confirm before renaming.
df = df_filtered.drop(['Unnamed: 0'], axis=1)
df.to_csv('./data/data_totla_split_non_index.csv', index=False)