In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import math
import seaborn as sns


# Hide warning messages.
# NOTE(review): this blanket filter silences every warning category, pandas'
# own warnings included — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [18]:
# Load the combined dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/data_total.csv")

### 기간 파생변수 생성

In [19]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'is_churn', 'is_back', 'payment_plan_sum',
       'plan_list_price', 'actual_amount_paid', 'discount_rate',
       'is_auto_renew', 'membership_expire_date', 'is_cancel',
       'transaction_count', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100',
       'num_unq', 'total_secs', 'log_start', 'log_end'],
      dtype='object')

In [20]:
# Summarize per-column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856566 entries, 0 to 856565
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              856566 non-null  int64  
 1   msno                    856566 non-null  int64  
 2   city                    856566 non-null  int64  
 3   bd                      856566 non-null  int64  
 4   gender                  368210 non-null  object 
 5   registered_via          856566 non-null  int64  
 6   registration_init_time  856566 non-null  object 
 7   is_churn                856566 non-null  int64  
 8   is_back                 856566 non-null  int64  
 9   payment_plan_sum        856566 non-null  int64  
 10  plan_list_price         856566 non-null  int64  
 11  actual_amount_paid      856566 non-null  int64  
 12  discount_rate           856566 non-null  float64
 13  is_auto_renew           856566 non-null  float64
 14  membership_expire_da

In [21]:
def duration(df):
    """Derive day-count features from the date columns and drop the raw dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'membership_expire_date', 'registration_init_time',
        'log_start' and 'log_end'. The first two are parsed with pandas'
        default date inference (the notebook states they are 'YYYY-MM-DD');
        the two log columns are cast to str and parsed as '%Y%m%d'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four date columns and with two columns
        appended, in this order:
        'registration_duration' = membership_expire_date - registration_init_time (days),
        'listening_duration'    = log_end - log_start (days).
        The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the four date columns is missing.

    Notes
    -----
    Values that cannot be parsed become NaT (errors='coerce'), so the affected
    durations come out as NaN instead of raising.
    """
    date_columns = ['membership_expire_date', 'registration_init_time', 'log_start', 'log_end']

    # Parse into local Series rather than overwriting columns on the caller's frame.
    membership_end = pd.to_datetime(df['membership_expire_date'], errors='coerce')
    registered_at = pd.to_datetime(df['registration_init_time'], errors='coerce')

    # The log columns are compact YYYYMMDD values; cast to str so the explicit
    # format applies whether they were read as integers or as strings.
    first_log = pd.to_datetime(df['log_start'].astype(str), format='%Y%m%d', errors='coerce')
    last_log = pd.to_datetime(df['log_end'].astype(str), format='%Y%m%d', errors='coerce')

    # drop() and assign() both return new frames, so the caller's df keeps its
    # original columns and values.
    return df.drop(columns=date_columns).assign(
        registration_duration=(membership_end - registered_at).dt.days,
        listening_duration=(last_log - first_log).dt.days,
    )

# Run the function and rebind df to its result.
# NOTE: this cell cannot be re-run without reloading df — the result no longer
# has the four date columns that duration() reads.
df = duration(df)


In [23]:
# Preview the first rows of the two derived duration columns.
df[['registration_duration', 'listening_duration']].head()

Unnamed: 0,registration_duration,listening_duration
0,2070,781
1,2168,220
2,2047,817
3,2048,817
4,2021,761


### 성별 NaN 삭제

In [25]:
# Count rows whose gender value is missing.
df['gender'].isna().sum()

488356

In [26]:
# Keep only rows with a known gender. The explicit .copy() makes the result an
# independent frame, so later column assignments and in-place edits do not act
# on a slice of the loaded data (the situation pandas reports as
# SettingWithCopyWarning).
df = df[df['gender'].notna()].copy()

In [27]:
# Re-count missing gender values; expected to be 0 after filtering.
df['gender'].isna().sum()

0

In [28]:
# Summarize the frame again to see the reduced row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368210 entries, 0 to 856564
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             368210 non-null  int64  
 1   msno                   368210 non-null  int64  
 2   city                   368210 non-null  int64  
 3   bd                     368210 non-null  int64  
 4   gender                 368210 non-null  object 
 5   registered_via         368210 non-null  int64  
 6   is_churn               368210 non-null  int64  
 7   is_back                368210 non-null  int64  
 8   payment_plan_sum       368210 non-null  int64  
 9   plan_list_price        368210 non-null  int64  
 10  actual_amount_paid     368210 non-null  int64  
 11  discount_rate          368210 non-null  float64
 12  is_auto_renew          368210 non-null  float64
 13  is_cancel              368210 non-null  float64
 14  transaction_count      368210 non-null  i

* 성별 인코딩

In [30]:
# Inspect the gender labels as stored in the frame.
df['gender']

0           male
1           male
2           male
4         female
5         female
           ...  
856540      male
856547      male
856548    female
856559      male
856564    female
Name: gender, Length: 368210, dtype: object

In [31]:
def encode_gender(df):
    """Return a copy of ``df`` with the 'gender' column integer-encoded.

    Mapping: 'male' -> 1, 'female' -> 0. Every other value, including NaN,
    becomes NaN; no -1 sentinel is substituted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'gender' column holding the labels 'male' / 'female'.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'gender' replaced in its original column position.
        The input frame is not modified.

    Notes
    -----
    Applying this function to an already-encoded frame turns every value into
    NaN, because 1 and 0 are not keys of the mapping.
    """
    gender_codes = {'male': 1, 'female': 0}

    # assign() builds a new frame, so nothing is written into the caller's
    # (possibly filtered) DataFrame.
    return df.assign(gender=df['gender'].map(gender_codes))

In [32]:
# Encode gender and rebind df to the result.
# NOTE: re-running this cell on the already-encoded frame maps 1/0 to NaN,
# because only 'male' and 'female' are keys of the mapping.
df = encode_gender(df)
# List the distinct encoded values as a sanity check.
df['gender'].unique()

array([1, 0], dtype=int64)

### 나이 전처리

* 나이 절대값

In [29]:
# Replace the age column with its absolute value, so negative entries become
# positive. assign() returns a new frame instead of writing a column into the
# filtered frame.
df = df.assign(bd=df["bd"].abs())

* 나이 10 이상 100 이하인 행만 추출

In [33]:
# Keep rows whose age lies in the closed interval [10, 100]
# (Series.between includes both endpoints by default).
df = df.loc[df["bd"].between(10, 100)]

### 필요없는 컬럼 드롭

In [35]:
# Remove the columns this notebook treats as unnecessary. Rebinding df to the
# frame returned by drop() avoids an in-place edit of the filtered frame.
df = df.drop(columns=['Unnamed: 0', 'is_back', 'msno'])

In [36]:
# Summarize the final frame before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 356277 entries, 0 to 856564
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   city                   356277 non-null  int64  
 1   bd                     356277 non-null  int64  
 2   gender                 356277 non-null  int64  
 3   registered_via         356277 non-null  int64  
 4   is_churn               356277 non-null  int64  
 5   payment_plan_sum       356277 non-null  int64  
 6   plan_list_price        356277 non-null  int64  
 7   actual_amount_paid     356277 non-null  int64  
 8   discount_rate          356277 non-null  float64
 9   is_auto_renew          356277 non-null  float64
 10  is_cancel              356277 non-null  float64
 11  transaction_count      356277 non-null  int64  
 12  num_25                 356277 non-null  int64  
 13  num_50                 356277 non-null  int64  
 14  num_75                 356277 non-null  i

### 최종 데이터 저장

In [37]:
# Write the processed frame to the working directory. index=False leaves out the
# row index, which is no longer contiguous after the row filters above.
df.to_csv("Real_Total_Data.csv", index=False)