# SMOTENC 미적용(X) 학습용 전처리

In [23]:
import numpy as np
import pandas as pd

# Load the preprocessed newspaper data from CSV.
newspaper_df = pd.read_csv(filepath_or_buffer='preprocessed_newspaper.csv')

---

## 소득 구간 & 나이 구간 수치형으로 변환하기

### 1. 나이 구간 수치형 데이터로 변환
- 성인 기준: 18살 이상
    - https://www.law.cornell.edu/wex/age_of_majority#:~:text=The%20age%20of%20majority%20refers%20to%20the,or%20the%20right%20to%20sign%20a%20contract.
- 미국 2024년 평균 기대수명: 78.4살
    - https://www.cdc.gov/nchs/fastats/life-expectancy.htm#:~:text=Both%20sexes:%2078.4%20years.%20Males:%2075.8%20years.%20Females:%2081.1%20years.

In [None]:
# Inclusive (lower, upper) age bounds for every 'Age range' label.
age_group_bounds = {
    '24 years or less': (18, 24),  # open-ended bin; lower bound arbitrarily set to 18
    '25-29': (25, 29),
    '30-34': (30, 34),
    '35-39': (35, 39),
    '40-44': (40, 44),
    '45-49': (45, 49),
    '50-54': (50, 54),
    '55-59': (55, 59),
    '60-64': (60, 64),
    '65-69': (65, 69),
    '70-74': (70, 74),
    '75 years or more': (75, 80),  # open-ended bin; upper bound capped at 80
}


def sample_age(age_range, n_samples):
    """Draw integer ages for one age-range label.

    Ages are sampled from a normal distribution centred on the middle of the
    bin, rounded to whole years and clipped to the bin's bounds.

    Args:
        age_range: A key of ``age_group_bounds``.
        n_samples: Number of ages to draw (passed to ``np.random.normal`` as ``size``).

    Returns:
        Integer array of sampled ages, each within the bin's bounds.

    Raises:
        KeyError: If ``age_range`` is not a key of ``age_group_bounds``.
    """
    lower, upper = age_group_bounds[age_range]
    center = (lower + upper) / 2
    # A quarter of the bin width: roughly 95% of draws (+/- 2 sigma) fall inside the bin.
    spread = (upper - lower) / 4
    draws = np.random.normal(loc=center, scale=spread, size=n_samples)
    whole_years = np.round(draws)
    bounded = np.clip(whole_years, lower, upper)
    return bounded.astype(int)


# Build the numeric 'Age' column: one sampled age per row, NaN where the age range is missing.
newspaper_df['Age'] = newspaper_df['Age range'].apply(
    lambda age_label: np.nan if pd.isnull(age_label) else sample_age(age_label, 1)[0]
)

### 2. 소득 구간 수치형 데이터로 변환

In [29]:
# 1. Inclusive (lower, upper) bounds in dollars for every 'HH Income' bracket label.
income_bounds = {
    'Under $20,000': (10000, 19999),
    '$  20,000 - $29,999': (20000, 29999),
    '$  30,000 - $39,999': (30000, 39999),
    '$  40,000 - $49,999': (40000, 49999),
    '$  50,000 - $59,999': (50000, 59999),
    '$  60,000 - $69,999': (60000, 69999),
    '$  70,000 - $79,999': (70000, 79999),
    '$  80,000 - $89,999': (80000, 89999),
    '$  90,000 - $99,999': (90000, 99999),
    '$100,000 - $124,999': (100000, 124999),
    '$125,000 - $149,999': (125000, 149999),
    '$150,000 - $174,999': (150000, 174999),
    '$175,000 - $199,999': (175000, 199999),
    '$200,000 - $249,999': (200000, 249999),
    '$250,000 - $299,999': (250000, 299999),
    '$300,000 - $399,999': (300000, 399999),
    '$400,000 - $499,999': (400000, 499999),
    # Open-ended bracket; the cap leaves room around the top-1% income
    # threshold ($631,500; source: DQYDJ, 2024).
    '$500,000 Plus': (500000, 750000),
}


# 2. Sampling from a log-normal distribution truncated to the bracket.
def sample_log_normal_income(income_range, n_samples=1, sigma=0.4):
    """Sample household income(s) for one income-bracket label.

    Draws come from a log-normal distribution whose median is the bracket
    midpoint. Draws that fall outside the bracket are redrawn rather than
    clipped: with ``sigma=0.4`` most raw draws land outside the narrower
    brackets, so clipping would collapse the majority of values onto the two
    bracket edges instead of spreading them inside the bracket.

    Args:
        income_range: A key of ``income_bounds``. Missing values and unknown
            labels are accepted.
        n_samples: Number of incomes to draw.
        sigma: Spread of the underlying log-normal distribution (smaller
            values concentrate draws around the midpoint).

    Returns:
        ``np.nan`` if ``income_range`` is missing or not a known label;
        a single ``int`` when ``n_samples`` is 1; otherwise an integer array
        of length ``n_samples``. All values lie within the bracket bounds.
    """
    # NaN / None / unknown labels are never dictionary keys, so one lookup covers them all.
    bounds = income_bounds.get(income_range)
    if bounds is None:
        return np.nan

    low, high = bounds
    mu = np.log((low + high) / 2)

    samples = np.random.lognormal(mean=mu, sigma=sigma, size=n_samples)
    rejected = (samples < low) | (samples > high)
    # Redraw only the out-of-range values; the round limit guards against an
    # endless loop when a very large sigma makes in-range draws extremely rare.
    for _ in range(1000):
        n_rejected = int(rejected.sum())
        if n_rejected == 0:
            break
        samples[rejected] = np.random.lognormal(mean=mu, sigma=sigma, size=n_rejected)
        rejected = (samples < low) | (samples > high)
    # Last-resort safety net for draws still outside after the round limit.
    samples = np.clip(samples, low, high)

    if n_samples == 1:
        return int(round(samples[0]))
    return np.round(samples).astype(int)

# 3. Apply the sampler row by row to build the numeric 'Income' column.
newspaper_df['Income'] = newspaper_df['HH Income'].apply(
    lambda income_label: sample_log_normal_income(income_label)
)

In [30]:
# Drop the original income-bracket and age-range columns.
converted_df = newspaper_df.drop(columns=['HH Income', 'Age range'])

## Label 인코딩

In [31]:
# scikit-learn components used by this cell; imported here so the cell also
# runs on a fresh kernel.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Separate the target from the features.
y = converted_df['is_churned'].astype(int)
X = converted_df.drop(columns=['is_churned']).copy()

# Columns to treat as categorical.
cat_cols_list = [
    'Home Ownership', 'County', 'dummy for Children',
    'Ethnicity', 'Language', 'City', 'Deliveryperiod',
    'Source Channel', 'weekly fee', 'Nielsen Prizm'
]

# Keep only the categorical columns that actually exist in the data.
cat_cols = [c for c in cat_cols_list if c in X.columns]

# Numeric columns: int64/float64 columns that are not listed as categorical.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in cat_cols]

# Scaler for numeric columns and ordinal encoder for categorical columns;
# categories unknown to the fitted encoder are encoded as -1.
scaler = StandardScaler()
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Combine both steps; columns in neither list are dropped.
preprocess_encoding = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_cols),  # numeric -> standardized
        ('ord', ord_enc, cat_cols)      # categorical -> ordinal codes
    ],
    remainder='drop'
)

# Fit on X and transform it.
X_encoded = preprocess_encoding.fit_transform(X)

# Output column names: numeric columns first, then categorical columns
# (ordinal encoding keeps one output column per input column).
encoded_cols = numeric_cols + cat_cols

# Rebuild a DataFrame that keeps the original row index.
no_SMOTE_yes_convert_df = pd.DataFrame(X_encoded, columns=encoded_cols, index=X.index)

# Append the target column.
no_SMOTE_yes_convert_df['is_churned'] = y.values

In [32]:
# Display the encoded DataFrame (scaled numeric columns, ordinal-encoded categorical columns, target).
no_SMOTE_yes_convert_df

Unnamed: 0,Year Of Residence,reward program,Age,Income,Home Ownership,County,dummy for Children,Ethnicity,Language,City,Deliveryperiod,Source Channel,weekly fee,Nielsen Prizm,is_churned
0,-1.067116,-0.115900,-1.504528,-0.729433,1.0,0.0,0.0,23.0,12.0,32.0,0.0,6.0,11.0,1.0,1
1,0.046968,-0.115900,0.034557,5.485383,0.0,1.0,1.0,72.0,37.0,38.0,18.0,37.0,2.0,5.0,0
2,-0.552923,-0.115900,-0.350215,0.111530,0.0,1.0,1.0,33.0,7.0,21.0,18.0,37.0,2.0,5.0,0
3,0.818257,-0.012403,0.419328,0.775464,0.0,1.0,0.0,17.0,7.0,28.0,18.0,27.0,4.0,5.0,1
4,0.818257,-0.115900,0.547585,-0.481364,0.0,1.0,0.0,33.0,7.0,42.0,20.0,9.0,12.0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15433,-0.895718,-0.115900,0.226942,-0.729442,1.0,1.0,0.0,17.0,7.0,1.0,0.0,9.0,2.0,1.0,1
15434,-0.895718,-0.115900,-1.696914,-0.203383,0.0,1.0,0.0,33.0,7.0,16.0,18.0,37.0,2.0,5.0,1
15435,-0.895718,-0.115900,-1.632786,-0.198302,0.0,1.0,0.0,33.0,7.0,16.0,18.0,37.0,1.0,5.0,1
15436,-0.124429,-0.115900,0.611714,0.332847,0.0,1.0,1.0,32.0,7.0,12.0,0.0,18.0,4.0,5.0,1


In [33]:
# Save the encoded dataset as CSV, leaving out the row index.
no_SMOTE_yes_convert_df.to_csv(path_or_buf='no_SMOTE_yes_convert.csv', index=False)