# SMOTENC O 학습용 전처리

In [58]:
import numpy as np
import pandas as pd

newspaper_df = pd.read_csv('preprocessed_newspaper.csv')

---

## 소득 구간 & 나이 구간 수치형으로 변환하기

### 1.나이 구간 수치형 데이터로 변환
- 성인 기준: 18살 이상
    - https://www.law.cornell.edu/wex/age_of_majority#:~:text=The%20age%20of%20majority%20refers%20to%20the,or%20the%20right%20to%20sign%20a%20contract.
- 미국 2024년 평균 기대수명: 78.4살
    - https://www.cdc.gov/nchs/fastats/life-expectancy.htm#:~:text=Both%20sexes:%2078.4%20years.%20Males:%2075.8%20years.%20Females:%2081.1%20years.

In [None]:
age_group_bounds = {
    '24 years or less': (18, 24), # 24세 이하 (최소값은 임의로 18로 설정)
    '25-29': (25, 29),
    '30-34': (30, 34),
    '35-39': (35, 39),
    '40-44': (40, 44),
    '45-49': (45, 49),
    '50-54': (50, 54),
    '55-59': (55, 59),
    '60-64': (60, 64),
    '65-69': (65, 69),
    '70-74': (70, 74), 
    '75 years or more': (75, 80) # 나이 최대 값은 80세
}

# 나이 부여 함수
def sample_age(age_range, n_samples):
    low, high = age_group_bounds[age_range]
    mean = (low + high) / 2  # 평균은 범위의 중앙값
    std = (high - low) / 4   # 표준편차는 범위의 1/4 정도로 설정 (약 95%가 범위 안에 들도록)
    sampled = np.random.normal(loc=mean, scale=std, size=n_samples) # 정규분포 샘플링
    return np.clip(np.round(sampled), low, high).astype(int) # 정수로 반올림하고 범위 내로 제한


# Age 컬럼 생성
newspaper_df['Age'] = newspaper_df['Age range'].apply(
    lambda group: sample_age(group, 1)[0] if pd.notnull(group) else np.nan
)


### 2. 소득 구간 수치형 데이터로 변환

In [None]:
# 1. 소득 구간별 하한/상한 정의
income_bounds = {
    'Under $20,000': (10000, 19999),
    '$  20,000 - $29,999': (20000, 29999),
    '$  30,000 - $39,999': (30000, 39999),
    '$  40,000 - $49,999': (40000, 49999),
    '$  50,000 - $59,999': (50000, 59999),
    '$  60,000 - $69,999': (60000, 69999),
    '$  70,000 - $79,999': (70000, 79999),
    '$  80,000 - $89,999': (80000, 89999),
    '$  90,000 - $99,999': (90000, 99999),
    '$100,000 - $124,999': (100000, 124999),
    '$125,000 - $149,999': (125000, 149999),
    '$150,000 - $174,999': (150000, 174999),
    '$175,000 - $199,999': (175000, 199999),
    '$200,000 - $249,999': (200000, 249999),
    '$250,000 - $299,999': (250000, 299999),
    '$300,000 - $399,999': (300000, 399999),
    '$400,000 - $499,999': (400000, 499999),
    '$500,000 Plus': (500000, 750000),  # 상위 1% 소득 하한($631,500)을 감안해서 유연함 반영 (출처: DQYDJ, 2024)
}

# 2. 로그 정규분포 기반 샘플링 함수 정의
def sample_log_normal_income(income_range, n_samples=1):
    if income_range not in income_bounds or pd.isnull(income_range):
        return np.nan

    low, high = income_bounds[income_range]
    mu = np.log((low + high) / 2)
    sigma = 0.4  # 분포의 퍼짐 정도 (값이 작을수록 중앙에 집중, 클수록 상한/하한 근처까지 다양하게 퍼짐 => 0.4, 0.5가 중간값 중심 분포를 유지하면서도 현실적인 다양성을 부여하는 수준)

    sample = np.random.lognormal(mean=mu, sigma=sigma, size=n_samples)
    clipped = np.clip(sample, low, high)
    return int(round(clipped[0]))

# 3. 적용: 새로운 'Income' 컬럼 생성
newspaper_df['Income'] = newspaper_df['HH Income'].apply(sample_log_normal_income)


# 기존 불필요 컬럼 제거
converted_df = newspaper_df.drop(['HH Income', 'Age range'], axis=1)

## SMOTENC 적용

In [63]:
from imblearn.over_sampling import SMOTENC
from collections import Counter

# 타깃 / 피처 분리
y = converted_df['is_churned'].astype(int)
X = converted_df.drop(columns=['is_churned']).copy()

# 범주형 / 수치형 컬럼 지정
cat_cols = [
    'Home Ownership','Ethnicity','dummy for Children',
    'Language','City','County','weekly fee',
    'Deliveryperiod','Nielsen Prizm','Source Channel'
]
num_cols = ['Year Of Residence', 'reward program', 'Age', 'Income']

# 실제 존재하는 컬럼만 남기기
cat_cols = [c for c in cat_cols if c in X.columns]
num_cols = [c for c in num_cols if c in X.columns]

# 범주형을 정수 코드로 변환 (문자열 복원 없음)
X_enc = X.copy()
for c in cat_cols:
    X_enc[c] = pd.Categorical(X_enc[c]).codes

# SMOTENC용 범주형 컬럼 인덱스
cat_idx = [X_enc.columns.get_loc(c) for c in cat_cols]

# k_neighbors 설정 (소수 클래스 크기 고려)
class_counts = Counter(y)
minority_n = min(class_counts.values())
k_neighbors = max(1, min(5, minority_n - 1))

# SMOTENC 적용 (전체 데이터셋)
smote = SMOTENC(
    categorical_features=cat_idx,
    random_state=42,
    k_neighbors=k_neighbors
)
X_res, y_res = smote.fit_resample(X_enc, y)

# DataFrame 변환 (원래 컬럼명 유지, float 그대로 둠)
X_res_df = pd.DataFrame(X_res, columns=X_enc.columns)
y_res_sr = pd.Series(y_res, name='is_churned')

# 최종 DataFrame 합치기
yes_SMOTENC_yes_convert_df = pd.concat([X_res_df, y_res_sr], axis=1)

# 결과 확인
print("클래스 개수(변경 전):", dict(class_counts))
print("클래스 개수(변경 후):", y_res_sr.value_counts().to_dict())
print("원본 데이터 크기:", converted_df.shape)
print("오버샘플링 데이터 크기:", yes_SMOTENC_yes_convert_df.shape)
print("CSV 저장 완료: newspaper_oversampled_raw.csv")

클래스 개수(변경 전): {1: 12434, 0: 3004}
클래스 개수(변경 후): {1: 12434, 0: 12434}
원본 데이터 크기: (15438, 15)
오버샘플링 데이터 크기: (24868, 15)
CSV 저장 완료: newspaper_oversampled_raw.csv


In [64]:
yes_SMOTENC_yes_convert_df

Unnamed: 0,Home Ownership,Ethnicity,dummy for Children,Year Of Residence,Language,City,County,weekly fee,Deliveryperiod,Nielsen Prizm,reward program,Source Channel,Age,Income,is_churned
0,1,23,0,1,12,32,0,11,0,1,0,6,26,30000,1
1,0,72,1,14,37,38,1,2,18,5,0,37,53,513408,0
2,0,33,1,7,7,21,1,2,18,5,0,37,46,124999,0
3,0,17,0,23,7,28,1,4,18,5,1,27,58,249999,1
4,0,33,0,23,7,42,1,12,20,4,0,9,62,50000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24863,0,17,0,17,7,17,1,5,0,1,0,14,67,40000,0
24864,1,27,0,4,7,40,1,11,0,4,0,14,57,19999,0
24865,0,17,1,18,7,21,1,1,18,2,0,37,47,174999,0
24866,0,15,0,14,7,21,1,7,18,5,0,37,52,509962,0


In [65]:
# CSV 저장
yes_SMOTENC_yes_convert_df.to_csv("yes_SMOTENC_yes_convert.csv", index=False)