In [37]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# 데이터 생성
np.random.seed(42)
n_samples = 1000
df = pd.DataFrame({
    'num_orders': np.random.randint(0, 20, n_samples),
    'avg_order_cnt': np.random.uniform(5, 50, n_samples),
    'last_order_days': np.random.randint(1, 365, n_samples),
    'coupon_used': np.random.randint(0, 10, n_samples),
    'delivery_distance': np.random.uniform(0.5, 15, n_samples),
    'reorder': np.random.choice([0, 1], n_samples)  # 1: 재주문, 0: 미주문
})

In [31]:
df.head()

Unnamed: 0,num_orders,avg_order_cnt,last_order_days,coupon_used,delivery_distance,reorder
0,6,25.956506,255,2,5.530959,0
1,19,26.637664,266,1,6.16595,0
2,14,46.330463,149,7,2.800844,1
3,10,31.41807,164,6,4.24435,1
4,7,6.4781,140,8,9.27847,1


In [32]:
df.describe()

Unnamed: 0,num_orders,avg_order_cnt,last_order_days,coupon_used,delivery_distance,reorder
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,9.136,27.667966,181.831,4.489,7.752527,0.482
std,5.895953,12.822197,104.047067,2.906601,4.176099,0.499926
min,0.0,5.010689,1.0,0.0,0.502705,0.0
25%,4.0,16.50142,93.0,2.0,4.174122,0.0
50%,9.0,28.171204,176.0,5.0,7.812515,0.0
75%,14.0,38.749018,271.0,7.0,11.470944,1.0
max,19.0,49.970907,364.0,9.0,14.965531,1.0


In [33]:
#월평균 주문 횟수 (monthly_avg_orders)
df['monthly_avg_orders'] = df['num_orders'] / df['last_order_days']
#조건
df['monthly_avg_orders'] = df['last_order_days'].apply(lambda x: 30 if x < 30 else x)

In [42]:
#할인 쿠폰 사용률 (coupon_usage_rate)
df['coupon_usage_rate'] = df['num_orders'] / df['coupon_used']
display(df['coupon_usage_rate'].values)

#조건
df['coupon_usage_rate'] = df['num_orders'].apply(lambda x: 0 if x == 0 else x)
display(df['coupon_usage_rate'].values)


array([ 3.        , 19.        ,  2.        ,  1.66666667,  0.875     ,
        6.        ,  6.        ,  2.        ,  5.        ,  0.42857143,
        3.5       ,  2.        ,  0.5       ,  2.2       ,  5.        ,
        0.14285714,         nan,  1.83333333, 11.        ,  1.77777778,
        1.8       , 15.        , 14.        ,  1.55555556,  2.57142857,
       11.        ,  3.16666667,  1.        ,  0.57142857,  3.        ,
        0.85714286,  1.33333333,  1.2       ,  2.83333333,  0.75      ,
        2.16666667,         inf,         inf,  0.125     ,  4.75      ,
        3.5       ,  3.        ,  2.75      ,  3.5       ,  1.75      ,
        0.25      ,  2.6       ,         inf,  0.5       ,  2.83333333,
        0.77777778,  0.33333333,  0.33333333,  0.71428571,  1.8       ,
        1.        ,  5.66666667, 11.        ,  1.        ,  3.        ,
        0.75      ,  1.85714286,  3.        ,  4.66666667,  1.16666667,
        1.85714286,  7.        ,  2.5       , 12.        , 17.  

array([ 6, 19, 14, 10,  7,  6, 18, 10, 10,  3,  7,  2,  1, 11,  5,  1,  0,
       11, 11, 16,  9, 15, 14, 14, 18, 11, 19,  2,  4, 18,  6,  8,  6, 17,
        3, 13, 17,  8,  1, 19, 14,  6, 11,  7, 14,  2, 13, 16,  3, 17,  7,
        3,  1,  5,  9,  3, 17, 11,  1,  9,  3, 13, 15, 14,  7, 13,  7, 15,
       12, 17, 14, 12,  8, 14, 12,  0,  6,  8,  0, 11,  7, 10, 18, 16,  7,
        2,  2,  0,  4,  9,  6,  8,  6,  8,  7, 11,  1,  0, 15,  4,  2, 11,
        7,  2,  0,  2,  4, 14, 13,  2,  0,  4, 13,  6,  8, 14, 14,  9, 12,
       18,  6, 16, 19,  3,  4,  6, 12, 14, 10,  3, 12,  6, 18,  1,  9, 12,
        5, 11, 11, 19, 10,  6,  0,  0, 19, 12,  8,  2,  6,  5,  7,  8,  4,
        0, 18,  9, 11, 14,  8, 19, 16, 16, 19, 11,  6,  1,  2, 16,  4, 16,
       16, 16,  1,  1,  4,  0,  0, 18,  1, 11,  5,  3, 10, 16,  5,  4, 19,
        1,  5, 10, 15, 15,  0,  8,  5, 15,  2, 19,  3, 18,  2, 18, 19,  6,
       19,  8,  0,  7,  6, 17,  7,  0, 10, 17,  9,  2,  6, 15, 15, 19, 16,
        1,  0, 15, 11,  4

In [35]:
# 주문 횟수 세그먼트(order_category): num_orders 활용 범주형 변수 생성
def categorize_orders(x):
    if x <= 16:
        return "Low"
    elif x <= 28 :
        return 'Medium'
    else:
        return "High"

df['order_category'] = df['num_orders'].apply(categorize_orders)
df['order_category']

0         Low
1      Medium
2         Low
3         Low
4         Low
        ...  
995       Low
996    Medium
997       Low
998       Low
999       Low
Name: order_category, Length: 1000, dtype: object

In [38]:
# 범주형 변수를 원-핫 인코딩
df = pd.get_dummies(df, columns=['order_category'], drop_first=True)

# x, y 분리
X = df.drop(columns=['reorder'])
y = df['reorder']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM 모델 학습을 위한 파라미터 탐색
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 40],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'metric': ['binary_error'],
}

lgb_model = lgb.LGBMClassifier(verbose=-1)
grid_search = GridSearchCV(lgb_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적 모델로 재학습
best_params = grid_search.best_params_
lgb_model = lgb.LGBMClassifier(**best_params, verbose=-1)
lgb_model.fit(X_train, y_train)

# 예측 수행
y_pred = lgb_model.predict(X_test)

# 혼동 행렬 계산 및 시각화
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Reordered', 'Reordered'], yticklabels=['Not Reordered', 'Reordered'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# 평가 지표 출력
print("Best Parameters:", best_params)
print("Accuracy:", accuracy_score(y_test, y_pred))

KeyError: "None of [Index(['order_category'], dtype='object')] are in the [columns]"