## 전자상거래 배송 데이터

### 제품 배송 시간에 맞춰 배송되었는지 예측모델 만들기
학습용 데이터 (X_train, y_train)을 이용하여 배송 예측 모형을 만든 후, 이를 평가용 데이터(X_test)에 적용하여 얻은 예측값을 다음과 같은 형식의 CSV파일로 생성하시오(제출한 모델의 성능은 ROC-AUC 평가지표에 따라 채점)

![](./extrafiles/exam02.png)

[시험용 데이터셋 만들기] 
코드는 예시문제와 동일한 형태의 X_train, y_train, X_test 데이터를 만들기 위함임

(유의사항)

성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피처엔지니어링,   
분류알고리즘, 하이퍼파라미터 튜닝, 모형 앙상블 등이 수반되어야 한다.   
수험번호.csv파일이 만들어지도록 코드를 제출한다.
제출한 모델의 성능은 ROC-AUC형태로 읽어드린다.

## 분류모형
확률적 모형
- 확률적 생성 모형 : LDA, QDA, 나이브 베이지안
- 확률적 판별 모형 : 로지스틱회귀, 의사결정나무, **랜덤포레스트**

판별함수모형
- 인공신경망
- 커널SVM
- 퍼셉트론


In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.width", 2000)

# 라이브러리 및 데이터 불러오기
df = pd.read_csv('./extrafiles/Train.csv', engine='python')
df

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [3]:
# 시험용 데이터 분리
X_train = df.loc[:8000, 'Warehouse_block':'Weight_in_gms']
y_train = df.loc[:8000, 'Reached.on.Time_Y.N']
X_test = df.loc[8000:, 'Warehouse_block':'Weight_in_gms']
y_test = df.loc[8000:, 'Reached.on.Time_Y.N']
X_test_id = df.loc[8000:, 'ID']

In [4]:
# 이상치 교정
# X_train 의 Discount_offered 컬럼의 이상치 확인

# 사분위 편차를 확인 하여 조정
X_25 = X_train['Discount_offered'].quantile(0.25)
X_75 = X_train['Discount_offered'].quantile(0.75)
X_diff = (X_75 - X_25) * 1.5

# 이상치의 제거
# X_train = X_train[(X_train['Discount_offered'] < X_75 + X_diff) &
#                   (X_train['Discount_offered'] > X_25 - X_diff) ]
# X_train

# 이상치 대체
X_train['Discount_offered'][(X_train['Discount_offered'] > X_75 + X_diff)] = X_75 + X_diff
X_train['Discount_offered'][(X_train['Discount_offered'] < X_25 - X_diff)] = X_25 - X_diff
X_train

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,D,Flight,4,2,177,3,low,F,44.0,1233
1,F,Flight,4,5,216,2,low,M,51.5,3088
2,A,Flight,2,2,183,4,low,M,48.0,3374
3,B,Flight,3,3,176,4,medium,M,10.0,1177
4,C,Flight,2,2,184,3,medium,F,46.0,2484
...,...,...,...,...,...,...,...,...,...,...
7996,C,Ship,5,3,262,5,low,F,8.0,1493
7997,F,Ship,6,1,306,4,low,F,4.0,1449
7998,D,Ship,6,4,276,4,low,F,2.0,1120
7999,F,Ship,3,2,183,5,low,F,2.0,1158


In [5]:
# 더미변수화
X_train.columns
# 'ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms

# train 데이터 변환
X_num_train = X_train[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 
           'Prior_purchases', 'Discount_offered', 'Weight_in_gms']]
X_cat_train = X_train[['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']]
X_cat_train = pd.get_dummies(X_cat_train)
X_cat_train

# test 데이터 변환
X_num_test = X_test[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 
           'Prior_purchases', 'Discount_offered', 'Weight_in_gms']]
X_cat_test = X_test[['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']]
X_cat_test = pd.get_dummies(X_cat_test)
X_cat_test

Unnamed: 0,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_high,Product_importance_low,Product_importance_medium,Gender_F,Gender_M
8000,1,0,0,0,0,0,0,1,0,1,0,0,1
8001,0,1,0,0,0,0,0,1,0,1,0,0,1
8002,0,0,1,0,0,0,0,1,0,1,0,1,0
8003,0,0,0,0,1,0,0,1,0,0,1,0,1
8004,0,0,0,1,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10994,1,0,0,0,0,0,0,1,0,0,1,1,0
10995,0,1,0,0,0,0,0,1,0,0,1,1,0
10996,0,0,1,0,0,0,0,1,0,1,0,1,0
10997,0,0,0,0,1,0,0,1,0,0,1,0,1


In [6]:
# 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_num_train)

# train 데이터 변환
X_train_scaled = scaler.transform(X_num_train)
X_train_scaled

# test 데이터 변환
X_test_scaled = scaler.transform(X_num_test)
X_test_scaled

array([[0.6       , 0.25      , 0.6682243 , 0.375     , 0.        ,
        0.07552958],
       [1.        , 0.25      , 0.88317757, 0.25      , 0.01980198,
        0.0715851 ],
       [0.6       , 0.75      , 0.62149533, 0.5       , 0.15841584,
        0.10693937],
       ...,
       [0.6       , 0.75      , 0.68224299, 0.375     , 0.05940594,
        0.02249817],
       [0.6       , 0.25      , 0.59345794, 0.5       , 0.01980198,
        0.03053324],
       [0.        , 1.        , 0.27570093, 0.375     , 0.0990099 ,
        0.09320672]])

In [7]:
# 데이터 병합
X_train = pd.concat([X_num_train, X_cat_train], axis=1)
X_test = pd.concat([X_num_test, X_cat_test], axis=1)

# 데이터 준비 완성
X_train

Unnamed: 0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Warehouse_block_A,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_high,Product_importance_low,Product_importance_medium,Gender_F,Gender_M
0,4,2,177,3,44.0,1233,0,0,0,1,0,1,0,0,0,1,0,1,0
1,4,5,216,2,51.5,3088,0,0,0,0,1,1,0,0,0,1,0,0,1
2,2,2,183,4,48.0,3374,1,0,0,0,0,1,0,0,0,1,0,0,1
3,3,3,176,4,10.0,1177,0,1,0,0,0,1,0,0,0,0,1,0,1
4,2,2,184,3,46.0,2484,0,0,1,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7996,5,3,262,5,8.0,1493,0,0,1,0,0,0,0,1,0,1,0,1,0
7997,6,1,306,4,4.0,1449,0,0,0,0,1,0,0,1,0,1,0,1,0
7998,6,4,276,4,2.0,1120,0,0,0,1,0,0,0,1,0,1,0,1,0
7999,3,2,183,5,2.0,1158,0,0,0,0,1,0,0,1,0,1,0,1,0


In [9]:
# 모델 학습
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier('max_features': 'sqrt', 'n_estimators': 10).fit(X_train, y_train)
pred_train = model.predict(X_train)

In [15]:
# predict_proba 존재여부
pred_train_proba = model.predict_proba(X_train)
pred_train_proba

array([[0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       ...,
       [0.2 , 0.8 ],
       [0.15, 0.85],
       [0.23, 0.77]])

In [10]:
# 모델 평가
print("Model Score : ", model.score(X_train, y_train))

# ROC_AUC_SCORE 확인
from sklearn.metrics import roc_auc_score
socre = roc_auc_score(y_train, pred_train)
print("ROC AUC Score : ", socre)

Model Score :  1.0
ROC AUC Score :  1.0


In [11]:
# 일반화 성능 확인 cross_val_score 확인
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits =5, random_state=1234, shuffle=True)
score = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=kfold)
print(score)
print(score.mean())

[0.69706433 0.705625   0.700625   0.70875    0.698125  ]
0.7020378669581511


In [24]:
# 하이퍼 파라미터 최적화 - Random Search
from scipy.stats import randint
param_grid = {'n_estimators': range(10, 500, 50),'max_features' : ['auto', 'sqrt', 'log2']}

from sklearn.model_selection import GridSearchCV
random_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
random_search.fit(X_train, y_train)





GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(10, 500, 50)})

In [25]:
print('Best Parameter : {}'.format(random_search.best_params_))
print('Best Score : {:.4f}'.format(random_search.best_score_))
print()

Best Parameter : {'max_features': 'sqrt', 'n_estimators': 10}
Best Score : 0.6555



In [12]:
# test 데이터로 예측 정답을 생성
pred_test = model.predict(X_test)

In [13]:
# 모델 평가
print("Model Score : ", model.score(X_test, y_test))

# ROC_AUC_SCORE 확인
from sklearn.metrics import roc_auc_score
socre = roc_auc_score(y_test, pred_test)
print("ROC AUC Score : ", socre)

Model Score :  0.5151717239079693
ROC AUC Score :  0.49911499707504825
