# 분류 모델 - 정시 도착 여부(Reached.on.Time_Y.N) 예측

1. 각 분류기별 성능을 정렬해본다.
2. 앙상블로 조합해본다


In [103]:
import numpy as np
import pandas as pd

# Load the training data, list its columns, then leave the frame as the
# cell's last expression so the notebook renders it.
train_csv_path = './extrafiles/Train.csv'
df = pd.read_csv(train_csv_path)
print(df.columns)
df

Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N'], dtype='object')


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [104]:
# Check for outliers and missing values (started at 10:00).
# The next two expressions are neither printed nor the last statement of the
# cell, so they are evaluated without being shown.
df.isna().sum()  # author's note: no missing values

df.describe()  # author's note: only Discount_offered falls outside the expected range

# Cap outliers at the 1.5 * IQR fences: values are clamped, no rows are dropped.
outlier_columns = ['Discount_offered']
for column in outlier_columns:
    print(column, " 이상치 제거")
    lower_quartile = df[column].quantile(0.25)
    upper_quartile = df[column].quantile(0.75)
    fence_width = 1.5 * (upper_quartile - lower_quartile)
    lower_fence = lower_quartile - fence_width
    upper_fence = upper_quartile + fence_width
    # Both masks are computed before any write, so neither sees clamped data.
    below_fence = df[column] < lower_fence
    above_fence = df[column] > upper_fence
    df.loc[below_fence, column] = lower_fence
    df.loc[above_fence, column] = upper_fence

Discount_offered  이상치 제거


In [105]:
# Separate the categorical and the continuous feature columns.
X_cat = df[['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']]
X_num = df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
            'Prior_purchases', 'Discount_offered', 'Weight_in_gms']]

# One-hot encode the categorical columns.
X_cat_dummy = pd.get_dummies(X_cat)

# Identifier column and target.
X_id = df['ID']
y = df['Reached.on.Time_Y.N']

# Choose the train/test rows BEFORE scaling. Splitting row positions with the
# same random_state and stratify labels picks the rows that splitting the
# feature frame itself would pick.
from sklearn.model_selection import train_test_split
train_rows, test_rows = train_test_split(np.arange(len(df)), random_state=3345, stratify=y)

# Min-max scaling: learn min/max from the training rows only, then apply the
# same transform to every row. Fitting on the full data would leak test-set
# statistics into the training features. Test values may fall slightly
# outside [0, 1] as a result, which is expected.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_num.iloc[train_rows])
X_num_scaled = pd.DataFrame(scaler.transform(X_num), index=X_num.index, columns=X_num.columns)

# Merge the scaled numeric and the one-hot encoded features.
X_temp = pd.concat([X_num_scaled, X_cat_dummy], axis=1)

# Materialise the split.
X_train, X_test = X_temp.iloc[train_rows], X_temp.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Stratification check: the positive rate should match across the two splits.
print(y_train.mean())
print(y_test.mean())

0.5966783852588192
0.5967272727272728


In [106]:
# Model: stacking ensemble (SVC + random forest, logistic-regression meta-learner).
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

estimators = [('a', SVC()), ('b', RandomForestClassifier())]

# Fit once on the whole training split; this fitted model is used for predictions.
model_stack = StackingClassifier(estimators=estimators,
                                 final_estimator=LogisticRegression())
model_stack.fit(X_train, y_train)

# 5-fold cross-validation of a fresh, identically configured stack.
kfold = KFold(n_splits=5, random_state=456, shuffle=True)
cv_stack = StackingClassifier(estimators=estimators,
                              final_estimator=LogisticRegression())
score = cross_val_score(cv_stack, X_train, y_train, cv=kfold)

print("CV Score : ", score)
print("CV Score mean : ", score.mean())

CV Score :  [0.65272727 0.65090909 0.64060606 0.64545455 0.63614312]
CV Score mean :  0.64516801734752


In [18]:
# Baseline: logistic regression on its own, scored on the same folds.
baseline_clf = LogisticRegression()
score = cross_val_score(baseline_clf, X_train, y_train, cv=kfold)
for label, value in (("CV Score : ", score), ("CV Score mean : ", score.mean())):
    print(label, value)

CV Score :  [0.63878788 0.62242424 0.64       0.64181818 0.62886598]
CV Score mean :  0.6343792564823493


In [46]:
# Predict labels for the training split with the fitted stacking model.
pred_train = model_stack.predict(X_train)

In [55]:
# Classification report for the training-split predictions.
# Both arguments come from the split the model was fitted on, so these figures
# describe fit to the training data rather than held-out performance.
from sklearn.metrics import classification_report
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3327
           1       1.00      1.00      1.00      4922

    accuracy                           1.00      8249
   macro avg       1.00      1.00      1.00      8249
weighted avg       1.00      1.00      1.00      8249



In [21]:
# Predict class probabilities for the test split; column 1 is the probability
# of the positive class (label 1).
pred_test = model_stack.predict_proba(X_test)
print(pred_test[:, 1])

# Join with X_id: look the IDs up by the test rows' index labels and pair them
# with the positive-class probability. The original line was a syntax error
# with empty placeholder keys, and it passed the whole 2-D probability array.
result = pd.DataFrame({'cust_id': X_id.loc[X_test.index].to_numpy(),
                       'reacheYN': pred_test[:, 1]})

[0.94304179 0.93141322 0.65469664 ... 0.42263296 0.59736818 0.81450549]


In [45]:
# Full course: from the classification result through to the saved file.
import os.path

submission_path = '003000678.csv'

# NOTE(review): 'cust_id' is filled with the row index of X_test, not with the
# dataset's 'ID' column (held in X_id) - confirm which one is intended.
result = pd.DataFrame({'cust_id': X_test.index, 'reacheYN': pred_test[:, 1]})
print(result)

# Write without the frame's own index, then confirm the file exists.
result.to_csv(submission_path, index=False)
print(os.path.exists(submission_path))

      cust_id  reacheYN
0        2350  0.943042
1          55  0.931413
2        7396  0.654697
3        7408  0.210633
4        4577  0.519672
...       ...       ...
2745     1109  0.928326
2746     9326  0.392122
2747     6202  0.422633
2748     3516  0.597368
2749     7077  0.814505

[2750 rows x 2 columns]
True


In [109]:
# Naive Bayes - author's note: seems to perform well on classification problems?!
from sklearn.naive_bayes import GaussianNB

nb_clf = GaussianNB()
score = cross_val_score(nb_clf, X_train, y_train, cv=kfold)

print("CV Score : ", score)
print("CV Score mean : ", score.mean())

CV Score :  [0.66242424 0.65272727 0.64181818 0.66242424 0.65433596]
CV Score mean :  0.6547459801165078


In [None]:
 # XGBoost

# 회귀분석 모델 - 집값 예측

1. 각 회귀분석 모델별 설명력(R-Square)을 비교해본다.
2. 앙상블을 적용해본다.

In [56]:
# Load the data
import warnings
warnings.filterwarnings("ignore")  # silence every warning from this point on
import pandas as pd
import numpy as np

df = pd.read_csv('./extrafiles/house_price.csv')
print(df.columns)
df  # bare last expression: rendered as the notebook cell's output

Index(['housing_age', 'income', 'bedrooms', 'households', 'rooms',
       'house_value'],
      dtype='object')


Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.7770,0.141112,2.442244,8.103960,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000
...,...,...,...,...,...,...
17684,34,2.3013,0.214583,2.748299,4.897959,26600
17685,33,2.6750,0.246622,3.428571,4.698413,22500
17686,39,2.3667,0.340771,1.876812,3.572464,17500
17687,19,2.1000,0.386107,2.987805,3.774390,14999


In [61]:
# Widen the pandas display so the describe() table is not wrapped.
for option_name, option_value in (("display.max_columns", 20), ("display.width", 2000)):
    pd.set_option(option_name, option_value)

# Missing values per column (author's note: none).
missing_per_column = df.isna().sum()
print(missing_per_column)

# Summary statistics, used to eyeball outliers (author's note: none).
summary = df.describe()
print(summary)

# Column names, shown as the cell output, before separating features and target.
df.columns

housing_age    0
income         0
bedrooms       0
households     0
rooms          0
house_value    0
dtype: int64
        housing_age        income      bedrooms    households         rooms    house_value
count  17689.000000  17689.000000  17689.000000  17689.000000  17689.000000   17689.000000
mean      27.378823      3.671141      0.213278      2.952117      5.244001  189043.439313
std       11.280230      1.525937      0.051167      0.731573      1.184922   95487.122628
min        1.000000      0.499900      0.100000      0.750000      1.640000   14999.000000
25%       18.000000      2.532900      0.177464      2.470270      4.426829  114400.000000
50%       28.000000      3.453900      0.204104      2.854962      5.190779  171100.000000
75%       36.000000      4.591800      0.240157      3.316092      5.953728  242700.000000
max       51.000000      9.905500      0.498127      6.954023     11.901869  500000.000000


Index(['housing_age', 'income', 'bedrooms', 'households', 'rooms', 'house_value'], dtype='object')

In [65]:
# Feature/target separation - there are no categorical columns here.
feature_columns = ['housing_age', 'income', 'bedrooms', 'households', 'rooms']
X = df[feature_columns]
y = df[['house_value']]  # kept as a one-column DataFrame, not a Series

# Author's note: scaling is not required for regression analysis.
# NOTE(review): that holds for ordinary least squares, but the SVR fitted
# later in this file is sensitive to feature scale - confirm.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

In [99]:
# Models: a single linear regression / stacking ensemble / XGB / voting.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor

# Base learners shared by the stacking and the voting ensembles.
estimators = [('sv', SVR()), ('rf', RandomForestRegressor())]

# Stacking: base learners combined by a linear-regression meta-learner.
model_stack = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
model_stack.fit(X_train, y_train)

# Voting: ensemble of the same base learners.
model_voting = VotingRegressor(estimators=estimators)
model_voting.fit(X_train, y_train)

# Plain linear regression.
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

In [100]:
# Cross-validation scores for each regressor, all on the same 4 shuffled folds.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=4, shuffle=True, random_state=4444)

# Evaluated in this order: linear regression, random forest, voting, stacking.
candidates = [
    LinearRegression(),
    RandomForestRegressor(),
    VotingRegressor(estimators=estimators),
    StackingRegressor(estimators=estimators, final_estimator=LinearRegression()),
]

for position, candidate in enumerate(candidates):
    if position:
        # Blank line between consecutive reports (none after the last one).
        print()
    score = cross_val_score(candidate, X_train, y_train, cv=kfold)
    print("CV Score : ", score)
    print("CV Score mean : ", score.mean())

CV Score :  [0.55315719 0.5617157  0.57474603 0.59217179]
CV Score mean :  0.570447678417513

CV Score :  [0.5922045  0.60808151 0.61879652 0.63279122]
CV Score mean :  0.6129684341882069

CV Score :  [0.45725684 0.46254498 0.45586897 0.47306164]
CV Score mean :  0.46218310921917083

CV Score :  [0.59285439 0.60757685 0.61736092 0.63281697]
CV Score mean :  0.612652282789369


In [79]:
# Predict house values for the test split with the fitted stacking regressor.
pred_test = model_stack.predict(X_test)

In [98]:
# Error metrics for the test-split predictions.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(y_test, pred_test)   # mean squared error
rmse = np.sqrt(mse)                           # root of the MSE
mae = mean_absolute_error(y_test, pred_test)  # mean absolute error

print("MSE : ", mse)
print("RMSE : ", rmse)
print("MAE : ", mae)

# MAPE
def MAPE(y_val, pred_val):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to flat float arrays and compared element by
    element in positional order. Lists, Series and (n, 1) column vectors are
    therefore all accepted. An (n, 1) target is no longer broadcast against
    an (n,) prediction into an (n, n) matrix, which used to give a wrong
    result without any error.

    Args:
        y_val: Actual values (array-like). A zero entry gives an infinite or
            undefined ratio, because each error is divided by the actual value.
        pred_val: Predicted values (array-like) with the same number of
            elements as ``y_val``, or a single value applied to every row.

    Returns:
        The mean of ``|actual - predicted| / |actual| * 100`` as a float.
    """
    actual = np.asarray(y_val, dtype=float).ravel()
    predicted = np.asarray(pred_val, dtype=float).ravel()
    return np.mean(np.abs((actual - predicted) / actual) * 100)
# Pass the target column as a Series so it lines up with the predictions.
mape = MAPE(y_test['house_value'], pred_test)
print("MAPE : ", mape)


MSE :  3266649433.2375526
RMSE :  57154.60990364253
MAE :  41982.40013925488
MAPE :  26.84956805916255


In [None]:
# XGBoost # BayesianRidge

In [101]:
# XGBoost regressor: fit on the training split, then cross-validate a fresh one.
from xgboost import XGBRegressor

model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)

xgb_for_cv = XGBRegressor()
score = cross_val_score(xgb_for_cv, X_train, y_train, cv=kfold)
print("CV Score : ", score)
print("CV Score mean : ", score.mean())

CV Score :  [0.58818426 0.59263163 0.61191002 0.61833003]
CV Score mean :  0.6027639876075861


In [102]:
# Bayesian ridge regression, cross-validated on the same folds.
from sklearn.linear_model import BayesianRidge

ridge_model = BayesianRidge()
score = cross_val_score(ridge_model, X_train, y_train, cv=kfold)
for label, value in (("CV Score : ", score), ("CV Score mean : ", score.mean())):
    print(label, value)

CV Score :  [0.55319941 0.56169082 0.57473422 0.59215583]
CV Score mean :  0.5704450683071715


In [None]:
# Stacking 에 사용할 알고리즘

- LogisticRegression
- SVC
- RandomForestClassifier
- GaussianNB

- LinearRegression
- SVR
- RandomForestRegressor
- BayesianRidge