In [2]:
# module 임포트
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import copy
import random

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#Our Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [2]:
def select_important_feautre(columns, feature_importance, num):
    
    #딕셔너리
    dic = {}
    for value in [list(v) for v in zip(columns, feature_importance)]:
        dic[value[1]] = value[0] 
        
    sorted_list = sorted(dic)
    sorted_list.reverse()
    
    i = 0
    result = []
    for y in sorted_list:
        result.append(dic[y])
        i = i+1
        if(i == num):
            break
    
    return result

In [3]:
def cross_val_score_custom(model, x, y, cv, dev = 0):
    
    print('\n\n')
    
    result = []
    
    for train_idx, test_idx in cv.split(x):
        
        new_x_train = x.iloc[train_idx, :]
        new_y_train = y.iloc[train_idx]
        
        new_x_test = x.iloc[test_idx, :]
        new_y_test = y.iloc[test_idx]
        
        model.fit(new_x_train, new_y_train)
        
        temp = np.mean(f1_score(new_y_test, model.predict(new_x_test), average=None))
        
        if dev:
            print('dev Mode # : {}'.format(temp))
        
        result.append(temp)
    
    print('\n\n')
    
    return np.mean(result)

## 데이터셋

In [4]:
x_train_load_fin_v2 = pd.read_csv('C://Users//zeus_//Desktop//champion_data//train/train_activity_final_v2.csv').sort_values(by='acc_id', ascending=True)
y_train_load = pd.read_csv('C://Users//zeus_//Desktop//champion_data//train/train_label.csv').sort_values(by='acc_id', ascending=True)

In [5]:
x_train_fin_v2 = x_train_load_fin_v2
x_train_fin_v2 = x_train_load_fin_v2.drop(['acc_id'], axis=1)

In [6]:
y_data = y_train_load.label

## 여러 데이터셋 만들기
- 675 => 0.7377
- 656 => 0.7377
- 588 => 0.7377
- 583 => 0.7377
- 560 => 0.7379
- 519 => 0.7379
- 518 => 0.7377
- 467 => 0.7378

##### 공통 피처 만들기

In [7]:
forest_feature = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1, max_depth=30)
forest_feature.fit(x_train_fin_v2, y_data)
feature_list = select_important_feautre(x_train_fin_v2.columns, forest_feature.feature_importances_, 677)

##### 675개 피처 선정

In [14]:
temp = feature_list[0: 675]
x_train_675 = x_train_fin_v2.loc[:, temp]

##### 656개 피처 선정

In [16]:
temp = feature_list[0: 656]
x_train_656 = x_train_fin_v2.loc[:, temp]

##### 588개 피처 선정

In [17]:
temp = feature_list[0: 588]
x_train_588 = x_train_fin_v2.loc[:, temp]

##### 583개 피처 선정

In [18]:
temp = feature_list[0: 583]
x_train_583 = x_train_fin_v2.loc[:, temp]

##### 560개 피처 선정

In [19]:
temp = feature_list[0: 560]
x_train_560 = x_train_fin_v2.loc[:, temp]

##### 519개 피처 선정

In [20]:
temp = feature_list[0: 519]
x_train_519 = x_train_fin_v2.loc[:, temp]

##### 518개 피처 선정

In [21]:
temp = feature_list[0: 518]
x_train_518 = x_train_fin_v2.loc[:, temp]

##### 467개 피처 선정

In [22]:
temp = feature_list[0: 467]
x_train_467 = x_train_fin_v2.loc[:, temp]

## Model Stacking

In [1]:
def get_oof(model, x_train, y_train):
    
    le = LabelEncoder()
    le.fit(y_train)
    y_train = pd.Series(le.transform(y_train))
    
    oof_train = np.zeros((x_train_fin_v2.shape[0],))
    
    for i, (train_idx, test_idx) in enumerate(KFold(n_splits=5).split(x_train)):
        
        x_tr = x_train.iloc[train_idx, :]
        y_tr = y_train.iloc[train_idx]
        x_te = x_train.iloc[test_idx, :]
        
        model.fit(x_tr, y_tr)

        oof_train[test_idx] = model.predict(x_te)
    
    oof_train = [int(v) for v in oof_train]
    oof_train = le.inverse_transform(oof_train)
    
    return oof_train.reshape(-1,1)

##### 모델들

In [199]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=30)
et = ExtraTreesClassifier(n_estimators=100, max_depth=20)
gb = GradientBoostingClassifier(max_depth=2, learning_rate=0.01, max_features=30, n_estimators=100)

- 675 => 0.7377
- 656 => 0.7377
- 588 => 0.7377
- 583 => 0.7377
- 560 => 0.7379
- 519 => 0.7379
- 518 => 0.7377
- 467 => 0.7378

In [68]:
rf_675_oof_train = get_oof(rf, x_train_675, y_data)
rf_656_oof_train = get_oof(rf, x_train_656, y_data)
rf_588_oof_train = get_oof(rf, x_train_588, y_data)
rf_583_oof_train = get_oof(rf, x_train_583, y_data)
rf_560_oof_train = get_oof(rf, x_train_560, y_data)
rf_519_oof_train = get_oof(rf, x_train_519, y_data)
rf_518_oof_train = get_oof(rf, x_train_518, y_data)
rf_467_oof_train = get_oof(rf, x_train_467, y_data)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [200]:
rf_oof_train = get_oof(rf, x_train_560, y_data)
et_oof_train = get_oof(et, x_train_560, y_data)
gb_oof_train = get_oof(gb, x_train_560, y_data)

  if diff:
  if diff:
  if diff:


In [201]:
x_train = np.concatenate((rf_675_oof_train, rf_656_oof_train, rf_588_oof_train, rf_583_oof_train, rf_560_oof_train, rf_519_oof_train, rf_518_oof_train, rf_467_oof_train, rf_oof_train, et_oof_train, gb_oof_train), axis=1)
x_train = pd.DataFrame(x_train)

In [202]:
x_train.columns = ['rf1', 'rf2', 'rf3', 'rf4', 'rf5', 'rf6', 'rf7', 'rf8', 'rf', 'et', 'gb', ]

##### 숫자로 라벨링

In [208]:
le = LabelEncoder()
le.fit(x_train.rf1)

LabelEncoder()

In [209]:
for v in x_train.columns:
    x_train[v] = le.transform(x_train[v])

In [228]:
temp = get_oof(rf_t, x_train, y_data)

  if diff:


In [236]:
le.classes_

array(['2month', 'month', 'retained', 'week'], dtype=object)

In [235]:
temp = pd.DataFrame(temp)
temp['l'] = y_data
pd.merge(temp, x_train, on=temp.index)

Unnamed: 0,key_0,0,l,rf1,rf2,rf3,rf4,rf5,rf6,rf7,rf8,rf,et,gb
0,0,week,week,3,3,3,3,3,3,3,3,3,3,3
1,1,retained,retained,2,2,2,2,2,2,2,2,2,2,2
2,2,retained,retained,2,2,2,2,0,2,2,2,0,0,0
3,3,month,month,1,1,1,1,1,1,1,1,1,1,1
4,4,month,2month,1,3,1,0,3,3,1,1,3,0,0
5,5,month,month,1,1,1,1,1,1,1,1,1,1,1
6,6,month,month,1,1,1,1,1,1,1,1,1,0,0
7,7,2month,2month,0,0,0,0,0,0,0,0,0,0,0
8,8,month,retained,2,2,1,1,1,1,2,1,1,0,0
9,9,month,month,1,1,1,1,1,1,1,1,1,1,2


In [265]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [270]:
ohe.fit([[0],[1],[2],[3]])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [292]:
tt = pd.DataFrame(ohe.transform(x_train.rf1.values.reshape(-1,1)).toarray())
tt.columns = ['rf1_1', 'rf1_2', 'rf1_3', 'rf1_4']

In [293]:
for v in x_train.columns[1:]:
    ttt = pd.DataFrame(ohe.transform(x_train[v].values.reshape(-1,1)).toarray())
    ttt.columns = [v +'_'+ str(1), v +'_'+ str(2), v +'_'+ str(3), v +'_'+ str(4)]
    tt = pd.concat([tt, ttt], axis= 1)

#### Model 2

In [224]:
rf_t = RandomForestClassifier(n_estimators=170, random_state=42, max_depth=5)

In [225]:
cross_val_score_custom(rf_t, x_train, y_data, KFold(n_splits=5), dev = 1)




dev Mode # : 0.7412406906041716
dev Mode # : 0.7332112004176161
dev Mode # : 0.7427773136804482
dev Mode # : 0.741424253937681
dev Mode # : 0.7408310985004443





0.7398969114280722

In [302]:
ttttttt = pd.concat([x_train, tt], axis = 1)

In [318]:
tttttt = RandomForestClassifier(n_estimators=210, random_state=42, max_depth=5)

In [319]:
cross_val_score_custom(tttttt, ttttttt, y_data, KFold(n_splits=5), dev = 1)




dev Mode # : 0.7418566933610754
dev Mode # : 0.7333295111087752
dev Mode # : 0.7427221156294195
dev Mode # : 0.7412995892898797
dev Mode # : 0.7406981600757623





0.7399812138929824

In [154]:
result = x_train.apply(lambda v: v.value_counts().idxmax(), axis = 1)

### 소프트맥스 회귀

In [177]:
from sklearn.linear_model import LogisticRegression

In [180]:
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

In [182]:
mm = pd.DataFrame(minmax_scale(x_train_560), columns=x_train_560.columns)

In [178]:
soft_lr = LogisticRegression(multi_class="multinomial", solver="lbfgs", C = 10)

In [183]:
cross_val_score_custom(soft_lr, mm, y_data, KFold(n_splits=5), dev = 1)




dev Mode # : 0.6517233390731428
dev Mode # : 0.6390300904490771
dev Mode # : 0.6495475319261467
dev Mode # : 0.6491192909647248
dev Mode # : 0.6492424191242105





0.6477325343074605