In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, concatenate, Embedding, Reshape
from tensorflow.keras.layers import Flatten, concatenate, Lambda, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2, l1_l2

In [2]:
# Load the Adult train/test sets from local CSV files, downloading whichever is missing.
def maybe_download(train_data, test_data):
    '''Return the Adult train and test sets as DataFrames.

    Each set is read from its local path when that file exists; otherwise it
    is fetched from the remote copy of the dataset.

    Params:
    ----
    train_data : path to a local training CSV (read with pandas defaults,
                 i.e. the first row is taken as the header)
    test_data  : path to a local testing CSV (same expectations)

    Returns:
    -----
    (df_train, df_test) : the two DataFrames

    Note: downloaded data is only returned, never written to `train_data` /
    `test_data`, so a missing file is downloaded again on every call.
    '''

    # Note from the original author: since a variety of features are used
    # here, the Lpoint data could probably be put into the same format.
    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]

    if os.path.exists(train_data):
        # Read the path that was actually checked, not a hard-coded file name.
        df_train = pd.read_csv(train_data)
    else:
        print('training data를 다운로드 합니다....')
        # The remote file has no header row, so column names are supplied.
        df_train = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data",
            names=COLUMNS, skipinitialspace=True)

    if os.path.exists(test_data):
        df_test = pd.read_csv(test_data)
    else:
        print('testing data를 다운로드 합니다....')
        # skiprows=1 drops the first line of the remote test file before parsing.
        df_test = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test",
            names=COLUMNS, skipinitialspace=True, skiprows=1)

    return df_train, df_test

In [3]:
def cross_columns(x_cols):
    '''Build a dict describing crossed columns for a pandas DataFrame.

    Each entry of `x_cols` is a group of column names; the result maps the
    names joined with '_' to that same group.
    '''
    # Names are computed in a first full pass, then paired with the groups.
    joined_names = list(map('_'.join, x_cols))
    return dict(zip(joined_names, x_cols))

In [15]:
# Example: each inner list of column names becomes one crossed-column entry,
# keyed by those names joined with '_'.
test= [['a', 'b'], 
       ['c', 'd']]
print(cross_columns(test))

{'a_b': ['a', 'b'], 'c_d': ['c', 'd']}


In [5]:
def val2inx(df, cols):
    '''Replace the values of categorical columns with integer indices.

    Intended as a preprocessing step before the columns are fed to
    embedding layers.

    Params:
    ----
    df   : DataFrame holding the columns; it is modified in place
    cols : names of the columns to convert

    Returns:
    -----
    df          : the same DataFrame, with each column in `cols` replaced by
                  indices assigned in order of first appearance
    unique_vals : dict mapping each column name to the number of distinct
                  values in the converted column
    '''
    unique_vals = dict()
    for c in cols:
        # Index 0 goes to the first distinct value seen, 1 to the next, ...
        val_to_idx = {val: idx for idx, val in enumerate(df[c].unique())}
        df[c] = df[c].map(val_to_idx)
        unique_vals[c] = df[c].nunique()

    return df, unique_vals

In [6]:
def onehot(x):
    '''One-hot encode `x` and return the result as a dense NumPy array.'''
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(x)
    # Densify the encoder output before wrapping it in an ndarray.
    dense = encoded.todense()
    return np.array(dense)

![img](../img/model_structure.png)

In [7]:
def embeddin_input(name, n_in, n_out, reg):
    '''Create an integer input and an embedding layer applied to it.

    Params:
    ----
    name  : name given to the Input layer
    n_in  : first argument to Embedding (size of the index vocabulary)
    n_out : second argument to Embedding (embedding dimension)
    reg   : L2 regularization factor applied to the embedding weights

    Returns:
    -----
    (inp, embedded) : the Input tensor and the Embedding output built on it
    '''
    # `shape` must be passed as a keyword; one index per sample.
    inp = Input(shape=(1,), dtype='int64', name=name)
    # The layer is constructed first, then called on the input tensor.
    embedded = Embedding(n_in, n_out, input_length=1,
                         embeddings_regularizer=l2(reg))(inp)
    return inp, embedded

In [8]:
def continous_input(name):
    '''Create a float input for a continuous feature, reshaped to (1, 1).

    Params:
    ----
    name : name given to the Input layer

    Returns:
    -----
    (inp, reshaped) : the Input tensor and the output of Reshape((1, 1))
                      applied to it
    '''
    # `shape` must be passed as a keyword; one scalar per sample.
    inp = Input(shape=(1,), dtype='float32', name=name)
    return inp, Reshape((1, 1))(inp)

In [None]:
def wide(df_train, df_test, wide_cols, x_cols, target, model_type, method):
    '''Run the wide (linear) model
    
    Params:
    ----
    wide_cols    : wide model에 맞게 사용되는 columns
    x_cols       : crossed에 맞게 사용되는 columns
    target       : the target feature
    model_type   : wide 와 wide_deep 모델 둘다 수용. 만약 'wide_depp'이라면 
                   build 하고 inputs를 반환한다. 나머지는 안됨
    method       : regression, logistic, multiclass 중 선택
    
    Returns:
    -----
    if 'wide':
        test set을 얻은 결과를 print 한다. 
    if 'wide_deep':
        X_train, y_train, X_test, y_test: the inputs required to build wide and deep
    '''
    
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])
    
# 여기서 crossed_columns란 무엇을 의미할까?
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        df_wide.select_dtypes(include=['object']).columns)
    
    wide_cols += list(crossed_columns_d.keys())
    
    for k, v in crossed_columns_d.items():
        df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)
    
    df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]
    
    dummy_cols = [
        c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.key())]
    