In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


In [3]:
def get_rmse(model,X_test,y_test):
    pred = model.predict(X_test.astype(float))
    mse = mean_squared_error(y_test.astype(float), pred)
    rmse = np.sqrt(mse)
    # print(model.__class__.__name__, ' RMSE: ', np.round(rmse,3))
    return rmse

In [4]:
def get_mae(model,X_test,y_test):
    pred = model.predict(X_test.astype(float))
    score = mean_absolute_error(y_test.astype(float),pred)
    # print(model.__class__.__name__, " MAE: ", np.round(score,3))
    return score

In [5]:
def get_rmses(models,X_test,y_test):
    rmses = []
    for model in models:
        rmse = get_rmse(model,X_test,y_test)
        rmses.append(rmse)
    return rmses

In [6]:
def get_maes(models,X_test,y_test):
    scores = []
    for model in models:
        score = get_mae(model,X_test,y_test)
        scores.append(score)
    return scores

In [7]:
def testing_model(f_name,adv,dadv):
    tmp = {}
    df = pd.read_csv(f_name)
    df = df.drop(["Unnamed: 0"], axis=1)

    # Drop Errors and NAN values
    df = df[df['turn_over_rate']<1]
    error_firm = ['동원홈푸드','휠라홀딩스','트리','와디즈','키위컴퍼니','줌인터넷','시선인터내셔널','브이티코스메틱','유니슨이테크','씨엠비대전방송','서울비젼','더메인즈','조은시스템']

    for i in error_firm:
        df = df[df.company_name != i]

    # Rename the columns
    df = df.rename(columns={"average_salary":"average_salary(만원)", "total_sale":"total_sale(억원)"})

    y_target = df['turn_over_rate']
    x_data = df.drop(['company_name','turn_over_rate'], axis = 1, inplace = False)
    X_train, X_test, y_train, y_test = train_test_split(x_data, y_target, test_size = 0.1, random_state = 7)


    lr_reg = LinearRegression()
    lr_reg.fit(X_train, y_train)
    tmp['lr_mae'] = get_mae(lr_reg,X_test,y_test)
    tmp['lr_rmse'] = get_rmse(lr_reg,X_test,y_test)

    ridge_reg = Ridge(alpha=0.05)
    ridge_reg.fit(X_train, y_train)
    tmp['ridge_reg_mae'] = get_mae(ridge_reg,X_test,y_test)
    tmp['ridge_reg_rmse'] = get_rmse(ridge_reg,X_test,y_test)

    lasso_reg = Lasso(alpha=0.001)
    lasso_reg.fit(X_train,y_train)
    tmp['lasso_reg_mae'] = get_mae(lasso_reg,X_test,y_test)
    tmp['lasso_reg_rmse'] = get_rmse(lasso_reg,X_test,y_test)

    # xgb
    xgb_reg = XGBRegressor(n_estimators = 500, learning_rate = 0.05, max_depth = 4, colsample_bytree = 0.5, subsample = 0.8)
    xgb_reg.fit(X_train.astype(float),y_train.astype(float))

    tmp['xgb_reg_mae'] =get_mae(xgb_reg,X_test,y_test)
    tmp['xgb_reg_rmse'] =get_rmse(xgb_reg,X_test,y_test)

    # lgbm
    lgbm_reg = LGBMRegressor(n_estimators=500, learning_rate= 0.05, num_leaves=4,
                            subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)
    lgbm_reg.fit(X_train.astype(float),y_train.astype(float))

    tmp['lgbm_reg_mae'] =get_mae(lgbm_reg,X_test,y_test)
    tmp['lgbm_reg_rmse'] =get_rmse(lgbm_reg,X_test,y_test)

    # gradient
    gb_reg = GradientBoostingRegressor(n_estimators=750, learning_rate= 0.05, subsample=0.6, max_depth=10 )
    gb_reg.fit(X_train.astype(float),y_train.astype(float))

    tmp['gb_reg_mae'] =get_mae(gb_reg,X_test,y_test)
    tmp['gb_reg_rmse'] =get_rmse(gb_reg,X_test,y_test)

    # RF
    rf_reg = RandomForestRegressor(n_estimators=500)
    rf_reg.fit(X_train,y_train)

    tmp['rf_reg_mae'] =get_mae(rf_reg,X_test,y_test)
    tmp['rf_reg_rmse'] =get_rmse(rf_reg,X_test,y_test)

    # Decision Tree
    dt_reg = DecisionTreeRegressor(max_depth=4)
    dt_reg.fit(X_train,y_train)

    tmp['dt_reg_mae'] =get_mae(dt_reg,X_test,y_test)
    tmp['dt_reg_rmse'] =get_rmse(dt_reg,X_test,y_test)
    return tmp

In [8]:
adv = [6,7,8,9,10]
dadv = [6,7,8,9,10]

result = {}
for i in adv:
    for j in dadv:
        f = str(i)+'_'+str(j)
        f_name = 'bert_all_cases/bert_tokenizing_'+str(i)+'_'+str(j)+'.csv'
        print(f_name)
        val = testing_model(f_name,i,j)
        print(val)
        result[f] = val

bert_all_cases/bert_tokenizing_6_6.csv
{'lr_mae': 0.1381437764608774, 'lr_rmse': 0.17247352720349735, 'ridge_reg_mae': 0.13814515940862188, 'ridge_reg_rmse': 0.1724739805883331, 'lasso_reg_mae': 0.13886636812385963, 'lasso_reg_rmse': 0.17282297611947883, 'xgb_reg_mae': 0.12752613476215505, 'xgb_reg_rmse': 0.16760281895482584, 'lgbm_reg_mae': 0.1267956193419334, 'lgbm_reg_rmse': 0.1676765044400499, 'gb_reg_mae': 0.13105883728881512, 'gb_reg_rmse': 0.17345886649072612, 'rf_reg_mae': 0.12582320346320347, 'rf_reg_rmse': 0.16572774750367153, 'dt_reg_mae': 0.122983981156712, 'dt_reg_rmse': 0.1641383242316827}
bert_all_cases/bert_tokenizing_6_7.csv
{'lr_mae': 0.137463246993948, 'lr_rmse': 0.1723784202330629, 'ridge_reg_mae': 0.13746131822371005, 'ridge_reg_rmse': 0.17237480936898666, 'lasso_reg_mae': 0.13760911742569637, 'lasso_reg_rmse': 0.1720871407405004, 'xgb_reg_mae': 0.12644134651530872, 'xgb_reg_rmse': 0.17203548821046338, 'lgbm_reg_mae': 0.12251592587241451, 'lgbm_reg_rmse': 0.1648844

In [31]:
# bert_result = pd.DataFrame({'lr_mae': [], 'lr_rmse': [], 'ridge_reg_mae': [], 
#                'ridge_reg_rmse': [], 'lasso_reg_mae': [], 'lasso_reg_rmse': [], 
#                'xgb_reg_mae': [], 'xgb_reg_rmse': [], 'lgbm_reg_mae': [], 
#                'lgbm_reg_rmse': [], 'gb_reg_mae': [], 'gb_reg_rmse': [], 
#                'rf_reg_mae': [], 'rf_reg_rmse': [], 'dt_reg_mae': [], 
#                'dt_reg_rmse': [], 'adv/dadv': '6_6'})

for i in result:
    result[i]['adv/dadv'] = i
bert_result = pd.DataFrame(list(result.values()))
last_column = bert_result.iloc[:, -1]  # Select the last column
bert_result = pd.concat([last_column, bert_result.iloc[:, :-1]], axis=1)  # Concatenate
bert_result

Unnamed: 0,adv/dadv,lr_mae,lr_rmse,ridge_reg_mae,ridge_reg_rmse,lasso_reg_mae,lasso_reg_rmse,xgb_reg_mae,xgb_reg_rmse,lgbm_reg_mae,lgbm_reg_rmse,gb_reg_mae,gb_reg_rmse,rf_reg_mae,rf_reg_rmse,dt_reg_mae,dt_reg_rmse
0,6_6,0.138144,0.172474,0.138145,0.172474,0.138866,0.172823,0.127526,0.167603,0.126796,0.167677,0.131059,0.173459,0.125823,0.165728,0.122984,0.164138
1,6_7,0.137463,0.172378,0.137461,0.172375,0.137609,0.172087,0.126441,0.172035,0.122516,0.164884,0.130018,0.179243,0.125032,0.170149,0.124037,0.164591
2,6_8,0.137543,0.17221,0.137543,0.17221,0.137968,0.172564,0.124173,0.168513,0.121793,0.16411,0.131097,0.177146,0.123096,0.166807,0.123539,0.165401
3,6_9,0.137101,0.171996,0.137109,0.171994,0.137968,0.172813,0.123791,0.16493,0.119764,0.161345,0.128644,0.173704,0.121659,0.16426,0.121945,0.164602
4,6_10,0.138576,0.172711,0.138572,0.172705,0.137881,0.172563,0.126753,0.16866,0.120169,0.160574,0.132449,0.1766,0.122001,0.163435,0.119509,0.158946
5,7_6,0.138147,0.172622,0.138149,0.172624,0.138795,0.173253,0.132562,0.173022,0.122476,0.163632,0.134548,0.177651,0.126372,0.167781,0.124419,0.167266
6,7_7,0.139203,0.174455,0.139199,0.174447,0.138201,0.172749,0.127292,0.170805,0.121596,0.164711,0.136286,0.182488,0.127301,0.169532,0.124806,0.169261
7,7_8,0.136904,0.17208,0.13691,0.172073,0.13786,0.172335,0.124171,0.165474,0.122077,0.162745,0.127585,0.174007,0.12365,0.163902,0.123919,0.165383
8,7_9,0.137858,0.172606,0.137859,0.172596,0.138299,0.172667,0.120931,0.161998,0.120205,0.159857,0.129771,0.170827,0.122659,0.163388,0.125391,0.168039
9,7_10,0.138932,0.17283,0.13893,0.172828,0.138245,0.173227,0.124624,0.165761,0.120743,0.162395,0.122842,0.164659,0.120819,0.162045,0.121964,0.163262


In [32]:
bert_result.to_csv('bert_result.csv')