In [20]:
import numpy as np
import pandas as pd

# **DATA IMPORT**

In [21]:
# Preprocessing for topic modeling result
def tp_preprocessing(tp, type_check):
    tp_df = tp
    cn = tp_df['cmp']
    
    for i, val in enumerate(cn):
        if '(주)' in val:
            tmp = val.replace('(주)','')
            tp_df.loc[i,'cmp'] = tmp
    
    col_list = list(tp_df.columns)
    col_list[0] = 'company_name'
    for i in range(1, len(tp_df.columns)):
        if type_check == 'adv':
            col_list[i] = 'adv_topic ' + str(col_list[i])
        if type_check == 'dadv':
            col_list[i] = 'dadv_topic ' + str(col_list[i])
    tp_df.columns = col_list

    return tp_df

In [22]:
# Preprocessing for financial variable
def fv_preprocessing(fv):
  fv_df = fv
  afv = fv_df['average_salary']
  tfv = fv_df['total_sale']

  for i, val in enumerate(afv):
    if '만원' in val:
      tmp = int(val[:-2].replace(',',''))
      fv_df.loc[i,'average_salary'] = tmp
    if '회사' in val:
      fv_df.loc[i,'average_salary'] = np.NAN
    if '수집' in val:
      fv_df.loc[i,'average_salary'] = np.NAN


  for i, val in enumerate(tfv):
    if '조원' in val:
      tmp = float(val[:-2].replace(',',''))*1000
      fv_df.loc[i,'total_sale'] = tmp
    if '억원' in val:
      tmp = float(val[:-2].replace(',',''))
      fv_df.loc[i,'total_sale'] = tmp
    if '회사' in val:
      fv_df.loc[i,'total_sale'] = np.NAN


  
  return fv_df

In [23]:
# Preprocessing for turn over rate
def tor_preprocessing(tor):
  tor_df = tor
  tor_val = tor_df['turn_over_rate']

  for i, val in enumerate(tor_val):
    if '정보' in val:
      tor_df.loc[i,'turn_over_rate'] = np.NAN
    elif val == '(9999%)' :
      tor_df.loc[i,'turn_over_rate'] = np.NAN
    elif '(' in val:
      tmp = float(val[1:-1].replace('%',''))*0.01
      tor_df.loc[i,'turn_over_rate'] = tmp

  return tor_df

In [24]:
tp_adv = pd.read_csv("input/topic_modeling_result/bank_adv_topic_modeling.csv")
tp_dadv = pd.read_csv("input/topic_modeling_result/bank_dadv_topic_modeling.csv")

fv = pd.read_csv("input/financial_variable/bank_financial_financial_variable.csv")
tor = pd.read_csv("input/turn_over_rate/bank_financial_business_turn_over_rate.csv")

In [25]:
tp_adv = tp_adv.drop(['Unnamed: 0'], axis=1)
tp_adv = tp_preprocessing(tp_adv,'adv')
tp_dadv = tp_dadv.drop(['Unnamed: 0'], axis=1)
tp_dadv = tp_preprocessing(tp_dadv,'dadv')
fv = fv.drop(['Unnamed: 0'], axis=1)
fv = fv_preprocessing(fv)
tor = tor.drop(['Unnamed: 0'], axis=1)
tor = tor_preprocessing(tor)

In [26]:
df = pd.merge(tp_adv, tp_dadv, on = 'company_name', how = 'inner')
df = pd.merge(df, fv, on = 'company_name', how = 'inner')
df = pd.merge(df, tor, on = 'company_name', how = 'inner')
df = df.dropna()
df

Unnamed: 0,company_name,adv_topic 0,adv_topic 1,adv_topic 2,adv_topic 3,adv_topic 4,adv_topic 5,dadv_topic 0,dadv_topic 1,dadv_topic 2,dadv_topic 3,dadv_topic 4,average_salary,total_sale,turn_over_rate
0,국민건강보험공단,0.000000,0.999386,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.999572,6441,75400.0,0.2
1,근로복지공단,0.000000,0.999270,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.999492,5817,9406.2,0.13
3,농협은행,0.000000,0.000000,0.000000,0.260527,0.000000,0.738698,0.181343,0.000000,0.000000,0.000000,0.812849,6182,13100.0,0.15
4,현대카드,0.000000,0.433746,0.000000,0.565337,0.000000,0.000000,0.000000,0.999159,0.000000,0.000000,0.000000,8737,2300.0,0.35
5,중소기업은행,0.000000,0.010276,0.000000,0.000000,0.000000,0.988675,0.000000,0.012572,0.040652,0.000000,0.946242,10065,14800.0,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,바로크레디트대부,0.015249,0.923682,0.015247,0.015275,0.015289,0.015258,0.000000,0.000000,0.970678,0.000000,0.000000,4354,1133.0,0.22
139,글로벌금융판매,0.000000,0.000000,0.000000,0.000000,0.990223,0.000000,0.000000,0.000000,0.000000,0.996440,0.000000,3442,4473.3,0.46
140,현대하이카손해사정,0.973719,0.000000,0.000000,0.000000,0.000000,0.000000,0.317989,0.000000,0.000000,0.000000,0.664502,6082,1294.9,0.08
141,아이비케이캐피탈,0.000000,0.000000,0.000000,0.000000,0.000000,0.992217,0.000000,0.000000,0.000000,0.899822,0.093894,9880,3933.2,0.06


# **MACHINE LEARNING MODEL**

In [27]:
# Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

y_target = df['turn_over_rate']
x_data = df.drop(['company_name','turn_over_rate'], axis = 1, inplace = False)

X_train, X_test, y_train, y_test = train_test_split(x_data, y_target, test_size = 0.3, random_state = 7)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_preds = lr.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print('MSE: {0: .3f}, RMSE : {1: .3F}'.format(mse, rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_test,y_preds)))


MSE:  0.039, RMSE :  0.196
Variance score : 0.089


In [28]:
from sklearn.model_selection import cross_val_score
def get_model_cv_prediction(model, X_data, y_target):
    neg_mse_scores = cross_val_score(model, X_data, y_target, scoring = "neg_mean_squared_error", cv = 5)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print('#### ', model.__class__.__name__,'####')
    print('5 cv average RMSE : {0:.3f}'.format(avg_rmse))

In [33]:
# Various Types of Regressor Tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor

dt_reg = DecisionTreeRegressor(random_state= 7, max_depth=4)
rf_reg = RandomForestRegressor(random_state= 7, n_estimators= 1000)
gb_reg = GradientBoostingRegressor(random_state= 7, n_estimators= 1000)
xgb_reg = XGBRegressor(n_estimators = 1000)
lgb_reg = LGBMRegressor(n_estimators=1000)

models = [dt_reg, rf_reg, gb_reg, lgb_reg]
for model in models:
    get_model_cv_prediction(model,x_data, y_target)

####  DecisionTreeRegressor ####
5 cv average RMSE : 0.214
####  RandomForestRegressor ####
5 cv average RMSE : 0.150
####  GradientBoostingRegressor ####
5 cv average RMSE : 0.167


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/sklearn.py", line 748, in fit
    self._Booster = train(
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/basic.py", line 2605, in __init__
    train_set.construct()
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/basic.py", line 1815, in construct
    self._lazy_init(self.data, label=self.label,
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/basic.py", line 1474, in _lazy_init
    data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
  File "/Users/myeongseop.kim/miniconda3/envs/textmining/lib/python3.8/site-packages/lightgbm/basic.py", line 594, in _data_from_pandas
    raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: average_salary, total_sale
