In [12]:
import numpy as np
import pandas as pd

## IMPORT DATA

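The input must be a CSV file whose columns appear in the order shown below. The loading cell also expects a leading unnamed index column (read by pandas as `Unnamed: 0`), which it drops: 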
   
Example:  
  
company_name,adv,dadv,average_salary(만원),total_sale(억원),turn_over_rate  
삼성전자, 삼성 전자의 모든 장점 리뷰 데이터, 삼성 전자의 모든 단점 리뷰 데이터, 삼성전자의 평균 연봉 (만원), 삼성전자의 연 매출액 (억원), 삼성전자의 이직율

In [13]:
# Read the sample input and discard the index column that was saved into
# the CSV (pandas surfaces it as "Unnamed: 0").
df = pd.read_csv("sample_input.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,company_name,adv,dadv,average_salary,total_sale,turn_over_rate
0,현대카드,일단 오피스 환경이 쾌적한것은 장점대기업만의 성과급이나 복지의 혜택등이 좋다생각보다...,꼰대문화 부서에따라 존재. 굽신굽신 매일 퇴근은 정시에 포기 할일 다하면 다른업무...,8737,2300.0,0.35


## BOW

## Topic Modeling

Compute the topic-modeling result (per-topic probabilities) for each company, using the saved topic models.

In [14]:
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt
from gensim.models import KeyedVectors

import pickle

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only point this
    # at files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Saved models for the advantage ("adv") and disadvantage ("dadv") reviews.
# NOTE(review): these objects are later queried with get_document_topics, so
# they are presumably gensim topic models even though they are loaded through
# KeyedVectors.load - confirm.
model_adv = KeyedVectors.load("tp_model/tp_adv/tp_adv_model")
model_dadv = KeyedVectors.load("tp_model/tp_dadv/tp_dadv_model")

# Corpora that are fed to the two models above.
corpus_adv = _read_pickle('input/corpus_adv.pkl')
corpus_dadv = _read_pickle('input/corpus_dadv.pkl')

In [15]:
num_topics_adv = 6
num_topics_dadv = 7


def _build_topic_frame(model, corpus, company_names, num_topics):
    """Return a table with one row per company and one column per topic.

    Parameters
    ----------
    model
        Object exposing ``get_document_topics(corpus)``; for every document
        it must yield an iterable of ``(topic_id, probability)`` pairs.
    corpus
        Documents handed to ``model.get_document_topics``. The i-th document
        is written to row ``i`` of the result.
        NOTE(review): this assumes the corpus is ordered like
        ``company_names`` - confirm against how the corpus was built.
    company_names
        Company names, one per row of the result.
    num_topics : int
        Number of topic columns to pre-create, named ``'0'`` ..
        ``str(num_topics - 1)``.

    Returns
    -------
    pandas.DataFrame
        ``company_name`` followed by the topic columns. A topic the model
        does not report for a document keeps the initial value ``0.0``.
    """
    topic_frame = pd.DataFrame({'company_name': company_names}).reset_index(drop=True)
    for topic_id in range(num_topics):
        topic_frame[str(topic_id)] = 0.0

    for row, doc_topics in enumerate(model.get_document_topics(corpus)):
        for topic_id, probability in doc_topics:
            # Only the topics returned for this document are overwritten.
            topic_frame.at[row, str(topic_id)] = probability
    return topic_frame


# Same procedure for both review types; only the model, corpus and topic
# count differ.
output_df_adv = _build_topic_frame(model_adv, corpus_adv, df.company_name, num_topics_adv)
output_df_dadv = _build_topic_frame(model_dadv, corpus_dadv, df.company_name, num_topics_dadv)

In [16]:
# Assemble output_df: prefixed topic columns joined with the remaining
# company fields.

# The raw review text columns are not carried into the final table.
df = df.drop(columns=['adv', 'dadv'])

# adv: drop rows with missing values, then prefix every column except the
# first one (company_name) with "adv_topic_".
output_df_adv = output_df_adv.dropna()
col_adv = list(output_df_adv.columns)
col_adv = col_adv[:1] + ["adv_topic_" + name for name in col_adv[1:]]
output_df_adv.columns = col_adv

# dadv: same treatment with the "dadv_topic_" prefix.
output_df_dadv = output_df_dadv.dropna()
col_dadv = list(output_df_dadv.columns)
col_dadv = col_dadv[:1] + ["dadv_topic_" + name for name in col_dadv[1:]]
output_df_dadv.columns = col_dadv

# No `on=` is given, so pandas joins on the columns the two frames share.
output_df = output_df_adv.merge(output_df_dadv, how='inner')
output_df = output_df.merge(df, how='inner')
output_df

Unnamed: 0,company_name,adv_topic_0,adv_topic_1,adv_topic_2,adv_topic_3,adv_topic_4,adv_topic_5,dadv_topic_0,dadv_topic_1,dadv_topic_2,dadv_topic_3,dadv_topic_4,dadv_topic_5,dadv_topic_6,average_salary,total_sale,turn_over_rate
0,현대카드,0.978254,0.0,0.01019,0.0,0.0,0.0,0.0,0.073691,0.0,0.013542,0.0,0.91249,0.0,8737,2300.0,0.35


In [17]:
# Log-transform (log1p) the feature columns of output_df.
#
# Columns are chosen by name rather than by position, so the identifier
# column and the target are never transformed even if the column order of
# the input changes.
# NOTE(review): 'dadv_topic_4' is deliberately left untransformed,
# presumably to match how the saved models were trained - confirm.
untransformed_columns = {'company_name', 'turn_over_rate', 'dadv_topic_4'}

for feature in list(output_df.columns):
    if feature in untransformed_columns:
        continue
    output_df[feature] = np.log1p(output_df[feature])

# Attach the units to the two monetary columns.
output_df = output_df.rename(columns={"average_salary":"average_salary(만원)", "total_sale":"total_sale(억원)"})
output_df

Unnamed: 0,company_name,adv_topic_0,adv_topic_1,adv_topic_2,adv_topic_3,adv_topic_4,adv_topic_5,dadv_topic_0,dadv_topic_1,dadv_topic_2,dadv_topic_3,dadv_topic_4,dadv_topic_5,dadv_topic_6,average_salary(만원),total_sale(억원),turn_over_rate
0,현대카드,0.682214,0.0,0.010139,0.0,0.0,0.0,0.0,0.071102,0.0,0.013451,0.0,0.648406,0.0,9.075437,7.741099,0.35


Feed the features into the saved machine-learning models.

In [18]:
import joblib
# Saved regression model (presumably LightGBM, judging by the file name).
lgbm_model = joblib.load('ml_model/tp_lgbm_reg_model.pkl')

# Separate the feature matrix from the target; company_name is an
# identifier, not a feature.
x_data = output_df.drop(columns=['company_name', 'turn_over_rate'])
y_target = output_df['turn_over_rate']

lgbm_prediction = lgbm_model.predict(x_data)
print("Predicted turnover rate of the company using lgbm model is :", lgbm_prediction)
print("Real turnover of the company is : ", y_target[0])

Predicted turnover rate of the company using lgbm model is : [0.23826028]
Real turnover of the company is :  0.35


In [19]:
# Saved regression model (presumably XGBoost, judging by the file name).
xgb_model = joblib.load('ml_model/tp_xgb_reg_model.pkl')

# Separate the target from the feature matrix; company_name is an
# identifier, not a feature.
y_target = output_df['turn_over_rate']
x_data = output_df.drop(columns=['company_name', 'turn_over_rate'])

# The label must name the model that produced the number: this prediction
# comes from xgb_model.
print("Predicted turnover rate of the company using xgb model is :", xgb_model.predict(x_data))
print("Real turnover of the company is : ", y_target[0])

Predicted turnover rate of the company using lgbm model is : [0.2879109]
Real turnover of the company is :  0.35
