In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')
sns.set(font_scale=1)

import warnings
warnings.filterwarnings('ignore')

In [2]:
def to_tokenize(text_Series, dict_change_word):
    """Clean raw texts so that only Hangul characters and spaces remain.

    The steps run in this order: remove URLs, remove e-mail addresses, unify
    synonyms through ``change_hiphen_word``, blank out punctuation, and finally
    blank out every character that is not Hangul. Each step prints a header
    and shows a tqdm progress bar.

    Parameters
    ----------
    text_Series : iterable of str
        Raw texts; the caller in this notebook passes a ``pd.Series``.
    dict_change_word : dict
        Mapping forwarded to ``change_hiphen_word``.

    Returns
    -------
    list of str
        Cleaned texts in input order. This is a plain list, not a Series.
    """
    # Raw strings keep the regex escapes intact; in ordinary string literals
    # sequences such as "\:" or "\/" are invalid escapes that newer Python
    # versions warn about.
    url_pattern = re.compile(r'((http(s?))\:\/\/)([0-9a-zA-Z\-]+\.*\/*(\n)*)+|url|URL')
    # Only matches addresses preceded by "<" and one further character.
    email_pattern = re.compile(r'(\<.[a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,4})')
    # "\\" is a literal backslash here. The final non-Hangul pass blanks the
    # same characters anyway, so the returned texts do not depend on it.
    special_pattern = re.compile(r'''[-=+,#/\?:^$.@*"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`'…》]''')
    non_hangul_pattern = re.compile('[^ㄱ-ㅎㅏ-ㅣ가-힣]')

    # 1. Remove URLs.
    print("\nURL 제거 과정.\n")
    text_Series = [url_pattern.sub('', text) for text in tqdm(text_Series)]

    # 2. Remove e-mail addresses.
    print("\ne-mail 제거 과정.\n")
    text_Series = [email_pattern.sub('', text) for text in tqdm(text_Series)]

    # 3. Unify synonyms. This must happen before step 5, because some keys
    #    (e.g. 'CSR', 'ESG') are not Hangul and would otherwise be erased first.
    print("\n고유어 변경 과정.\n")
    text_Series = [change_hiphen_word(text, dict_change_word) for text in tqdm(text_Series)]

    # 4. Replace special characters with a space.
    print("\n특수문자 제거 과정.\n")
    text_Series = [special_pattern.sub(' ', text) for text in tqdm(text_Series)]

    # 5. Replace everything that is not Hangul (digits, Latin letters,
    #    backslashes, existing whitespace, ...) with a space.
    print("\n문자 외 제거 과정.\n")
    text_Series = [non_hangul_pattern.sub(' ', text) for text in tqdm(text_Series)]

    return text_Series


In [3]:
# Replace the words listed in dict_change_word with their mapped values.

def change_hiphen_word(text, dictionary):
    """Replace every key of ``dictionary`` found in ``text`` with its value.

    Parameters
    ----------
    text : str
        Text to rewrite.
    dictionary : dict
        Maps a word to its replacement. Replacements are applied in the
        dictionary's iteration order.

    Returns
    -------
    str
        The rewritten text.
    """
    # Which keys apply is decided against the ORIGINAL text only, so a key that
    # first appears as the result of an earlier replacement is not applied.
    applicable_words = [word for word in dictionary.keys() if word in text]

    # Plain str.replace calls instead of building and eval()-ing source code:
    # quotes or backslashes in a key or value can neither break the call nor
    # be executed as code.
    result = '{}'.format(text)
    for word in applicable_words:
        result = result.replace(word, str(dictionary[word]))

    return result

In [4]:
# Build the bag-of-words corpus with the pickled dictionary.
def make_corpus(text_col):
    """Load ``dictionary.pkl``, prune it, and convert token lists to bag-of-words.

    Parameters
    ----------
    text_col : iterable
        One token list per document.

    Returns
    -------
    tuple
        ``(dictionary, corpus)``: the pruned dictionary object and a list with
        one ``doc2bow`` result per document.
    """
    from gensim import corpora

    # presumably a gensim corpora.Dictionary — verify against the code that wrote the pickle
    vocabulary = pd.read_pickle('dictionary.pkl')
    vocabulary.filter_extremes(no_below = 20)

    bow_documents = []
    for tokens in text_col:
        bow_documents.append(vocabulary.doc2bow(tokens))

    return vocabulary, bow_documents

In [5]:
# Load the merged news data set.

news_entire = pd.read_csv('Mrs.txt')

# Title keywords that mark an article as relevant.
news_list = ['CSR','과징금','공정위','지역사회','사회공헌',
             '스타트업','청년','창업','중소기업','상생','협력사','갑질',
             '봉사','기부','ESG','사망','사고','취업','과태료','노조','개인정보']

# One filtered frame per keyword, stacked in keyword order.
# `== True` turns missing titles (NaN) into False.
news_company = pd.concat(
    [news_entire[news_entire.title.str.contains(keyword) == True] for keyword in news_list]
)

# An article matching several keywords appears several times; keep the first copy.
news_company = news_company.drop_duplicates(['contents','company'])

In [6]:
# Synonym dictionary: maps specific words onto one shared topic word.
# Entry order matters because replacements are applied in this order.

dict_change_word = {
    '경북': '지역사회',
    '광주': '지역사회',
    '대구': '지역사회',
    '외국인': '취약계층',
    '장애인': '취약계층',
    '공정거래위원회': '공정위',
    '사회적 책임': '사회공헌',
    'CSR': '사회공헌',
    '고객정보': '개인정보',
    'ESG': '지속가능',
    '이노베이션': '혁신',
    '동반성장': '상생',
    '갑질': '불공정',
    '甲': '갑',
    '산재': '산업재해',
    '산업 재해': '산업재해',
}

In [7]:
# Clean the titles before tokenizing them.

cleaned_titles = to_tokenize(news_company['title'], dict_change_word)

# Single column named 0, rows numbered 0..n-1.
news_entire_to_tokenize = pd.DataFrame(cleaned_titles).reset_index(drop=True)

news_entire_to_tokenize.head(10)

100%|████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 118900.31it/s]
100%|████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 265712.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 28676.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 86262.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 46195.19it/s]


URL 제거 과정.


e-mail 제거 과정.


고유어 변경 과정.


특수문자 제거 과정.


문자 외 제거 과정.






Unnamed: 0,0
0,공정위 프랜차이즈 가맹점주 핫라인 구축
1,공정위 프랜차이즈 가맹본부 불공정행위 직접 제보받는다 핫라인 구축해
2,공정위 조사 카드로 압박하는 미스터피자 업주들
3,국감 공정위 국감 유통업계 대표들 긴장
4,정무위 공정위 감사 깔창 생리대 폭리 미스터피자 통행세 검토 촉구
5,국감 공정위 국감 유통업계 대표들 긴장
6,공정위 조사 카드로 압박하는 미스터피자 업주들
7,정재찬 공정위원장 미스터피자 가맹계약 위법성 검토 착수
8,공정위 미스터피자 도미노피자 이디야커피 롯데리아 프랜차이즈본부 직권조사
9,공정위 미스터피자 등 외식 프랜차이즈 직권조사


In [8]:
# Stop-word lists (particles, nouns, some verbs).

# Public Korean stop-word file; the words are read from the column whose header is '아'.
stopword =  pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt")['아'].tolist()

# Particles and endings added by hand.
stopwords_post = ['이', '가', '께서', '에서', '서', '이다', '의', '을', '를', '이', '가', '에', '에게', '께', '한테', '에서', '에게서', '로서', '로써', '고', '이라고', '라고', '와', '과', '이랑', '에', '같이', '처럼', '만큼', '만치', '야',
                  '아', '여', '이여', '이시여', '와', '과', '하고', '이다', '이며', '에다', '에다가', '이랑', '랑']

# Project-specific list, stored in the '목록' column of a local file.
my_stopwords = pd.read_table("stopword_list.txt")['목록'].values.tolist()

# Merge the three sources into one set for fast membership tests.
stopword = set().union(stopword, stopwords_post, my_stopwords)

In [9]:
# Morphological tagging.

from konlpy.tag import Okt

okt = Okt()

# Split every cleaned title into stemmed morphemes.
news_tagged = [okt.morphs(title, stem = True) for title in news_entire_to_tokenize[0]]

# Drop stop words.

news_tagged = [[token for token in tokens if token not in stopword] for tokens in tqdm(news_tagged)]

# Drop single-character tokens.

news_tagged = [[token for token in tokens if len(token) > 1] for tokens in tqdm(news_tagged)]

100%|████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 290806.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 230460.55it/s]


In [10]:
# After tagging, replace each title with its token list.

# Work on a copy so that `news_company` keeps the raw titles. Binding the same
# frame under a second name would overwrite them, and the tokenizing cell
# (which reads news_company['title']) would then fail when re-run.
news_ready_for_LDA = news_company.copy()
news_ready_for_LDA['title'] = news_tagged

news_ready_for_LDA

Unnamed: 0.1,Unnamed: 0,date,title,contents,company
689,1205.0,2013.07.30.,"[공정위, 프랜차이즈, 가맹, 점주, 핫라인, 구축]","간담회에는 뚜레쥬르, 카페베네, GS25, 피자헛, 놀부보쌈, 미니스톱, 미스터피자...",미스터피자
690,1206.0,2013.07.29.,"[공정위, 프랜차이즈, 가맹, 본부, 불공정, 행위, 직접, 제보, 받다, 핫라인,...","이날 가맹점사업자 간담회에는 뚜레쥬르, 카페베네, GS25, 피자헛, 놀부보쌈, 미...",미스터피자
3412,499.0,2016.04.08.,"[공정위, 조사, 압박, 업주]",협의회는 현재 MPK(미스터피자코리아) 그룹에 대해 두 가지 불공정거래를 주장하고 ...,MPK그룹
3885,1293.0,2016.10.10.,"[국감, 공정위, 국감, 유통업, 대표, 긴장]","앞서 김진우 점주협의회 회장은 지난달 MPK그룹 앞에서 시위를 열고 ""정우현 회장의...",MPK그룹
4521,323.0,2016.10.11.,"[정무, 공정위, 감사, 깔창, 생리대, 폭리, 행세, 검토, 촉구]","국감에서 '미스터피자' 등 지역 소상공인 가맹분쟁 등을 알고 있는지, 대책은 무엇인...",미스터피자
...,...,...,...,...,...
6018,458.0,2015.04.21.,"[취약, 계층, 공단, 서울, 남부, 지사, 취업, 취약, 계층, 오찬]","맞이해, MPK그룹에 취업한 장애인근로자 8명을 초청해 손 편지와 선물을 전달하며 ...",MPK그룹
5435,1376.0,2015.09.24.,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]","◇ 3위 : MPK, 화장품 제조사 지분 인수 소식에 '上' '미스터피자'로 유명한...",미스터피자
5439,1380.0,2015.09.24.,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]","◇ 3위 : MPK, 화장품 제조사 지분 인수 소식에 '上' '미스터피자'로 유명한...",미스터피자
5853,68.0,2015.09.24.,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]",쌍방울의 주가 급등은 중국 금성그룹과 제주도에 휴양지를 조성하는 1조원 이상 규모의...,MPK그룹


In [11]:
# Data cleaning.

# Remove rows whose 'date' field holds one of these non-date strings.
for junk_value in ('CHIEF EXECUTIVE', 'date'):
    is_junk = news_ready_for_LDA['date'] == junk_value
    news_ready_for_LDA.drop(news_ready_for_LDA.index[is_junk], inplace=True)

# Dates are written like "2016.04.08." (note the trailing dot).
news_ready_for_LDA['date'] = pd.to_datetime(news_ready_for_LDA['date'], format = "%Y.%m.%d.")

In [12]:
# Renumber the rows 0..n-1 in place; the old index labels are discarded.
news_ready_for_LDA.reset_index(drop=True, inplace=True)

# Display the cleaned frame.
news_ready_for_LDA

Unnamed: 0.1,Unnamed: 0,date,title,contents,company
0,1205.0,2013-07-30,"[공정위, 프랜차이즈, 가맹, 점주, 핫라인, 구축]","간담회에는 뚜레쥬르, 카페베네, GS25, 피자헛, 놀부보쌈, 미니스톱, 미스터피자...",미스터피자
1,1206.0,2013-07-29,"[공정위, 프랜차이즈, 가맹, 본부, 불공정, 행위, 직접, 제보, 받다, 핫라인,...","이날 가맹점사업자 간담회에는 뚜레쥬르, 카페베네, GS25, 피자헛, 놀부보쌈, 미...",미스터피자
2,499.0,2016-04-08,"[공정위, 조사, 압박, 업주]",협의회는 현재 MPK(미스터피자코리아) 그룹에 대해 두 가지 불공정거래를 주장하고 ...,MPK그룹
3,1293.0,2016-10-10,"[국감, 공정위, 국감, 유통업, 대표, 긴장]","앞서 김진우 점주협의회 회장은 지난달 MPK그룹 앞에서 시위를 열고 ""정우현 회장의...",MPK그룹
4,323.0,2016-10-11,"[정무, 공정위, 감사, 깔창, 생리대, 폭리, 행세, 검토, 촉구]","국감에서 '미스터피자' 등 지역 소상공인 가맹분쟁 등을 알고 있는지, 대책은 무엇인...",미스터피자
...,...,...,...,...,...
586,458.0,2015-04-21,"[취약, 계층, 공단, 서울, 남부, 지사, 취업, 취약, 계층, 오찬]","맞이해, MPK그룹에 취업한 장애인근로자 8명을 초청해 손 편지와 선물을 전달하며 ...",MPK그룹
587,1376.0,2015-09-24,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]","◇ 3위 : MPK, 화장품 제조사 지분 인수 소식에 '上' '미스터피자'로 유명한...",미스터피자
588,1380.0,2015-09-24,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]","◇ 3위 : MPK, 화장품 제조사 지분 인수 소식에 '上' '미스터피자'로 유명한...",미스터피자
589,68.0,2015-09-24,"[특징, 임단협, 타결, 실패, 노조, 부분파업, 파업, 장기, 우려]",쌍방울의 주가 급등은 중국 금성그룹과 제주도에 휴양지를 조성하는 1조원 이상 규모의...,MPK그룹


In [13]:
from gensim import corpora, models

# Alternative loader kept for reference (disabled):
#ldamodel = models.wrappers.LdaMallet.load(datapath("model"))

# Load the pickled topic model.
# NOTE(review): unpickling executes code from the file; only load pickles from a trusted source.
ldamodel = pd.read_pickle('lda_model.pkl')

# Build the dictionary and bag-of-words corpus from the tokenized titles.
dictionary, corpus = make_corpus(news_ready_for_LDA['title'])

In [14]:
# Turn each document's topic distribution into a one-hot row (one column per topic).

# NOTE(review): assumes the loaded model exposes `num_topics` (gensim's LdaModel
# and its LdaMallet wrapper do) — confirm for the pickled model.
num_topics = ldamodel.num_topics

topic_list = []

for doc_topics in ldamodel[corpus]:
    # Store each probability at its topic id, not at its position in the
    # returned list: a model may leave out low-probability topics, which would
    # otherwise shift the remaining ones into the wrong columns.
    prob_by_topic = np.zeros(num_topics)
    for topic_num, prop_topic in doc_topics:
        prob_by_topic[topic_num] = prop_topic

    is_dominant = prob_by_topic == prob_by_topic.max()

    # 1 for the most probable topic and 0 elsewhere; a k-way tie gives 1/k to each.
    prob_df_after_choice = pd.DataFrame([is_dominant.astype(int) / is_dominant.sum()])

    topic_list.append(prob_df_after_choice)
    

In [15]:
# Stack the per-document rows and number them 0..n-1.
topic_df = pd.concat(topic_list, ignore_index=True)

In [16]:
# Put date, topic columns and company side by side; rows are matched on the index.
news_topic_df = pd.concat(
    [news_ready_for_LDA['date'], topic_df, news_ready_for_LDA['company']],
    axis='columns',
)

In [17]:
# Sum the topic columns of all articles that share the same year.
def grouped_by_date(concat_df, num_topics):
    """Sum topic columns ``0 .. num_topics-1`` per value of ``concat_df['year']``.

    Parameters
    ----------
    concat_df : pd.DataFrame
        Needs integer-labelled topic columns and a ``'year'`` column.
    num_topics : int
        Number of topic columns to aggregate.

    Returns
    -------
    pd.DataFrame
        Indexed by year, with the summed topic columns plus a ``'합계'``
        column holding each row's total.
    """
    topic_columns = list(range(num_topics))

    yearly_totals = concat_df[topic_columns].groupby(concat_df['year']).sum()

    # Row total across all topics.
    yearly_totals['합계'] = yearly_totals.sum(axis=1)

    return yearly_totals

In [18]:
def normalized_topic(topic_sum_df, n_value_cols=9):
    """Divide each row's leading numeric columns by that row's total.

    Parameters
    ----------
    topic_sum_df : pd.DataFrame
        Frame whose first ``n_value_cols`` columns are numeric, the last of
        them being the row total, and which also has a ``'company'`` column.
    n_value_cols : int, default 9
        Number of leading columns to normalize. The default keeps the
        previously fixed width (8 topic columns plus the total).

    Returns
    -------
    pd.DataFrame
        New frame with the normalized columns (the total column becomes 1)
        followed by ``'company'``. ``topic_sum_df`` itself is left unchanged.
    """
    # Copy first so the caller's frame is never written to.
    values = topic_sum_df.iloc[:, 0:n_value_cols].copy()

    # Row-wise division by the last selected column, done in one vectorized step.
    normalized = values.div(values.iloc[:, -1], axis=0)

    normalized['company'] = topic_sum_df['company']
    return normalized

In [19]:
# Drop incomplete rows and tag every article with its publication year.
news_topic_df = news_topic_df.dropna().assign(year=lambda frame: frame['date'].dt.year)

# Sum the topic columns of all articles per year (8 topic columns).
news_topic_normalized = grouped_by_date(news_topic_df, 8)

# Every row gets the company of the first article.
news_topic_normalized['company'] = news_topic_df['company'].iloc[0]

# Convert sums to shares, then move the year out of the index into a column.
news_topic_normalized = normalized_topic(news_topic_normalized).reset_index(drop=False)


In [20]:
# Give the topic columns readable names.

topic_column_names = list(map("Topic_{}".format, range(8)))
columns = ["date", *topic_column_names, '합계', 'company']
news_topic_normalized.columns = columns

In [21]:
# Display the per-year topic shares.
news_topic_normalized

Unnamed: 0,date,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,합계,company
0,2013,0.042929,0.189394,0.25,0.184343,0.050505,0.070707,0.146465,0.065657,1.0,미스터피자
1,2014,0.025641,0.057692,0.50641,0.134615,0.102564,0.076923,0.057692,0.038462,1.0,미스터피자
2,2015,0.010667,0.098,0.296,0.106,0.099333,0.030667,0.355333,0.004,1.0,미스터피자
3,2016,0.026861,0.096954,0.060998,0.083756,0.133122,0.020389,0.571912,0.006007,1.0,미스터피자


In [22]:
# Save the table; the row index is written as an extra leading column.
news_topic_normalized.to_csv('topic_variable.csv')

In [23]:
def ESG_labeling(x):
    """Translate a numeric class code into its grade label.

    Codes 0 to 5 map to 'A', 'A+', 'B', 'B+', 'C' and 'D' respectively.
    Any other value is returned unchanged.
    """
    grade_by_code = (
        (0, 'A'),
        (1, 'A+'),
        (2, 'B'),
        (3, 'B+'),
        (4, 'C'),
        (5, 'D'),
    )

    for code, grade in grade_by_code:
        if x == code:
            return grade

    return x
    

In [24]:
# ESG prediction: reload the saved topic table.
news_data = pd.read_csv('topic_variable.csv')

# Alternative that keeps only the significant topics (disabled):
#news_variable = news_data[['company','Topic_1','Topic_2','Topic_7']]

# Use every topic column: drop the year, the row total and the saved index column.
news_variable = news_data.drop(columns=['date','합계','Unnamed: 0'])

news_variable

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,company
0,0.042929,0.189394,0.25,0.184343,0.050505,0.070707,0.146465,0.065657,미스터피자
1,0.025641,0.057692,0.50641,0.134615,0.102564,0.076923,0.057692,0.038462,미스터피자
2,0.010667,0.098,0.296,0.106,0.099333,0.030667,0.355333,0.004,미스터피자
3,0.026861,0.096954,0.060998,0.083756,0.133122,0.020389,0.571912,0.006007,미스터피자


In [25]:
from sklearn.preprocessing import minmax_scale

# One record: the company name followed by six scores.
# The record is passed as a row so that the score columns get a float dtype;
# a transposed single column would store every value as a generic object.
job_data = pd.DataFrame([['미스터피자', 2.3, 2.1, 2.5, 2.4, 2.4, 1.8]])

# Same data with named columns; `job_data` keeps its positional labels 0..6.
job_variable = job_data.copy()

job_variable.columns = ['company', 'star', 'up', 'wel', 'wl','cul','management']

job_variable

Unnamed: 0,company,star,up,wel,wl,cul,management
0,미스터피자,2.3,2.1,2.5,2.4,2.4,1.8


In [26]:
# Attach the job scores to every yearly row; the join uses the columns both frames share.
df_train = news_variable.merge(job_variable)

# One raw figure per row, expressed in whole multiples of 10**12.
# NOTE(review): all four figures are below 10**12, so this column is 0 for every
# row — confirm that this matches how the prediction model was trained.
company_size_raw = pd.Series([67294953107, 83074096515,100202637253 ,99266261113])
df_train['company_size'] = company_size_raw // 10 ** 12

In [27]:
# Keep only the predictor columns (everything except the company name).

x = df_train.drop(columns=['company'])

x

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,star,up,wel,wl,cul,management,company_size
0,0.042929,0.189394,0.25,0.184343,0.050505,0.070707,0.146465,0.065657,2.3,2.1,2.5,2.4,2.4,1.8,0
1,0.025641,0.057692,0.50641,0.134615,0.102564,0.076923,0.057692,0.038462,2.3,2.1,2.5,2.4,2.4,1.8,0
2,0.010667,0.098,0.296,0.106,0.099333,0.030667,0.355333,0.004,2.3,2.1,2.5,2.4,2.4,1.8,0
3,0.026861,0.096954,0.060998,0.083756,0.133122,0.020389,0.571912,0.006007,2.3,2.1,2.5,2.4,2.4,1.8,0


In [33]:
import joblib

# Load the fitted classifier.
model_from_joblib = joblib.load('ESG_Predict_model.pkl')

# Predicted class code for each yearly row.
news_topic_normalized['expect'] = model_from_joblib.predict(x)

# Convert every code to its grade label (applied element-wise, so several years are handled).
news_topic_normalized['expect'] = news_topic_normalized['expect'].map(ESG_labeling)

news_topic_normalized

Unnamed: 0,date,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,합계,company,expect
0,2013,0.042929,0.189394,0.25,0.184343,0.050505,0.070707,0.146465,0.065657,1.0,미스터피자,B
1,2014,0.025641,0.057692,0.50641,0.134615,0.102564,0.076923,0.057692,0.038462,1.0,미스터피자,B
2,2015,0.010667,0.098,0.296,0.106,0.099333,0.030667,0.355333,0.004,1.0,미스터피자,C
3,2016,0.026861,0.096954,0.060998,0.083756,0.133122,0.020389,0.571912,0.006007,1.0,미스터피자,C


In [34]:
# Reorder the columns: identifiers first, then the eight topic shares
# (positions 1..8), then the predicted grade.
news_info = pd.concat(
    [
        news_topic_normalized[['date','company']],
        news_topic_normalized.iloc[:,1:9],
        news_topic_normalized[['expect']],
    ],
    axis='columns',
)

In [35]:
# Display names for the eight topic columns, in column order.
topic_labels = ['취약계층 지원', '동반성장','CSR 캠페인','산업재해', '개인정보 유출',
                '지역사회 공헌','불공정거래','상생경영']

news_info.columns = ['date','company'] + topic_labels + ['소셜 트렌드 등급']

In [36]:
# Display the final table with labelled topics and the predicted grade.
news_info

Unnamed: 0,date,company,취약계층 지원,동반성장,CSR 캠페인,산업재해,개인정보 유출,지역사회 공헌,불공정거래,상생경영,소셜 트렌드 등급
0,2013,미스터피자,0.042929,0.189394,0.25,0.184343,0.050505,0.070707,0.146465,0.065657,B
1,2014,미스터피자,0.025641,0.057692,0.50641,0.134615,0.102564,0.076923,0.057692,0.038462,B
2,2015,미스터피자,0.010667,0.098,0.296,0.106,0.099333,0.030667,0.355333,0.004,C
3,2016,미스터피자,0.026861,0.096954,0.060998,0.083756,0.133122,0.020389,0.571912,0.006007,C


In [38]:
# Topics as rows, one column per table row.
topic_shares = news_info.iloc[:,2:10].T

# Three largest topics of the row labelled 2.
# NOTE(review): the row label 2 is hard-coded — confirm which year is meant to drive this summary.
top_topics = topic_shares.sort_values(2,ascending=False).index[:3].values

print(news_info['company'][0],"소셜 트렌드",top_topics)

미스터피자 소셜 트렌드 ['불공정거래' 'CSR 캠페인' '산업재해']
