In [None]:
import shutil
import subprocess
import sys
from pathlib import Path

# Environment bootstrap: only runs when the notebook is executing inside Google Colab.
if "google.colab" in sys.modules:
    ## Fonts: install the Nanum font package and rebuild the system font cache.
    subprocess.run(["sudo", "apt-get", "install", "-y", "fonts-nanum"])
    subprocess.run(["sudo", "fc-cache", "-fv"])
    # Clear matplotlib's cache directory under the user's home.
    # subprocess.run() without a shell never expands "~", so the previous
    # `rm -rf ~/.cache/matplotlib` targeted a literal "~" directory and removed nothing.
    # ignore_errors=True keeps the `rm -rf` behavior of not failing on a missing directory.
    shutil.rmtree(Path.home() / ".cache" / "matplotlib", ignore_errors=True)
    # Install the KoBERT tokenizer, sentencepiece and datasets.
    # `sys.executable -m pip` installs into the interpreter that runs this kernel.
    subprocess.run([sys.executable, "-m", "pip", "install", "git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf"])
    subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece"])
    subprocess.run([sys.executable, "-m", "pip", "install", "datasets"])

In [1]:
import sys
import torch

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Resolve the project root: a relative path locally, the mounted Drive folder on Colab.
if 'google.colab' not in sys.modules:
    path = '../'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/Othercomputers/내 MacBook Air/Documents/personal_llm_projects/dacon_merong/'

# Template for CSV files under <path>/data/; fill in the file stem with .format(name).
data_path = ''.join([path, 'data/', '{}.csv'])

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Fix Korean text rendering in plots.
# matplotlib does not support Korean fonts out of the box, so a Korean-capable
# font family is selected per platform below.
# platform: used to detect the operating system
import platform

# font_manager : font lookup module
# rc : helper that changes the active font
from matplotlib import font_manager, rc

# Unicode setting: applied once for every platform branch.
plt.rcParams['axes.unicode_minus'] = False

if 'google.colab' in sys.modules:
    # Colab: use the NanumBarunGothic family.
    plt.rc('font', family='NanumBarunGothic')
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')  # macOS
elif platform.system() == 'Windows':
    # Use a dedicated name here: the previous code assigned this to `path`,
    # overwriting the project-root `path` defined in the setup cell.
    font_path = 'c:/Windows/Fonts/malgun.ttf'  # Windows
    font_name = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font_name)
else:
    print("Unknown System")

# 가설과 검정

### 1. 카테고리별로 최대한 비슷한 단어를 찾아보기(smae_sample 평균 아래 기준)

In [None]:
# Read the two CSV inputs (in this order) from the data directory template.
fcs, train = (
    pd.read_csv(data_path.format(stem))
    for stem in ('fundamental_cos_sim', 'train')
)


In [14]:
# Load the KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
# NOTE: loading prints a warning that the checkpoint's tokenizer class is
# 'XLNetTokenizer' while it is being loaded through 'KoBERTTokenizer'.
from kobert_tokenizer import KoBERTTokenizer
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [55]:
# Keep the training rows whose value in fcs column '0' is below that column's mean.
# The boolean mask comes from `fcs`, so rows are matched to `train` by index.
train_lowmean = train[fcs['0'].lt(fcs['0'].mean())]

In [56]:
# Sorted distinct accident categories ('인적사고'); missing values are labelled '없음'.
acces = np.sort(train_lowmean['인적사고'].fillna('없음').unique())
acces

array(['감전', '교통사고', '기타', '깔림', '끼임', '넘어짐(기타)', '넘어짐(물체에 걸림)',
       '넘어짐(미끄러짐)', '떨어짐(10미터 이상)', '떨어짐(2미터 미만)', '떨어짐(2미터 이상 ~ 3미터 미만)',
       '떨어짐(3미터 이상 ~ 5미터 미만)', '떨어짐(5미터 이상 ~ 10미터 미만)', '떨어짐(분류불능)',
       '물체에 맞음', '부딪힘', '분류불능', '없음', '절단, 베임', '질병', '질식', '찔림', '화상'],
      dtype=object)

In [57]:
# Count how often each decoded token appears across all low-mean rows.
# Imported here so the cell does not depend on another cell's imports.
from collections import Counter

sample = train_lowmean

# Step 1: tally raw token ids per text. This avoids the previous loop variable
# named `input`, which shadowed the builtin for the rest of the notebook.
id_counts = Counter()
for text in sample['재발방지대책 및 향후조치계획']:
    id_counts.update(tokenizer.encode(text))

# Step 2: decode each distinct id once (instead of once per occurrence) and
# accumulate by decoded string, so ids that decode to the same text are merged.
mapper = Counter()
for token_id, count in id_counts.items():
    mapper[tokenizer.decode(token_id)] += count

# (token, count) pairs sorted by count, descending; ties keep first-seen order.
sorted_data = mapper.most_common()

In [58]:
# Build a token x category count table: column 'all' holds the overall counts,
# followed by one integer column per accident category in `acces`.
# Imported here so the cell does not depend on another cell's imports.
from collections import Counter

temp1 = pd.DataFrame(sorted_data, columns=['kind', 'all'])

# `acces` was built from the column with missing values filled as '없음';
# apply the same fill here so rows with a missing category are counted under '없음'
# instead of being silently dropped by the equality filter.
accident_labels = train_lowmean['인적사고'].fillna('없음')

for acc in acces:
    sample = train_lowmean.loc[accident_labels == acc, '재발방지대책 및 향후조치계획']

    # Tally token ids for this category, then decode each distinct id once.
    id_counts = Counter()
    for text in sample:
        id_counts.update(tokenizer.encode(text))

    mapper = Counter()
    for token_id, count in id_counts.items():
        mapper[tokenizer.decode(token_id)] += count

    # (token, count) pairs sorted by count, descending; ties keep first-seen order.
    sorted_sample_data = mapper.most_common()

    # Passing `columns=` up front also works when a category yields no tokens,
    # where assigning `.columns` afterwards would raise a length mismatch.
    temp2 = pd.DataFrame(sorted_sample_data, columns=['kind', acc])

    # Left-join on the token so every token of the overall table is kept;
    # tokens absent from this category get 0.
    temp1 = pd.merge(temp1, temp2, how='left', on='kind').fillna(0)
    temp1[acc] = temp1[acc].astype(int)




In [62]:
# Token count table indexed by token ('kind'); set_index returns a new frame,
# so temp1 itself is left unchanged.
word_set = temp1.set_index('kind')

In [63]:
# Display the token-by-category count table.
word_set

Unnamed: 0_level_0,all,감전,교통사고,기타,깔림,끼임,넘어짐(기타),넘어짐(물체에 걸림),넘어짐(미끄러짐),떨어짐(10미터 이상),...,떨어짐(분류불능),물체에 맞음,부딪힘,분류불능,없음,"절단, 베임",질병,질식,찔림,화상
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
작업,13575,36,37,1054,277,1632,1041,733,1082,94,...,410,2287,1166,176,84,937,238,5,178,137
.,11411,32,77,1022,279,1215,1016,772,1059,110,...,305,1635,898,137,141,713,280,8,147,106
및,11407,34,71,848,303,1244,952,757,1073,133,...,313,1713,942,122,133,716,215,11,140,134
",",11393,32,62,783,325,1335,840,690,941,110,...,338,1780,938,118,136,805,171,10,168,163
[CLS],11335,32,77,1012,283,1189,1006,739,1066,111,...,306,1648,904,138,141,716,273,9,149,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
계기로,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
의류,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
완공,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
(0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
# Normalise every column by its own '[CLS]' count, so each value becomes the
# token count relative to the number of '[CLS]' tokens in that column.
word_ratio = word_set.div(word_set.loc['[CLS]'], axis='columns')
word_ratio

Unnamed: 0_level_0,all,감전,교통사고,기타,깔림,끼임,넘어짐(기타),넘어짐(물체에 걸림),넘어짐(미끄러짐),떨어짐(10미터 이상),...,떨어짐(분류불능),물체에 맞음,부딪힘,분류불능,없음,"절단, 베임",질병,질식,찔림,화상
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
작업,1.197618,1.1250,0.480519,1.041502,0.978799,1.372582,1.034791,0.991881,1.015009,0.846847,...,1.339869,1.387743,1.289823,1.275362,0.595745,1.308659,0.871795,0.555556,1.194631,1.304762
.,1.006705,1.0000,1.000000,1.009881,0.985866,1.021867,1.009940,1.044655,0.993433,0.990991,...,0.996732,0.992112,0.993363,0.992754,1.000000,0.995810,1.025641,0.888889,0.986577,1.009524
및,1.006352,1.0625,0.922078,0.837945,1.070671,1.046257,0.946322,1.024357,1.006567,1.198198,...,1.022876,1.039442,1.042035,0.884058,0.943262,1.000000,0.787546,1.222222,0.939597,1.276190
",",1.005117,1.0000,0.805195,0.773715,1.148410,1.122792,0.834990,0.933694,0.882739,0.990991,...,1.104575,1.080097,1.037611,0.855072,0.964539,1.124302,0.626374,1.111111,1.127517,1.552381
[CLS],1.000000,1.0000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
계기로,0.000088,0.0000,0.000000,0.000000,0.000000,0.000000,0.000994,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
의류,0.000088,0.0000,0.000000,0.000000,0.000000,0.000841,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
완공,0.000088,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000607,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
(0,0.000088,0.0000,0.000000,0.000000,0.000000,0.000000,0.000994,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [143]:
# Display the sorted accident category labels again for reference.
acces

array(['감전', '교통사고', '기타', '깔림', '끼임', '넘어짐(기타)', '넘어짐(물체에 걸림)',
       '넘어짐(미끄러짐)', '떨어짐(10미터 이상)', '떨어짐(2미터 미만)', '떨어짐(2미터 이상 ~ 3미터 미만)',
       '떨어짐(3미터 이상 ~ 5미터 미만)', '떨어짐(5미터 이상 ~ 10미터 미만)', '떨어짐(분류불능)',
       '물체에 맞음', '부딪힘', '분류불능', '없음', '절단, 베임', '질병', '질식', '찔림', '화상'],
      dtype=object)

In [144]:
# Category under inspection and the minimum per-category ratio a token must exceed.
acc = '감전'
ratio = 0.1

# Tokens that occur overall and whose ratio within `acc` is above the threshold.
word_ratio_temp = word_ratio[word_ratio['all'].gt(0) & word_ratio[acc].gt(ratio)]

# sobad: category ratio divided by overall ratio (large = token over-represented in `acc`).
# Tokens outside word_ratio_temp get NaN in 'ratrat'.
sobad = pd.concat(
    [
        word_ratio['all'],
        word_ratio[acc],
        word_ratio_temp[acc].div(word_ratio_temp['all']).rename('ratrat'),
    ],
    axis=1,
)

# sohappy: the inverse, overall ratio divided by category ratio.
sohappy = pd.concat(
    [
        word_ratio['all'],
        word_ratio[acc],
        word_ratio_temp['all'].div(word_ratio_temp[acc]).rename('ratrat'),
    ],
    axis=1,
)


In [None]:
# Show the 50 tokens with the highest 'ratrat' in sobad.
sobad.sort_values('ratrat', ascending=False).head(50)


Unnamed: 0_level_0,all,감전,ratrat
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
누,0.002029,0.125,61.603261
전기,0.004323,0.15625,36.14477
압,0.010234,0.25,24.428879
전원,0.008205,0.125,15.235215
선,0.050904,0.625,12.277946
접,0.015704,0.1875,11.939958
우,0.019232,0.125,6.499427
호,0.029819,0.15625,5.239922
손,0.024261,0.125,5.152273
여부,0.0427,0.15625,3.659285


In [141]:
# Show the 50 tokens with the highest 'ratrat' in sohappy.
sohappy.sort_values('ratrat', ascending=False).head(50)

Unnamed: 0_level_0,all,깔림,ratrat
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
로,0.222056,0.113074,1.963804
구,0.216145,0.123675,1.747684
근로자,0.275342,0.176678,1.558435
해,0.1985,0.134276,1.478304
확인,0.25708,0.183746,1.399108
위험,0.222673,0.162544,1.369924
간,0.138156,0.106007,1.303273
사,0.161182,0.123675,1.303273
전,0.345037,0.265018,1.301941
실시,0.611469,0.480565,1.272395


In [117]:
# Debug view: the three Series that are concatenated column-wise to build `sobad`.
[ word_ratio['all'], word_ratio[acc] ,(word_ratio_temp[acc]/ word_ratio_temp['all'])]

[kind
 작업       1.197618
 .        1.006705
 및        1.006352
 ,        1.005117
 [CLS]    1.000000
            ...   
 계기로      0.000088
 의류       0.000088
 완공       0.000088
 (0       0.000088
 00       0.000088
 Name: all, Length: 2810, dtype: float64,
 kind
 작업       0.978799
 .        0.985866
 및        1.070671
 ,        1.148410
 [CLS]    1.000000
            ...   
 계기로      0.000000
 의류       0.000000
 완공       0.000000
 (0       0.000000
 00       0.000000
 Name: 깔림, Length: 2810, dtype: float64,
 kind
 작업       0.817288
 .        0.979300
 및        1.063913
 ,        1.142564
 [CLS]    1.000000
            ...   
 휴        1.726423
 압        1.726423
 집중       1.741435
 연        1.393148
 벽        1.054026
 Length: 338, dtype: float64]

kind
납      40.053004
갤럭시    40.053004
갓      40.053004
독립     40.053004
빈소     40.053004
         ...    
발행      0.000000
발표      0.000000
발전소     0.000000
발전      0.000000
힙       0.000000
Length: 2810, dtype: float64

In [31]:
# NOTE(review): `sorted_sample_data` is reassigned on every iteration of the
# per-category loop, so this shows whichever category was processed last.
pd.DataFrame(sorted_sample_data)

Unnamed: 0,0,1
0,작업,4214
1,안전,4081
2,교육,3569
3,[CLS],3460
4,[SEP],3460
...,...,...
1835,차례,1
1836,무릎,1
1837,맹,1
1838,완공,1
