In [56]:
#테스트 진행 위한 준비
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import itertools
import string
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the embedding helpers below read it as a global.
model = SentenceTransformer('all-mpnet-base-v2')
# Alternative checkpoint, kept commented out for switching:
# model = SentenceTransformer('all-MiniLM-L6-v2')

# Document preprocessing function
def preprocess(df_add):
    """Strip punctuation, digits, month names and domain stopwords from documents.

    Args:
        df_add: Iterable of document strings.

    Returns:
        A list with one cleaned string per input document; the surviving
        tokens are joined by single spaces.
    """
    # Punctuation characters are deleted, not replaced by a space, so
    # "plug-in" becomes "plugin" and a URL collapses into one long token
    # (which is why URL-like entries appear in the stopword set below).
    punctuation = set(string.punctuation)

    # Month names and abbreviations. Matching is case-sensitive.
    months = {
        'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
        'september', 'october', 'november', 'december', 'jan', 'feb', 'mar', 'apr',
        'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    }

    # Domain-specific stopwords (more to be added). Matching is case-sensitive.
    # NOTE(review): "consigment" looks like a misspelling of "consignment" — confirm.
    stopwords = {
        "australian", "australia", "duty", "abfgovauimportingexportingand",
        "manufacturingimportinghowtoimportdisposingunenteredabandonedgoods",
        "declare", "consigment", "sorted", "license", "goods", "products", "quota",
        "ii", "russia", "httpswwwabfgovauimporting", "customs", "indexation",
        "working", "available", "subheadings", "cpi", "wwwabfgovau", "tariff",
        "office", "rates", "spirits", "rules", "blue",
    }

    doc_list = []
    for doc in df_add:
        # Character-level pass: drop punctuation and digits.
        kept_chars = [ch for ch in doc if ch not in punctuation and not ch.isdigit()]
        # Token-level pass: drop month names and stopwords.
        tokens = [
            token
            for token in "".join(kept_chars).split()
            if token not in months and token not in stopwords
        ]
        doc_list.append(" ".join(tokens))

    return doc_list

def _ngram_candidates(doc_list, n):
    """Return, for each document, the distinct n-grams made of exactly n words.

    A separate CountVectorizer is fitted per document so that each vocabulary
    only holds n-grams from that document. English stop words are removed with
    scikit-learn's built-in list (``stop_words="english"``).

    Args:
        doc_list: Iterable of document strings.
        n: Number of words per n-gram.

    Returns:
        A list with one array of n-gram strings per document, as returned by
        ``CountVectorizer.get_feature_names_out()``.

    Raises:
        ValueError: Raised by CountVectorizer when a document yields no
            n-grams (for example, it is empty or contains only stop words).
    """
    candidate_list = []
    for doc in doc_list:
        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words="english")
        vectorizer.fit([doc])
        candidate_list.append(vectorizer.get_feature_names_out())

    return candidate_list


# Split each document into unigrams (scikit-learn English stop words removed)
def unigram(doc_list):
    """Return the distinct single-word candidates of every document."""
    return _ngram_candidates(doc_list, 1)


# Split each document into bigrams (scikit-learn English stop words removed)
def bigram(doc_list):
    """Return the distinct two-word candidates of every document."""
    return _ngram_candidates(doc_list, 2)


# Split each document into trigrams (scikit-learn English stop words removed)
def trigram(doc_list):
    """Return the distinct three-word candidates of every document."""
    return _ngram_candidates(doc_list, 3)

# Embed each preprocessed document and its n-gram candidates, then keep the
# 30 candidates most similar to the document
def bigram_embedding1(doc_list, candidate_list):
    """Select, per document, the candidates closest to the document embedding.

    Args:
        doc_list: Preprocessed document strings.
        candidate_list: Per-document candidate n-grams, aligned with
            ``doc_list`` by position.

    Returns:
        A list with one list of at most 30 candidates per document, ordered
        by ascending cosine similarity (the best match comes last).
    """
    top_n = 30
    selected = []

    for position, document in enumerate(doc_list):
        candidates = candidate_list[position]
        document_vector = model.encode([document])
        candidate_vectors = model.encode(candidates)
        similarity = cosine_similarity(document_vector, candidate_vectors)
        # argsort is ascending, so the last top_n positions hold the best scores.
        best_positions = similarity.argsort()[0][-top_n:]
        selected.append([candidates[idx] for idx in best_positions])

    return selected

# Embed each raw (unpreprocessed) document and its n-gram candidates, then
# keep the 30 candidates most similar to the document
def bigram_embedding2(doc_list, candidate_list, raw_docs=None):
    """Select, per document, the candidates closest to the raw-document embedding.

    Args:
        doc_list: Preprocessed documents; only their count is used here.
        candidate_list: Per-document candidate n-grams, aligned by position.
        raw_docs: Raw document strings aligned with ``doc_list``. When omitted,
            the notebook-level ``df_add`` list is used, which is what this
            function has always read.

    Returns:
        A list with one list of at most 30 candidates per document, ordered
        by ascending cosine similarity (the best match comes last).
    """
    if raw_docs is None:
        # Backward-compatible default: fall back to the notebook global.
        raw_docs = df_add

    bigram_keywords2 = []
    top_n = 30

    for i in range(len(doc_list)):
        doc_embeddings = model.encode([raw_docs[i]])
        candidate_embeddings = model.encode(candidate_list[i])
        distances = cosine_similarity(doc_embeddings, candidate_embeddings)
        # argsort is ascending, so the last top_n positions hold the best scores.
        bigram_keywords2.append([candidate_list[i][index] for index in distances.argsort()[0][-top_n:]])

    return bigram_keywords2

# Embed the extracted keywords
def keyword_embedding(bigram_keywords):
    """Encode every extracted keyword with the shared sentence-embedding model.

    The iteration is driven by ``bigram_keywords`` itself rather than by the
    notebook-level ``doc_list``, so the function works for any input length.

    Args:
        bigram_keywords: Per-document lists of keyword strings.

    Returns:
        A list with, for each document, a list holding one ``model.encode``
        result per keyword, in the same order as the input.
    """
    return [
        [model.encode(keyword) for keyword in keywords]
        for keywords in bigram_keywords
    ]

# Load the selected keywords (words) and their embeddings
def get_keyword1():
    """Read the keyword CSV and embed every entry of its "번역" column.

    Returns:
        A tuple ``(keyword, keyword_embeddings)``: the column values as a
        list, and a list with one ``model.encode`` result per value.
    """
    keyword_table = pd.read_csv("호주_키워드_HS_KSIC(description 추가).csv", index_col = False)
    keyword = list(keyword_table["번역"])
    keyword_embeddings = [model.encode(term) for term in keyword]

    return keyword, keyword_embeddings

# Load the selected keyword descriptions and their embeddings
def get_keyword2():
    """Read the keyword CSV and embed every entry of its "description" column.

    Returns:
        A tuple ``(keyword, keyword_embeddings)``: the column values as a
        list, and a list with one ``model.encode`` result per value.
    """
    keyword_table = pd.read_csv("호주_키워드_HS_KSIC(description 추가).csv", index_col = False)
    keyword = list(keyword_table["description"])
    keyword_embeddings = [model.encode(text) for text in keyword]

    return keyword, keyword_embeddings

# Compare the extracted n-grams with the selected keywords by cosine similarity
def similarity_test(bigram_embeddings, bigram_keywords, keyword_embeddings, keyword):
    """Collect every (n-gram, keyword) pair whose cosine similarity exceeds 0.45.

    Args:
        bigram_embeddings: Per-document sequences of n-gram embedding vectors.
        bigram_keywords: Per-document n-gram strings, aligned with
            ``bigram_embeddings``.
        keyword_embeddings: Embedding vectors of the reference keywords.
        keyword: Reference keyword labels, aligned with ``keyword_embeddings``.

    Returns:
        A tuple ``(bigram_result, keyword_result, cosine_result)``. Each is a
        list with one entry per document holding, position by position, the
        matched n-gram, the matched keyword, and the similarity rounded to
        three decimals and converted to ``str``. Pairs are listed n-gram by
        n-gram, and keyword by keyword within each n-gram.
    """
    threshold = 0.45

    bigram_result = []
    keyword_result = []
    cosine_result = []

    for doc_index, ngram_vectors in enumerate(bigram_embeddings):
        b_result = []
        k_result = []
        c_result = []

        # cosine_similarity rejects empty inputs, so only call it when both
        # sides have at least one vector.
        if len(ngram_vectors) > 0 and len(keyword_embeddings) > 0:
            # One call gives the full n-gram x keyword similarity matrix.
            scores = cosine_similarity(ngram_vectors, keyword_embeddings)
            for i, row in enumerate(scores):
                for j, score in enumerate(row):
                    if score > threshold:
                        b_result.append(bigram_keywords[doc_index][i])
                        k_result.append(keyword[j])
                        # Convert the scalar element, not a (1, 1) array:
                        # array-to-scalar conversion is deprecated in NumPy.
                        c_result.append(str(round(float(score), 3)))

        bigram_result.append(b_result)
        keyword_result.append(k_result)
        cosine_result.append(c_result)

    return bigram_result, keyword_result, cosine_result

# Build a DataFrame of the best matches, highest similarity first
def make_df(bigram_result, keyword_result, cosine_result):
    """Summarise the five best (n-gram, keyword) matches of every document.

    The three arguments are walked in parallel, so the number of rows follows
    the arguments themselves instead of the notebook-level ``df_add``. Matches
    are ranked by the numeric value of their score.

    Args:
        bigram_result: Per-document lists of matched n-grams.
        keyword_result: Per-document lists of matched keywords.
        cosine_result: Per-document lists of similarity scores (numeric
            strings, as produced by ``similarity_test``, or numbers).

    Returns:
        A DataFrame with one row per document and the columns ``ngram``,
        ``keyword`` and ``distance``. Each cell holds up to five values joined
        by "/", best match first; a document without matches gets empty strings.
    """
    top_k = 5

    bigram_list = []
    keyword_list = []
    distance = []

    for ngrams, keywords, scores in zip(bigram_result, keyword_result, cosine_result):
        # Rank in plain Python: avoids building pandas Series from empty
        # lists and compares scores as numbers rather than as text.
        ranked = sorted(
            zip(ngrams, keywords, scores),
            key=lambda match: float(match[2]),
            reverse=True,
        )[:top_k]

        bigram_list.append("/".join(str(match[0]) for match in ranked))
        keyword_list.append("/".join(str(match[1]) for match in ranked))
        distance.append("/".join(str(match[2]) for match in ranked))

    return pd.DataFrame({
        'ngram': bigram_list,
        'keyword': keyword_list,
        'distance': distance,
    })


In [3]:
# Load the test documents; the 'title' and 'text' columns are used below.
df = pd.read_csv("테스트_문서5.csv")

df_title = list(df['title'])
df_text = list(df['text'])
df_add = []

# Title and body are treated as one document, so concatenate them row by row.
# Only the title is lower-cased here.
# NOTE(review): the two parts are joined with "" (no separator) — confirm the
# data itself supplies the whitespace between title and body.
for i in range(len(df)):
    df_add.append(df_title[i].lower()+""+df_text[i])

In [4]:
# Run the test
# Preprocess the latest 50 documents
doc_list = preprocess(df_add)


# Extract unigrams / bigrams / trigrams from each document
candidate_list1 = unigram(doc_list)
candidate_list2 = bigram(doc_list)
candidate_list3 = trigram(doc_list)

# Compare the uni/bi/trigrams with the preprocessed documents and keep the top 30 n-grams
unigram_keywords1 = bigram_embedding1(doc_list, candidate_list1) 
bigram_keywords1 = bigram_embedding1(doc_list, candidate_list2) 
trigram_keywords1 = bigram_embedding1(doc_list, candidate_list3) 

# Compare the uni/bi/trigrams with the raw documents and keep the top 30 n-grams
# (bigram_embedding2 reads the raw text from the global df_add; doc_list only sets the count)
unigram_keywords2 = bigram_embedding2(doc_list, candidate_list1) 
bigram_keywords2 = bigram_embedding2(doc_list, candidate_list2) 
trigram_keywords2 = bigram_embedding2(doc_list, candidate_list3) 

In [8]:
# Embed the top-30 uni/bi/trigrams selected against the preprocessed documents
unigram_embeddings1 = keyword_embedding(unigram_keywords1) 
bigram_embeddings1 = keyword_embedding(bigram_keywords1) 
trigram_embeddings1 = keyword_embedding(trigram_keywords1) 

# Embed the top-30 uni/bi/trigrams selected against the raw documents
unigram_embeddings2 = keyword_embedding(unigram_keywords2) 
bigram_embeddings2 = keyword_embedding(bigram_keywords2) 
trigram_embeddings2 = keyword_embedding(trigram_keywords2) 

In [42]:
# Description-keyword comparison / raw documents - trigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(trigram_embeddings2, trigram_keywords2, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])


Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",substituted exciseequivalent gazette,Halogenated derivatives of hydrocarbons.,0.351
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",vehicles electric vehicles/certain electric ve...,Motor?cars and other motor vehicles principall...,0.481/0.458/0.423/0.422/0.418
2,의약품 관세 변경,ingredients manufacture medicaments/concession...,Peptones and their derivatives; other protein ...,0.541/0.507/0.494/0.494/0.484
3,고급 자동차 세금 고시,vehicles increased vehicles/vehicles increased...,Motor?cars and other motor vehicles principall...,0.411/0.403/0.392/0.391/0.387
4,특정 연료 관세 변경,,,


In [43]:
# Description-keyword comparison / raw documents - bigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(bigram_embeddings2, bigram_keywords2, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])


Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",,,
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",certain electric/vehicles plugin/vehicles elec...,Electric instantaneous or storage water heater...,0.523/0.479/0.478/0.469/0.459
2,의약품 관세 변경,hygiene use/treatment equipment/treatment equi...,Perfumes and toilet waters./Instruments and ap...,0.537/0.529/0.529/0.526/0.513
3,고급 자동차 세금 고시,efficient vehicles/luxury car/increased vehicl...,Motor?cars and other motor vehicles principall...,0.447/0.446/0.401/0.368/0.35
4,특정 연료 관세 변경,litre petroleum/gas litre/gas litre/gas litre/...,Petroleum oils and oils obtained from bitumino...,0.463/0.418/0.418/0.418/0.401


In [44]:
# Description-keyword comparison / raw documents - unigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(unigram_embeddings2, unigram_keywords2, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",fuel/fuel/fuel/fuel/fuel,"Coal; briquettes, ovoids and similar solid fue...",0.464/0.464/0.424/0.424/0.424
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",vehicles/car/electric/fuel/fuel,Motor?cars and other motor vehicles principall...,0.62/0.495/0.464/0.464/0.464
2,의약품 관세 변경,ingredients/items/ingredients/soaps/reagents,Chocolate and other food preparations containi...,0.566/0.556/0.552/0.548/0.539
3,고급 자동차 세금 고시,vehicles/car/fuel/fuel/fuel,Motor?cars and other motor vehicles principall...,0.62/0.495/0.464/0.464/0.424
4,특정 연료 관세 변경,petroleum/fuels/fuels/gas/gas,Petroleum oils and oils obtained from bitumino...,0.639/0.581/0.581/0.579/0.579


In [45]:
# Description-keyword comparison / preprocessed documents - trigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(trigram_embeddings1, trigram_keywords1, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",including beers fuel/including beers fuel/incl...,Petrtoleum gases and other gaseous hydrocarbon...,0.388/0.388/0.388/0.379/0.379
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",vehicles electric vehicles/certain electric ve...,Motor?cars and other motor vehicles principall...,0.481/0.458/0.423/0.422/0.418
2,의약품 관세 변경,hygiene capable use/concessional treatment equ...,"Perfumes and toilet waters./Medical, surgical,...",0.52/0.507/0.495/0.494/0.494
3,고급 자동차 세금 고시,vehicles increased vehicles/vehicles increased...,Motor?cars and other motor vehicles principall...,0.411/0.403/0.392/0.391/0.387
4,특정 연료 관세 변경,provisions listed section,Parts and accessories of articles,0.391


In [46]:
# Description-keyword comparison / preprocessed documents - bigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(bigram_embeddings1, bigram_keywords1, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",beers fuel/beers fuel/atogovaufuelexciserates ...,"Coal; briquettes, ovoids and similar solid fue...",0.435/0.435/0.376/0.376/0.357
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",certain electric/vehicles plugin/vehicles elec...,Electric instantaneous or storage water heater...,0.523/0.479/0.478/0.469/0.459
2,의약품 관세 변경,hygiene use/treatment equipment/treatment equi...,Perfumes and toilet waters./Instruments and ap...,0.537/0.529/0.529/0.526/0.513
3,고급 자동차 세금 고시,efficient vehicles/luxury car/increased vehicl...,Motor?cars and other motor vehicles principall...,0.447/0.446/0.401/0.368/0.35
4,특정 연료 관세 변경,provisions listed/provisions listed/provisions...,Parts and accessories of articles/Animal produ...,0.455/0.454/0.429/0.418/0.418


In [47]:
# Description-keyword comparison / preprocessed documents - unigram results
# NOTE(review): keyword_embeddings2 / keyword2 are not assigned in the visible cells;
# presumably they come from get_keyword2() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(unigram_embeddings1, unigram_keywords1, keyword_embeddings2, keyword2)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",fuel/fuel/provisions/provisions/fuel,"Coal; briquettes, ovoids and similar solid fue...",0.464/0.464/0.452/0.436/0.424
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",vehicles/car/fuel/fuel/electric,Motor?cars and other motor vehicles principall...,0.62/0.495/0.464/0.464/0.464
2,의약품 관세 변경,ingredients/ingredients/soaps/reagents/reagents,Chocolate and other food preparations containi...,0.566/0.552/0.548/0.539/0.531
3,고급 자동차 세금 고시,vehicles/car/fuel/fuel/fuel,Motor?cars and other motor vehicles principall...,0.62/0.495/0.464/0.464/0.424
4,특정 연료 관세 변경,petroleum/fuels/fuels/gas/gas,Petroleum oils and oils obtained from bitumino...,0.639/0.581/0.581/0.579/0.579


In [57]:
# Word-keyword comparison / raw documents - trigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(trigram_embeddings2, trigram_keywords2, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])
  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])


Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",,,
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",certain electric vehicles/vehicles electric ve...,Automotive Parts/car/car/car/Automotive Parts,0.55/0.535/0.497/0.491/0.478
2,의약품 관세 변경,containers medicaments bylaw/effect containers...,Medical container/Medical container/Medical co...,0.725/0.713/0.636/0.619/0.58
3,고급 자동차 세금 고시,notice luxury car/website luxury car/increases...,car/car/car/car/Automotive Parts,0.573/0.54/0.494/0.491/0.453
4,특정 연료 관세 변경,,,


In [58]:
# Word-keyword comparison / raw documents - bigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(bigram_embeddings2, bigram_keywords2, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])


Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",,,
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",vehicles electric/vehicles notice/efficient ve...,car/car/car/car/Automotive Parts,0.553/0.541/0.498/0.496/0.495
2,의약품 관세 변경,containers medicaments/containers medicaments/...,Medical container/Medical container parts/Clea...,0.825/0.739/0.662/0.639/0.597
3,고급 자동차 세금 고시,luxury car/effect fuel/efficient vehicles/car ...,car/propane/car/car/butane,0.669/0.524/0.498/0.495/0.476
4,특정 연료 관세 변경,litre petroleum/litre petroleum/gas litre/gas ...,crude oil/LNG (liquid natural gas)/propane/LNG...,0.576/0.561/0.543/0.541/0.525


In [59]:
# Word-keyword comparison / raw documents - unigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(unigram_embeddings2, unigram_keywords2, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",fuel/fuel/fuel/fuel/fuel,propane/butane/LNG (liquid natural gas)/crude ...,0.65/0.559/0.53/0.53/0.498
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",car/vehicles/fuel/vehicles/hydrogen,car/car/propane/Automotive Parts/ammonia water,1.0/0.751/0.65/0.592/0.586
2,의약품 관세 변경,medical/medicament/medicaments/treatment/presc...,medicine/medicine/medicine/medicine/medicine,0.805/0.695/0.687/0.635/0.629
3,고급 자동차 세금 고시,car/vehicles/fuel/vehicles/vehicles,car/car/propane/Automotive Parts/machinery,1.0/0.751/0.65/0.592/0.56
4,특정 연료 관세 변경,petroleum/ethanol/gas/fuels/fuel,crude oil/alcohol/propane/propane/propane,0.8/0.744/0.679/0.673/0.65


In [60]:
# Word-keyword comparison / preprocessed documents - trigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(trigram_embeddings1, trigram_keywords1, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

  df_check['bigram'] = pd.Series(bigram_result[i])
  df_check['keyword'] = pd.Series(keyword_result[i])
  df_check['distance'] = pd.Series(cosine_result[i])


Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",including beers fuel/including beers fuel,propane/butane,0.501/0.46
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",certain electric vehicles/vehicles electric ve...,Automotive Parts/car/car/car/car,0.55/0.535/0.531/0.497/0.491
2,의약품 관세 변경,containers medicaments bylaw/containers medica...,Medical container/Medical container parts/Clea...,0.725/0.636/0.626/0.603/0.585
3,고급 자동차 세금 고시,notice luxury car/website luxury car/increases...,car/car/car/car/Automotive Parts,0.573/0.54/0.494/0.491/0.453
4,특정 연료 관세 변경,,,


In [68]:
# Word-keyword comparison / preprocessed documents - bigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(bigram_embeddings1, bigram_keywords1, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",beers fuel/beers fuel/beers fuel/beers fuel/be...,beverage/propane/alcohol/butane/LNG (liquid na...,0.589/0.586/0.572/0.513/0.452
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",luxury car/vehicles electric/vehicles notice/e...,car/car/car/car/car,0.669/0.553/0.541/0.498/0.496
2,의약품 관세 변경,containers medicaments/containers medicaments/...,Medical container/Medical container parts/Clea...,0.825/0.739/0.662/0.639/0.597
3,고급 자동차 세금 고시,luxury car/effect fuel/efficient vehicles/car ...,car/propane/car/car/butane,0.669/0.524/0.498/0.495/0.476
4,특정 연료 관세 변경,fuel temporary/gas litre/gas litre/fuel tempor...,propane/propane/LNG (liquid natural gas)/butan...,0.605/0.543/0.541/0.538/0.482


In [62]:
# Word-keyword comparison / preprocessed documents - unigram results
# NOTE(review): keyword_embeddings1 / keyword1 are not assigned in the visible cells;
# presumably they come from get_keyword1() — confirm.
ngram_result, keyword_result, cosine_result = similarity_test(unigram_embeddings1, unigram_keywords1, keyword_embeddings1, keyword1)

# Place the reference table and the match summary side by side (column-wise concat).
df1 = pd.read_csv("테스트_문서5_비교.csv", index_col = False)
df2 = make_df(ngram_result, keyword_result, cosine_result)
df3 = pd.concat([df1,df2],axis=1)
# Bare expression so the notebook displays the table.
df3

Unnamed: 0,summary,ngram,keyword,distance
0,"주류, 맥주, 연료 제품을 포함한 특정 소비재 관련 새로운 관세율 명시",beers/beers/fuel/fuel/beers,beverage/alcohol/propane/butane/wine,0.713/0.713/0.65/0.559/0.552
1,"전기 차, 하이브리드 차량, 연료 효율 높은 자동차에 대한 대한 관세 철폐",car/vehicles/fuel/vehicles/hydrogen,car/car/propane/Automotive Parts/ammonia water,1.0/0.751/0.65/0.592/0.586
2,의약품 관세 변경,medical/medicament/medicaments/treatment/presc...,medicine/medicine/medicine/medicine/medicine,0.805/0.695/0.687/0.635/0.629
3,고급 자동차 세금 고시,car/vehicles/fuel/vehicles/vehicles,car/car/propane/Automotive Parts/machinery,1.0/0.751/0.65/0.592/0.56
4,특정 연료 관세 변경,petroleum/ethanol/gas/fuels/fuel,crude oil/alcohol/propane/propane/propane,0.8/0.744/0.679/0.673/0.65


In [66]:
# Display the raw (title + body) text of the second document.
df_add[1]

'removal of customs duty on certain electric vehicles australian customs notice no. 2022/34 removal of customs duty on certain electric vehicles from 1 july 2022, customs duty on electric vehicles, plug-in hybrid vehicles and hydrogen fuel-cell vehicles with a customs value less than the fuel efficient luxury car tax threshold will have a ‘free’ rate of duty. the ‘free’ rate of duty applies to all goods that meet this requirement, except those from russia and belarus which currently have an additional duty of 35 per cent applied. implementation notice of intention to propose customs tariff alterations (no. 6) 2022 (the notice) published on 21 july 2022 implements this measure. the text of the notice, including new additional note 6 to chapter 87 and the new tariff classifications 8703.60.12, 8703.70.12, 8703.80.12 and 8703.90.12, is available at attachment a. the notice operates from 22 july 2022 and applies to goods entered for home consumption from 1 july 2022. lct threshold for 2022