In [1]:
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
import pandas as pd
import re

df = pd.read_csv("호주_세관_v2(new20).csv")

df_title = list(df['title'])
df_text = list(df['text'])
df_add = []

for i in range(len(df)):
    df_add.append(df_title[i].lower()+""+df_text[i])

df_keyword = pd.read_csv("호주_번역_100.csv", index_col = False)

In [3]:
df_add = df_add[:2]


In [4]:
#전처리
import string
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

doc_list = []
for doc in df_add:

    #구두점 제거
    doc1 = "".join([i for i in doc if i not in string.punctuation]).strip()

    #숫자 제거
    doc2 = "".join([i for i in doc1 if not i.isdigit()])
    
    #월 제거
    month = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december', 'jan', 'feb', 'mar', 'apr',
             'may', 'jun','jul', 'aug', 'sep', 'oct', 'nov', 'dec']   

    doc3 = " ".join([i for i in doc2.split() if i not in month])
    doc_list.append(doc3)

In [5]:
#전처리한 문서에서 바이그램 추출
n_gram_range = (2, 2)
stop_words = "english"

candidate_list = []
for doc in doc_list:
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
    candidate = count.get_feature_names_out()
    candidate_list.append(candidate)


In [6]:
#문서 전체와 문서에서 추출한 키워드 수치화
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
model = SentenceTransformer('all-mpnet-base-v2')
bigram_keywords = []
top_n = 10

for i in range(len(doc_list)):
    doc_embeddings = model.encode([doc_list[i]])
    candidate_embeddings = model.encode(candidate_list[i])
    distances = cosine_similarity(doc_embeddings, candidate_embeddings)
    bigram_keywords.append([candidate_list[i][index] for index in distances.argsort()[0][-top_n:]])

In [7]:
bigram_keywords[1]

['vehicles australian',
 'tariff working',
 'electric vehicles',
 'vehicles duty',
 'tariff classifications',
 'propose tariff',
 'car tax',
 'new tariff',
 'tariff alterations',
 'revised tariff']

In [8]:
#문서와 가장 유사한 키워드 벡터화
bigram_embeddings = []
for i in range(len(doc_list)):
    bigram_embedding = []
    for keyword in bigram_keywords[i]:
        bigram_embedding.append(model.encode(keyword))
    bigram_embeddings.append(bigram_embedding)

In [9]:
#keyword : 불러온 호주 키워드 중 번역 Column 추출
keyword = list(df_keyword["번역"])
keyword_embeddings = []

for ele in keyword:
    keyword_embeddings.append(model.encode(ele))

In [10]:
# keyword와 바이그램 유사도 비교

bigram_result = []
keyword_result = []
cosine_result = []
for index, bigram in enumerate(bigram_embeddings): #2번 반복(총 2개 문서)
    
    b_result = []
    k_result = []
    c_result = []
    
    for i in range(len(bigram)): # 20번 반복(top 20)
        
        for j in range(len(keyword_embeddings)): #102번 반복(keyword 개수)
            
            distances = cosine_similarity([bigram[i]],[keyword_embeddings[j]]) #유사도 비교
            
            if distances[0][0] > 0.4:
                
                b_result.append(bigram_keywords[index][i])
                k_result.append(keyword[j])
                c_result.append(str(round(float(distances),3)))
                
    bigram_result.append(b_result)
    keyword_result.append(k_result)
    cosine_result.append(c_result)

In [11]:
#df 생성
df_final = pd.DataFrame()

bigram_list = []
keyword_list = []
distance = []

for i in range(len(df_add)):
    B, K, D = [], [], []
    b_result,k_result, d_result = "", "", ""
    df_check = pd.DataFrame()
    df_check['bigram'] = pd.Series(bigram_result[i])
    df_check['keyword'] = pd.Series(keyword_result[i])
    df_check['distance'] = pd.Series(cosine_result[i])
    
    df_check = df_check.sort_values(by="distance", ascending=False)
    
    B = df_check['bigram'].tolist()
    K = df_check['keyword'].tolist()
    D = df_check['distance'].tolist()
    
    for b in B:
        b_result = b_result + "/" + b
    for k in K:
        k_result = k_result + "/" + k
    for d in D:
        d_result = d_result + "/" + d
        
        
    bigram_list.append(b_result[1:])
    keyword_list.append(k_result[1:])
    distance.append(d_result[1:])

    del df_check
    
df_final['bigram'] = bigram_list
df_final['keyword'] = keyword_list
df_final['distance'] = distance
    
df_final


Unnamed: 0,bigram,keyword,distance
0,duty goods/duty goods/duty goods,agricultural products/dairy products/cosmetics,0.447/0.41/0.403
1,vehicles australian/car tax/vehicles duty/vehi...,car/car/car/Automotive Parts/car/machinery/Aut...,0.592/0.495/0.493/0.488/0.483/0.435/0.428/0.42


In [12]:
# keyword와 바이그램 유사도 비교

# bigram_result = []
# keyword_result = []
# cosine_result = []
# for index, bigram in enumerate(bigram_embeddings): #2번 반복(총 2개 문서)
    
#     b_result = ""
#     k_result = ""
#     c_result = ""
    
#     for i in range(len(bigram)): # 20번 반복(top 20)
        
#         for j in range(len(keyword_embeddings)): #102번 반복(keyword 개수)
            
#             distances = cosine_similarity([bigram[i]],[keyword_embeddings[j]]) #유사도 비교
            
#             if distances[0][0] > 0.7:
                
#                 b_result = b_result + "/" + bigram_keywords[index][i]
#                 k_result = k_result + "/" + keyword[j]
#                 c_result = c_result + "/" + str(round(float(distances),3))
                
#     bigram_result.append(b_result[1:])
#     keyword_result.append(k_result[1:])
#     cosine_result.append(c_result[1:])

In [13]:
# df_final = pd.DataFrame()

# bigram_list = []
# keyword_list = []
# distance = []

# for i in range(len(df_add)):
#     df_check = pd.DataFrame()
#     df_check['bigram'] = pd.Series(bigram_result[i])
#     df_check['keyword'] = pd.Series(keyword_result[i])
#     df_check['distance'] = pd.Series(cosine_result[i])
    
#     df_check = df_check.sort_values(by="distance", ascending=False)
    
    
#     bigram_list.append(np.array(df_check['bigram'].tolist()))
#     keyword_list.append(np.array(df_check['keyword'].tolist()))
#     distance.append(np.array(df_check['distance'].tolist()))
    
    
#     del df_check
    
# df_final['bigram'] = bigram_list
# df_final['keyword'] = keyword_list
# df_final['distance'] = distance
    
# df_final
