In [1]:
import pandas as pd
import re

#df : 스크래핑한 미국 문서
df = pd.read_csv("america_2.csv")

#df_keyword : 선정한 미국 키워드 101개
df_keyword = pd.read_csv("미국_번역_100개.csv", index_col = False)

In [2]:
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# SentenceTransformer : 모델 훈련 위한 라이브러리
# 'distiluse-base-multilingual-cased-v1' : 사용할 모델
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [17]:
#keyword : 불러온 미국 키워드 중 번역 Column 추출
keyword = list(df_keyword["번역"])
print(len(keyword))

101


In [20]:
#키워드 벡터화
keyword_embedding = []

for ele in keyword:
    keyword_embedding.append(model.encode(ele))

print(len(keyword_embedding))

101


In [62]:
#text : 테스트를 위한 랜덤 문서 => df['first_page_text'][0]
text = '''
Bureau of Customs andBorder Protection
General Notices
[CBP Decision 03–01]C
USTOMS ACCREDITATION OF BSI I NSPECTORATE AMERICA
CORPORATION AS A COMMERCIAL LABORATORY
AGENCY: Customs and Border Protection, Department of Home-land SecurityACTION: Notice of Accreditation of BSI Inspectorate America Cor-poration of Garden City, Georgia, as a Commercial Laboratory.SUMMARY: BSI Inspectorate America Corporation of Garden City,Georgia has applied to Customs and Border Protection under Part151.12 of the Customs Regulations for an extension of accreditationas a commercial laboratory to analyze petroleum products underChapter 27 and Chapter 29 of the Harmonized Tariff Schedule of theUnited States (HTSUS). Customs has determined that this companymeets all of the requirements for accreditation as a commercial labo-ratory. Specifically, BSI Inspectorate America Corporation has beengranted accreditation to perform the following test methods at theirGarden City, Georgia site: (1) Distillation of Petroleum Products,ASTM D86; (2) Water in Petroleum Products and Bituminous Mate-rials by Distillation, ASTM D95; (3) API Gravity by Hydrometer,ASTM D287; (4) Kinematic Viscosity of Transparent and OpaqueLiquids, ASTM D445; (5) Sediment in Crude Oils and Fuel Oils byExtraction, ASTM D473; (6) Density, Relative Density (SpecificGravity), or API Gravity of Crude Petroleum and Liquid PetroleumProducts by Hydrometer Method, ASTM D1298; (7) Water and Sedi-ment in Fuel Oils by the Centrifuge Method, ASTM D1796; (8) Waterand Sediment in Middle Distillate Fuels by Centrifuge, ASTMD2709; (9) Water in Crude Oil by Distillation, ASTM D4006; (10)Percent by Weight of Sulfur by Energy-Dispersive X-Ray Fluores-cence, ASTM D4294; (11) Water in Crude Oils by Coulometric KarlFischer Titration, ASTM D4928; and (12) Vapor Pressure of Petro-leum Products, ASTM D5191. Therefore, in accordance with Part151.12 of the Customs Regulations, BSI Inspectorate America Corpo-ration of Garden City, Georgia is hereby accredited to analyze theproducts named above. 1
'''

In [None]:
#text 전처리 과정

In [63]:
#1. text 구두점 제거 + strip()
import string

text= "".join([i for i in text if i not in string.punctuation]).strip()

print(len(text)) #전처리 후 길이 변화 확인 목적
print(text)

1958
Bureau of Customs andBorder Protection
General Notices
CBP Decision 03–01C
USTOMS ACCREDITATION OF BSI I NSPECTORATE AMERICA
CORPORATION AS A COMMERCIAL LABORATORY
AGENCY Customs and Border Protection Department of Homeland SecurityACTION Notice of Accreditation of BSI Inspectorate America Corporation of Garden City Georgia as a Commercial LaboratorySUMMARY BSI Inspectorate America Corporation of Garden CityGeorgia has applied to Customs and Border Protection under Part15112 of the Customs Regulations for an extension of accreditationas a commercial laboratory to analyze petroleum products underChapter 27 and Chapter 29 of the Harmonized Tariff Schedule of theUnited States HTSUS Customs has determined that this companymeets all of the requirements for accreditation as a commercial laboratory Specifically BSI Inspectorate America Corporation has beengranted accreditation to perform the following test methods at theirGarden City Georgia site 1 Distillation of Petroleum ProductsASTM 

In [64]:
#2. text를 단어로 토큰화
from nltk import word_tokenize

words = word_tokenize(text)

print("단어로 토큰화 후 : ", len(words))

단어로 토큰화 후 :  280


In [65]:
#3. stopword 제거
import nltk
from nltk.corpus import stopwords 

stop_words = set(stopwords.words('english'))
result = []
for word in words:
    if word not in stop_words:
        result.append(word)
        
print("불용어 제거 후 : ", len(result))

불용어 제거 후 :  207


In [66]:
#4. 단어들 합치기
sentence = ''
for i in range(len(result)):
    sentence = sentence + " " + result[i]

print(len(sentence))#전처리 전 길이 변화 비교 목적 : 1135 -> 1018
print(sentence)

1710
 Bureau Customs andBorder Protection General Notices CBP Decision 03–01C USTOMS ACCREDITATION OF BSI I NSPECTORATE AMERICA CORPORATION AS A COMMERCIAL LABORATORY AGENCY Customs Border Protection Department Homeland SecurityACTION Notice Accreditation BSI Inspectorate America Corporation Garden City Georgia Commercial LaboratorySUMMARY BSI Inspectorate America Corporation Garden CityGeorgia applied Customs Border Protection Part15112 Customs Regulations extension accreditationas commercial laboratory analyze petroleum products underChapter 27 Chapter 29 Harmonized Tariff Schedule theUnited States HTSUS Customs determined companymeets requirements accreditation commercial laboratory Specifically BSI Inspectorate America Corporation beengranted accreditation perform following test methods theirGarden City Georgia site 1 Distillation Petroleum ProductsASTM D86 2 Water Petroleum Products Bituminous Materials Distillation ASTM D95 3 API Gravity HydrometerASTM D287 4 Kinematic Viscosity 

In [67]:
#5. 전처리된 문장 bi-gram으로 추출
from nltk import ngrams

grams = []
bigrams = ngrams(sentence.split(), 2)

cnt = 0
for gram in bigrams:
    grams.append(gram)
    cnt += 1
    
print(len(grams))
print(grams)

206
[('Bureau', 'Customs'), ('Customs', 'andBorder'), ('andBorder', 'Protection'), ('Protection', 'General'), ('General', 'Notices'), ('Notices', 'CBP'), ('CBP', 'Decision'), ('Decision', '03–01C'), ('03–01C', 'USTOMS'), ('USTOMS', 'ACCREDITATION'), ('ACCREDITATION', 'OF'), ('OF', 'BSI'), ('BSI', 'I'), ('I', 'NSPECTORATE'), ('NSPECTORATE', 'AMERICA'), ('AMERICA', 'CORPORATION'), ('CORPORATION', 'AS'), ('AS', 'A'), ('A', 'COMMERCIAL'), ('COMMERCIAL', 'LABORATORY'), ('LABORATORY', 'AGENCY'), ('AGENCY', 'Customs'), ('Customs', 'Border'), ('Border', 'Protection'), ('Protection', 'Department'), ('Department', 'Homeland'), ('Homeland', 'SecurityACTION'), ('SecurityACTION', 'Notice'), ('Notice', 'Accreditation'), ('Accreditation', 'BSI'), ('BSI', 'Inspectorate'), ('Inspectorate', 'America'), ('America', 'Corporation'), ('Corporation', 'Garden'), ('Garden', 'City'), ('City', 'Georgia'), ('Georgia', 'Commercial'), ('Commercial', 'LaboratorySUMMARY'), ('LaboratorySUMMARY', 'BSI'), ('BSI', 'Inspe

In [68]:
#6. bi-gram 단어 embedding하기
text_embedding = model.encode([grams])

In [None]:
# keyword_embedding과 text_embedding의 유사도 검사

In [71]:
final_result = []
for i in range(len(text_embedding)):
    for j in range(len(keyword_embedding)):
        distances = cosine_similarity([text_embedding[i]], [keyword_embedding[j]])
        if distances[0][0]>0.1:
            final_result.append(result[i])
# distances = cosine_similarity([doc_embedding[0]], candidate_embeddings)
print(set(final_result))

{'Bureau'}


In [7]:
# from tensorflow.keras.preprocessing.text import text_to_word_sequence

# word_tokens = text_to_word_sequence(text)
# word_tokens = [x for x in word_tokens if len(x)>2]

In [8]:
# from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')
# stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\water\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# result = []
# for w in word_tokens: 
#     if w not in stop_words: 
#         result.append(w)

In [11]:
# doc_embedding = model.encode(result)

In [16]:
final_result = []
for i in range(len(doc_embedding)):
    for j in range(len(keyword_embedding)):
        distances = cosine_similarity([doc_embedding[i]], [keyword_embedding[j]])
        if distances[0][0]>0.8:
            final_result.append(result[i])
# distances = cosine_similarity([doc_embedding[0]], candidate_embeddings)
print(set(final_result))

set()
