In [1]:
# !pip install gensim



In [8]:
import gzip
import os
import pickle
import shutil
import warnings

import pandas as pd
import requests
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model

warnings.filterwarnings('ignore')

## 학습 완료된 FastText 모델 다운로드
(최초 1회)

- Facebook에서 개발한 단어 임베딩 모델
- 단어를 문자 n-gram(subword) 단위로 표현하므로 OOV(Out of Vocabulary) 단어 처리에 강점이 있으며, 다양한 키워드를 다루기에 적합

In [3]:
# One-time download of the pretrained FastText archive.
# The cell is guarded by an existence check instead of being disabled inside a
# string literal, so "Restart & Run All" works on a fresh machine and is a
# no-op once the files are present.

# URL of the file to download
file_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"

# Local path of the downloaded archive, and of the model extracted from it
local_file_path = 'D:/playdata_mini_proj/cc.en.300.bin.gz'
local_bin_file_path = 'D:/playdata_mini_proj/cc.en.300.bin'

if os.path.exists(local_file_path) or os.path.exists(local_bin_file_path):
    # Either the archive or the already-extracted model is on disk.
    print("모델 파일이 이미 존재하여 다운로드를 건너뜁니다.")
else:
    # Download into a temporary name first so that an interrupted transfer
    # never leaves a truncated file that would pass the existence check above.
    partial_file_path = local_file_path + '.part'

    # stream=True avoids holding the whole multi-gigabyte body in memory
    # (which `response.content` would do); timeout guards against a hung socket.
    with requests.get(file_url, stream=True, timeout=60) as response:
        # Check the HTTP status code
        if response.status_code == 200:
            # Write the body to disk in binary mode, one chunk at a time
            with open(partial_file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
            # Promote the finished download to its final name
            os.replace(partial_file_path, local_file_path)
            print(f"{local_file_path} 파일 다운로드 완료.")
        else:
            print("파일 다운로드 실패. HTTP 상태 코드:", response.status_code)

D:/playdata_mini_proj/cc.en.300.bin.gz 파일 다운로드 완료.


## 압축 풀기
(최초 1회)

In [6]:
# One-time extraction of the downloaded .gz archive into the .bin model file.
# Guarded by an existence check (rather than disabled inside a string literal)
# so the notebook can be re-run end to end without repeating the work.
local_file_path = 'D:/playdata_mini_proj/cc.en.300.bin.gz'
local_bin_file_path = 'D:/playdata_mini_proj/cc.en.300.bin'

if not os.path.exists(local_bin_file_path):
    # Extract into a temporary name and rename at the end, so an interrupted
    # extraction never leaves a truncated .bin behind.
    # gzip.open raises FileNotFoundError here if the archive is missing.
    partial_bin_file_path = local_bin_file_path + '.part'
    with gzip.open(local_file_path, 'rb') as gz_file, open(partial_bin_file_path, 'wb') as bin_file:
        shutil.copyfileobj(gz_file, bin_file)
    os.replace(partial_bin_file_path, local_bin_file_path)

## 모델 로드

In [7]:
# Load the pretrained FastText model from Facebook's native .bin format.
# `FastText.load_fasttext_format` is deprecated and was removed in gensim 4.0;
# `load_facebook_model` is its replacement and likewise returns a FastText
# model whose word vectors are reached through `model.wv`.
model = load_facebook_model('D:/playdata_mini_proj/cc.en.300.bin')

## 데이터 끌어오기

In [12]:
# Read the pickled keyword table from disk and preview its first rows.
keybert_result_path = 'D:/playdata_mini_proj/data/keybert_result.pickle'
with open(keybert_result_path, 'rb') as pickle_handle:
    keys = pickle.load(pickle_handle)

keys.head()

Unnamed: 0,result
0,"[natural, group, disclosing, lp, gas]"
1,"[light, analysts, weeks, stock, beaten]"
2,"[flying, yellow, disconnect, investors, oil]"
3,"[french, agribusiness, oldest, monday, beer]"
4,"[flying, yellow, disconnect, investors, oil]"


## 벡터화 해보기

In [13]:

# Keywords stored in the first row of the 'result' column
keywords = keys['result'][0]

# Look up one embedding per keyword, in order
keyword_vectors = []
for keyword in keywords:
    keyword_vectors.append(model.wv[keyword])

# Show each keyword next to its vector
for position, keyword in enumerate(keywords):
    vector = keyword_vectors[position]
    print(f"Vector for '{keyword}': {vector}")


Vector for 'natural': [ 0.02236355 -0.02891106 -0.04033679  0.07239055  0.00839372 -0.07997381
  0.09779778 -0.01371179  0.01030005  0.0070464  -0.05977215  0.02826316
  0.0029789  -0.03155434  0.07967968 -0.03252393 -0.0120744  -0.03059066
  0.06023479 -0.03733392  0.04818533 -0.02058554  0.00981667  0.00839132
  0.03246382  0.01092318  0.03456066 -0.05040621  0.05949245  0.04038371
 -0.05808469 -0.01691647  0.00753197  0.03550281  0.05596606 -0.08521663
  0.03473034 -0.03804912  0.03879257 -0.06000923  0.01076734 -0.01101736
  0.01160554 -0.02726951 -0.08006509 -0.0093736   0.02478546 -0.03101899
 -0.04492284  0.00325101  0.0086383  -0.02355115 -0.00241463  0.00528206
 -0.04696236  0.05687326  0.04173246 -0.03633492 -0.05853149 -0.04778981
  0.00298394 -0.01527326 -0.0256282  -0.03286073 -0.02380301  0.05837882
 -0.01671284 -0.04694207  0.0085839   0.01410309 -0.01048839 -0.02087606
 -0.02474408  0.02670529 -0.0579764  -0.01008738 -0.03090805  0.01466792
 -0.00512746  0.03950053  0.0

In [20]:
# Number of vectors produced for the first row (one per keyword)
len(keyword_vectors)

5

## 벡터화 함수 정의 & 적용해보기

In [32]:
def vectorization(words):
    """Return the embedding of every word in ``words``, in order.

    Each word is looked up in the module-level ``model.wv`` mapping.

    Args:
        words: Iterable of words to look up.

    Returns:
        A list holding one ``model.wv[word]`` result per input word.
    """
    embeddings = []
    for word in words:
        embeddings.append(model.wv[word])
    return embeddings

# Try the helper on a small slice of rows before running it on everything.
result_column = keys['result']
ex = result_column[1:5].apply(vectorization)
print(ex)

# Count the vectors stored for the row labelled 1
vectors_in_row_1 = ex[1]
print(len(vectors_in_row_1))

1    [[-0.116084486, -0.016968077, 0.03961698, 0.08...
2    [[0.013123234, 0.0094847735, 0.019651216, 0.10...
3    [[-0.036596924, -0.054901063, -0.06675389, 0.0...
4    [[0.013123234, 0.0094847735, 0.019651216, 0.10...
Name: result, dtype: object
5


## 벡터화하기

In [39]:
# DataFrame that will hold the vectorization results: one row per row of
# `keys`, pre-filled with the placeholder string 'nan'.
# The dtype is pinned to `object` because the column is later overwritten with
# lists of vectors; leaving it to inference yields a string dtype on pandas
# versions that infer strings by default, and such a column cannot hold lists.
vectors = pd.DataFrame({'result': ['nan'] * len(keys)}, dtype=object)
vectors.head()

Unnamed: 0,result
0,
1,
2,
3,
4,


In [40]:
# Vectorize the keyword lists one full chunk at a time, printing progress after
# each chunk. Rows past the last full chunk are not handled by this loop.
CHUNK_SIZE = 1000
result_col = vectors.columns.get_loc('result')
for i in range(len(keys) // CHUNK_SIZE):
    start, stop = i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE
    words_list = keys['result'][start:stop]
    # Write through a single .iloc call on the frame itself. The previous
    # chained form `vectors['result'][a:b] = ...` assigns into an intermediate
    # object: it raises SettingWithCopyWarning and, with pandas Copy-on-Write,
    # leaves `vectors` unchanged. `.to_numpy()` keeps the write positional.
    vectors.iloc[start:stop, result_col] = words_list.apply(vectorization).to_numpy()
    print(stop, 'complete')

1000 complete
2000 complete
3000 complete
4000 complete
5000 complete
6000 complete
7000 complete
8000 complete
9000 complete
10000 complete
11000 complete
12000 complete
13000 complete
14000 complete
15000 complete
16000 complete
17000 complete
18000 complete
19000 complete
20000 complete
21000 complete
22000 complete
23000 complete
24000 complete
25000 complete
26000 complete
27000 complete
28000 complete
29000 complete
30000 complete
31000 complete
32000 complete
33000 complete
34000 complete
35000 complete
36000 complete
37000 complete
38000 complete
39000 complete
40000 complete
41000 complete
42000 complete
43000 complete
44000 complete
45000 complete
46000 complete
47000 complete
48000 complete
49000 complete
50000 complete
51000 complete
52000 complete
53000 complete
54000 complete
55000 complete
56000 complete
57000 complete
58000 complete
59000 complete
60000 complete
61000 complete
62000 complete
63000 complete
64000 complete
65000 complete
66000 complete
67000 complete
6800

In [41]:
# Vectorize the rows left over after the last full chunk of 1000, then save
# the complete result to disk.
remainder_start = len(keys) // 1000 * 1000
if remainder_start < len(keys):
    # Single .iloc write on the frame; the previous chained form
    # `vectors['result'][a:] = ...` assigns into an intermediate object and is
    # not guaranteed to update `vectors` (a no-op under pandas Copy-on-Write).
    result_col = vectors.columns.get_loc('result')
    vectors.iloc[remainder_start:, result_col] = (
        keys['result'][remainder_start:].apply(vectorization).to_numpy()
    )

with open('D:/playdata_mini_proj/data/vectors.pickle', 'wb') as file:
    pickle.dump(vectors, file)