In [2]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.2.tar.gz (29 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 0.0/86.0 kB ? eta -:--:--
     ---------------------------------------- 86.0/86.0 kB 4.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting rich>=10.4.0 (from keybert)
  Obtaining dependency information for rich>=10.4.0 from https://files.pythonhosted.org/packages/be/2a/4e62ff633612f746f88618852a626bbe24226eba5e7ac90e91dcfd6a414e/rich-13.6.0-py3-none-any.whl.metadata
  Downloading rich-13.6.0-py3-none-any.whl.metadata (18 kB)
Collecting sentencepiece (from sentence-transformers>=0.3.8->keybert)
  Downloading sentencepiece-0.1.99-cp38-cp38-win_amd64.whl (977 kB)
     --------------------------

In [26]:
import os
from keybert import KeyBERT
import pandas as pd
import re

## 모델 정의

In [7]:
# Location of the RSS news dump
csv_file_path = './data/RSS_total.csv'

# Load the dump into a pandas DataFrame.
# NOTE(review): 'ANSI' resolves to the platform code page and is presumably only
# available on Windows — confirm before running this notebook elsewhere.
df = pd.read_csv(filepath_or_buffer=csv_file_path, encoding='ANSI')

# Instantiate KeyBERT on top of the named embedding checkpoint
model = KeyBERT(model='distilbert-base-nli-mean-tokens')

## 모델 적용 함수 정의

In [63]:
# Keyword extraction helper
def extract_keywords(text, num_keywords=5):
    """Return up to ``num_keywords`` single-word keywords for ``text``.

    The module-level KeyBERT ``model`` is queried with unigram candidates,
    English stop-word removal and max-sum diversification.

    Parameters
    ----------
    text : str
        Document to extract keywords from. Anything that is not a non-blank
        string (e.g. a NaN cell coming from pandas) yields an empty list
        instead of being passed to the model.
    num_keywords : int, default 5
        Maximum number of keywords to return. Non-positive values yield an
        empty list.

    Returns
    -------
    list of str
        The keywords only; the scores reported by the model are dropped.
    """
    if not isinstance(text, str) or not text.strip() or num_keywords <= 0:
        return []
    keywords = model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
        use_maxsum=True,
        # The candidate pool must never be smaller than the number requested.
        nr_candidates=max(20, num_keywords),
        # Forward the requested size: without it the model falls back to its own
        # default and asking for more keywords than that default has no effect.
        top_n=num_keywords,
    )
    return [keyword[0] for keyword in keywords[:num_keywords]]

## 범위 바꿔가며 데이터 한 번 확인하기

In [41]:
# Peek at a handful of summaries; adjust the slice to inspect other rows
df.news_smy_ifo[101:106]

101    The following are today's upgrades for Validea...
102    (RTTNews) - The following are some of the stoc...
103    (RTTNews) - The following are some of the stoc...
104    --LifeMD, Inc., a leading direct-to-patient te...
105    Know Labs, Inc., an emerging developer of non-...
Name: news_smy_ifo, dtype: object

## 추출 결과 확인

In [42]:
# Extract keywords for a few sample rows and display them
sample_rows = df.news_smy_ifo[101:106]
result = sample_rows.apply(extract_keywords)
result

101       [valley, small, upgrades, investor, bancorp]
102             [50, big, trading, rttnews, wednesday]
103             [50, big, trading, rttnews, wednesday]
104    [b2b, today, companies, telehealth, healthcare]
105      [developer, new, glucose, hospital, diabetes]
Name: news_smy_ifo, dtype: object

## 데이터 정제하기

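숫자 데이터와 신문사 데이터(예: `(RTTNews)`처럼 괄호 안에 들어 있는 표기)는 노이즈로 작용할 확률이 높다고 판단해 제거하기로 결정함. 단, 아래 정규식은 괄호로 묶인 내용을 모두 지우므로 `(Symbol: DADA)` 같은 종목 코드도 함께 제거된다.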

In [85]:
# Removal experiment on a single row: drop parenthesised text and digit runs
original_string = df.news_smy_ifo[10000]
print(original_string)
modified_string = re.sub(pattern=r'\([^)]*\)|\d+', repl='', string=original_string)
print('------------', modified_string, sep='\n')

In trading on Wednesday, shares of Dada Nexus Ltd (Symbol: DADA) crossed below their 200 day moving average of $7.59, changing hands as low as $7.52 per share. Dada Nexus Ltd shares are currently trading off about 4.5% on the day. The chart below shows the one year performanc
------------
In trading on Wednesday, shares of Dada Nexus Ltd  crossed below their  day moving average of $., changing hands as low as $. per share. Dada Nexus Ltd shares are currently trading off about .% on the day. The chart below shows the one year performanc


In [None]:
# Apply the removal to the whole column in one vectorized pass.
# This replaces a row-by-row loop that wrote through chained indexing
# (`df.news_smy_ifo[i] = ...`), which is slow and does not reliably write back
# to `df`. Missing values are left as missing instead of raising inside `re.sub`.
df['news_smy_ifo'] = df['news_smy_ifo'].str.replace(r'\([^)]*\)|\d+', '', regex=True)

In [91]:
# Spot-check one cleaned row
df['news_smy_ifo'][102]

" - The following are some of the stocks making big moves in Wednesday's pre-market trading ."

In [92]:
# 결과를 파일로 저장 (최초 한 번만)
#df.to_csv('./data/RSS_for_keybert.csv', index=False, encoding='ANSI')

## 키워드 추출 (뒤에서부터)

In [130]:
# Number of complete 1000-row batches in the frame
df.shape[0] // 1000

146

In [None]:
# Extract keywords for every row, working backwards from the end of the frame in
# fixed-size batches. Batch boundaries are derived from len(df) rather than a
# hard-coded batch count, and results are collected in a plain list so nothing
# is written through chained indexing.
BATCH_SIZE = 1000
texts = df['news_smy_ifo']
extracted = ['nan'] * len(df)  # 'nan' marks rows that have not been processed yet
try:
    for stop in range(len(df), 0, -BATCH_SIZE):
        start = max(stop - BATCH_SIZE, 0)  # the final (front-most) batch may be short
        extracted[start:stop] = [extract_keywords(text) for text in texts.iloc[start:stop]]
        print(len(df) - start, 'complete')
finally:
    # Built even if the loop is interrupted: finished batches are kept and
    # unfinished rows still carry the 'nan' sentinel.
    keys = pd.DataFrame({'result': extracted})

In [161]:
# Re-run the extractor row by row for the leading rows that do not fill a whole
# 1000-row batch. The count comes from len(df) instead of a hard-coded batch
# number, and `.at` writes straight into `keys` rather than through chained
# indexing.
for i in range(len(df) % 1000):
    keys.at[i, 'result'] = extract_keywords(df.at[i, 'news_smy_ifo'])

## 결과 확인 & 오류 검사

In [182]:
# Spot-check a few extracted keyword lists
for row in (999, 10000, 110000):
    print(keys['result'][row])

['stonex', 'vrtv', 'stocks', 'week', 'january']
['performanc', 'trading', 'dada', 'low', 'wednesday']
['research', 'zacks', 'global', 'mastercard', 'august']


In [189]:
# First rows of the keyword table
keys.head(n=5)

Unnamed: 0,result
0,"[natural, group, disclosing, lp, gas]"
1,"[light, analysts, weeks, stock, beaten]"
2,"[flying, yellow, disconnect, investors, oil]"
3,"[french, agribusiness, oldest, monday, beer]"
4,"[flying, yellow, disconnect, investors, oil]"


In [162]:
# The keyword table must have exactly one row per source row
keys.shape[0] == df.shape[0]

True

In [188]:
# Count missing values per column
keys.isna().sum()

result    0
dtype: int64

In [184]:
# Look for rows still holding the 'nan' placeholder, i.e. rows the extraction never reached
unprocessed = [row for row in range(len(keys)) if keys['result'][row] == 'nan']
for row in unprocessed:
    print(row)
count = len(unprocessed)
print(count)

0


In [193]:
# Cross-check alignment: every keyword must occur in the lower-cased text of its own row
count = 0
for row in range(len(keys)):
    absent = [kw for kw in keys['result'][row] if kw not in df['news_smy_ifo'][row].lower()]
    for _ in absent:
        print(row)
    count += len(absent)
print('-----------------------')
print('total :', count)

-----------------------
total : 0


## 결과 저장

In [195]:
# Persist the keywords as CSV.
# Each keyword list is written out as its string representation, so reading the
# file back yields strings rather than lists.
keys.to_csv(path_or_buf='./data/keybert_result.csv', index=False, encoding='ANSI')

In [196]:
# Read the saved CSV back to verify the round trip
key_df = pd.read_csv(filepath_or_buffer='./data/keybert_result.csv', encoding='ANSI')

In [197]:
# Count missing values in the reloaded CSV frame
key_df.isna().sum()

result    0
dtype: int64

In [198]:
# Persist the keyword table as a pickle as well
import pickle

with open('./data/keybert_result.pickle', mode='wb') as out_handle:
    pickle.dump(keys, out_handle)

In [199]:
# Load the pickle back and validate the frame that was just loaded. The check
# previously ran on `key_df` (the CSV round trip), so the pickle itself was
# never inspected.
with open('./data/keybert_result.pickle', 'rb') as file:
    keys_df = pickle.load(file)

keys_df.isnull().sum()

result    0
dtype: int64

In [201]:
# First rows of the CSV round-trip frame.
# NOTE(review): this shows key_df (loaded from CSV), not keys_df (loaded from
# the pickle) — confirm which one was meant to be displayed here.
key_df.head(n=5)

Unnamed: 0,result
0,"['natural', 'group', 'disclosing', 'lp', 'gas']"
1,"['light', 'analysts', 'weeks', 'stock', 'beaten']"
2,"['flying', 'yellow', 'disconnect', 'investors'..."
3,"['french', 'agribusiness', 'oldest', 'monday',..."
4,"['flying', 'yellow', 'disconnect', 'investors'..."
