# 작업환경 생성

In [7]:
# 라이브러리 로드, 데이터 로드, 간단 전처리(특수문자 제거)
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import  cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK's 'punkt' package.
nltk.download('punkt')

# Load the dataset (per the original note: already lemmatized, with '.', '!' and '?' kept).
df_train = pd.read_excel(
    "https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/5/lem_test2.xlsx"
)

# Report whether any NaN is present, drop every row containing one, then report again.
print(df_train.isnull().values.any())
df_train = df_train.dropna(axis=0, how='any')
print(df_train.isnull().values.any())

# Replace every non-letter character of the condition text with a single space.
non_letter = re.compile('[^a-zA-Z]')
df_train['condition_scdrop'] = df_train['condition'].apply(lambda text: non_letter.sub(' ', text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
False
False


# 데이터 확인

In [3]:
# Inspect the DataFrame.
df_train

Unnamed: 0.1,Unnamed: 0,drugName,condition,rating,usefulCount,date,review,condition_scdrop
0,0,Mirtazapine,depression,10,22,28-Feb-12,try antidepressant year citalopram fluoxetine ...,depression
1,1,Mesalamine,crohnsdisease maintenance,8,17,17-May-09,son crohn disease well asacol . complaint show...,crohnsdisease maintenance
2,2,Bactrim,urinarytractinfection,9,3,29-Sep-17,quick reduction symptom,urinarytractinfection
3,3,Contrave,weightloss,9,35,5-Mar-17,contrave combine drug use alcohol smoking opio...,weightloss
4,4,Cyclafem 1 / 35,birthcontrol,9,4,22-Oct-15,birthcontrol one cycle . reading review type s...,birthcontrol
...,...,...,...,...,...,...,...,...
53466,53761,Tamoxifen,breastcancer prevention,10,43,13-Sep-14,take tamoxifen year . side effect severe sweat...,breastcancer prevention
53467,53762,Escitalopram,anxiety,9,11,8-Oct-16,take lexapro escitaploprgram since february . ...,anxiety
53468,53763,Levonorgestrel,birthcontrol,8,7,15-Nov-10,married year old kid . take pill hassle decide...,birthcontrol
53469,53764,Tapentadol,pain,1,20,28-Nov-11,prescribed nucynta severe neck shoulder pain ....,pain


In [4]:
# Print two reviews (index labels 0 and 1) to check the text.
for row_label in (0, 1):
    print(df_train['review'][row_label])

try antidepressant year citalopram fluoxetine amitriptyline none help depression insomnia amp anxiety . doctor suggest change onto mg mirtazapine medicine save life . thankfully side effect especially common weight gain actually lose alot weight . still suicidal thought mirtazapine save .
son crohn disease well asacol . complaint show side effect . take many nine tablet per day one time . happy result reduce bout diarrhea drastically .


# 컨디션 딕셔너리 생성(공백 제거용)

ex)

| original(key) | modified(value) |
| ------------- | --------------- |
| left ventricular dysfunction | leftventriculardysfunction |
| adhd | adhd |
| birth control | birthcontrol |

In [5]:
# Build the condition lookup: original (spaced) name -> modified (space-free) name.
spaced_condition = pd.read_csv(
    'https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/spaced_condition.csv',
    index_col=0,
)

# Distinct values of each column, with every non-letter character replaced by a space.
ori_con = spaced_condition['original'].unique()
mod_con = spaced_condition['modified'].unique()

for i, name in enumerate(ori_con):
    ori_con[i] = re.sub('[^a-zA-Z]', ' ', name)

for i, name in enumerate(mod_con):
    mod_con[i] = re.sub('[^a-zA-Z]', ' ', name)

# Pair keys and values row by row. Pairing the two independently de-duplicated
# arrays by position goes wrong as soon as either column repeats a value: the
# arrays then differ in length or order, giving shifted pairs or an IndexError.
con_dict = {
    re.sub('[^a-zA-Z]', ' ', original): re.sub('[^a-zA-Z]', ' ', modified)
    for original, modified in zip(spaced_condition['original'], spaced_condition['modified'])
}

# 학습을 위한 토큰 생성 (문장단위)

In [26]:
# Parallel lists: reviews[k] is one sentence, conditions[k] is the condition of
# the row that sentence came from. They feed the word2vec training data.
conditions = []
reviews = []
# Iterate over the column values directly instead of df_train.loc[0..len-1]:
# label-based lookup raises KeyError when the index has gaps (for example after
# dropna() removed rows), and per-cell .loc access is slow.
for condition, review in zip(df_train['condition'], df_train['review']):
  # Split the review on sentence-ending punctuation and drop empty fragments.
  sentences = [part.strip() for part in re.split('[.!?]', str(review))]
  sentences = [part for part in sentences if part != '']
  for sentence in sentences:
    conditions.append(condition)
    reviews.append(sentence)

In [29]:
# Show the first five sentences.
reviews[:5]

['try antidepressant year citalopram fluoxetine amitriptyline none help depression insomnia amp anxiety',
 'doctor suggest change onto mg mirtazapine medicine save life',
 'thankfully side effect especially common weight gain actually lose alot weight',
 'still suicidal thought mirtazapine save',
 'son crohn disease well asacol']

In [34]:
# Tokenize every sentence into a list of words (each item is cast to str first).
tokenize_data = list(map(word_tokenize, map(str, reviews)))

In [36]:
# Check the tokens of the first four sentences.
for position in range(4):
    print(tokenize_data[position])

['try', 'antidepressant', 'year', 'citalopram', 'fluoxetine', 'amitriptyline', 'none', 'help', 'depression', 'insomnia', 'amp', 'anxiety']
['doctor', 'suggest', 'change', 'onto', 'mg', 'mirtazapine', 'medicine', 'save', 'life']
['thankfully', 'side', 'effect', 'especially', 'common', 'weight', 'gain', 'actually', 'lose', 'alot', 'weight']
['still', 'suicidal', 'thought', 'mirtazapine', 'save']


# word2vec 모델 학습

In [38]:
## Train the model
"""
하이퍼 파라미터
size : 300
window : 1
min_count : 5
workers : 4
method : CBOW
"""
# The string above lists the hyperparameters used below; sg=0 is the CBOW setting.
# gensim 4.0 renamed the embedding-dimension keyword from `size` to `vector_size`,
# and passing the old name raises TypeError there. Pick the keyword that matches
# the installed major version so the cell runs on both gensim 3.x and 4.x.
import gensim

_dim_keyword = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = Word2Vec(
    sentences=tokenize_data,
    window=1,
    min_count=5,
    workers=4,
    sg=0,
    **{_dim_keyword: 300},
)

In [41]:
# Save the trained model under this file name.
model.save("sentenceunit_word2vec.model")

# 모델 테스트

In [39]:
# Test the trained word2vec model.
weight_table = model.wv.vectors  # the model's weight table
print(type(weight_table))

# Look up the vector for 'side' once, then print it and its shape.
side_vector = model.wv.get_vector('side')
print(side_vector)
print(side_vector.shape)

<class 'numpy.ndarray'>
[-0.24337502  0.24363759 -0.62050337  0.41842476 -0.18864296  0.05832148
 -0.06537849 -0.40217784  0.27395988 -0.59621465  0.59413    -0.33019018
  0.21194983  0.3297442   0.26731834  0.5171175   0.0330053  -0.07013391
 -0.11489046 -0.21973899 -0.19822662  0.5882281   0.708226    0.00567076
  0.840377    0.35381705 -0.47228533 -0.17680347 -0.0681436   0.15785968
 -0.02721927 -0.06609593 -0.02157627  0.26744103 -0.21293879  0.3938433
 -0.22570412  0.08792445  0.13458575 -0.16374373 -0.5448308  -0.7370706
 -0.11973938  0.22129293  0.4719627  -0.26281756 -0.17542244 -0.3515445
  0.0535651  -0.5179774  -0.30915862  0.16798405  0.04894041  0.09144904
  0.47989917  0.4735117   0.24267711  0.15603589  0.2891011   0.27979696
  0.51095015  0.42722246  0.9438565  -0.5384825   0.3521773   0.15593289
  0.03171682 -1.0003289   0.6993823  -0.6974573   0.02633422 -0.013568
  0.6026725   0.28296342 -0.3633771   0.37176648  0.10274269 -0.2434606
  0.46594778 -1.1135343   0.19968

In [40]:
# Count the distinct cleaned condition labels.
distinct_conditions = df_train['condition_scdrop'].unique()
print(len(distinct_conditions))

666
