# 작업환경 생성

In [33]:
# 라이브러리 로드, 데이터 로드, 간단 전처리(특수문자 제거)
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import  cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK's 'punkt' resource (this file imports word_tokenize / sent_tokenize).
nltk.download('punkt')

# Load the review dataset. Per the original author's note, the reviews are
# already lemmatized and have all special characters removed.
# NOTE(review): the file is named lem_test.csv but is bound to `df_train` --
# confirm this is the intended split.
df_train = pd.read_csv(
    "https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/3/lem_test.csv"
)

# Report whether any NaN is present, drop every row containing one, then
# report again to confirm nothing is left.
print(df_train.isnull().values.any())
df_train = df_train.dropna(how='any')
print(df_train.isnull().values.any())

# Replace every non-letter character of `condition` with a space and store
# the result in a new column.
df_train['condition_scdrop'] = df_train['condition'].map(
    lambda cond: re.sub('[^a-zA-Z]', ' ', cond)
)

True
False


# 데이터 확인

In [34]:
# Inspect the DataFrame (in a notebook, a trailing bare expression is rendered as the cell output).
df_train

Unnamed: 0.1,Unnamed: 0,level_0,uniqueID,drugName,condition,review,rating,date,usefulCount,condition_scdrop
0,0,0,163740,Mirtazapine,Depression,try antidepressant year citalopram fluoxetine ...,10,2012-02-28,22,Depression
1,1,1,206473,Mesalamine,"Crohn's Disease, Maintenance",son crohn 's disease do well asacol complaint ...,8,2009-05-17,17,Crohn s Disease Maintenance
2,2,2,159672,Bactrim,Urinary Tract Infection,quick reduction symptom,9,2017-09-29,3,Urinary Tract Infection
3,3,3,39293,Contrave,Weight Loss,contrave combine drug use alcohol smoking opio...,9,2017-03-05,35,Weight Loss
4,4,4,97768,Cyclafem 1 / 35,Birth Control,birth control one cycle reading review type si...,9,2015-10-22,4,Birth Control
...,...,...,...,...,...,...,...,...,...,...
53195,53195,53466,159999,Tamoxifen,"Breast Cancer, Prevention",take tamoxifen year side effect severe sweat d...,10,2014-09-13,43,Breast Cancer Prevention
53196,53196,53467,140714,Escitalopram,Anxiety,take lexapro escitaploprgram since february fi...,9,2016-10-08,11,Anxiety
53197,53197,53468,130945,Levonorgestrel,Birth Control,married year old kid take pill hassle decide g...,8,2010-11-15,7,Birth Control
53198,53198,53469,47656,Tapentadol,Pain,prescribed nucynta severe neckshoulder pain ta...,1,2011-11-28,20,Pain


In [48]:
# Show the first two reviews.
# Positional access (.iloc) is used rather than df_train['review'][0]: rows
# containing NaN were removed with dropna(), so the integer index labels are
# not guaranteed to be contiguous and a label lookup of 0 or 1 could raise
# KeyError. Position 0/1 also matches tokenize_data[0]/[1], which is built by
# iterating the column in order.
print(df_train['review'].iloc[0])
print(df_train['review'].iloc[1])

try antidepressant year citalopram fluoxetine amitriptyline none help depression insomnia anxiety doctor suggest change onto mg mirtazapine medicine save life thankfully side effect especially common weight gain actually lose alot weight still suicidal thought mirtazapine save
son crohn 's disease do well asacol complaint show side effect take many nine tablet per day one time happy result reduce bout diarrhea drastically


# 컨디션 딕셔너리 생성(공백 제거용)

ex)

| original(key) | modified(value) |
| ------------- | --------------- |
| left ventricular dysfunction | leftventriculardysfunction |
| adhd | adhd |
| birth control | birthcontrol |

In [36]:
# Build con_dict: condition name (key) -> its space-free form (value), with
# every non-letter character in both replaced by a space.
spaced_condition = pd.read_csv(
    'https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/spaced_condition.csv',
    index_col=0,
)

# Distinct values of each column, cleaned in place. These arrays are kept
# under their original names in case other cells use them, but they are NOT
# row-aligned with each other (each column is de-duplicated independently).
ori_con = spaced_condition['original'].unique()
mod_con = spaced_condition['modified'].unique()

for i, name in enumerate(ori_con):
    ori_con[i] = re.sub('[^a-zA-Z]', ' ', name)

for i, name in enumerate(mod_con):
    mod_con[i] = re.sub('[^a-zA-Z]', ' ', name)

# Pair original/modified from the SAME row. Pairing ori_con[i] with mod_con[i]
# is only correct when both columns have no repeated values: as soon as two
# originals share one modified value, unique() shortens mod_con and every
# later key is mapped to the wrong value (and trailing keys are dropped).
con_dict = {
    re.sub('[^a-zA-Z]', ' ', original): re.sub('[^a-zA-Z]', ' ', modified)
    for original, modified in zip(
        spaced_condition['original'], spaced_condition['modified']
    )
}

# 학습을 위한 토큰 생성 (리뷰단위)

In [37]:
# Tokenize every review into words: one token list per review, in column order.
tokenize_data = list(map(word_tokenize, df_train['review']))

In [38]:
# Inspect the token lists of the first two reviews.
for review_idx in (0, 1):
    print(tokenize_data[review_idx])

['try', 'antidepressant', 'year', 'citalopram', 'fluoxetine', 'amitriptyline', 'none', 'help', 'depression', 'insomnia', 'anxiety', 'doctor', 'suggest', 'change', 'onto', 'mg', 'mirtazapine', 'medicine', 'save', 'life', 'thankfully', 'side', 'effect', 'especially', 'common', 'weight', 'gain', 'actually', 'lose', 'alot', 'weight', 'still', 'suicidal', 'thought', 'mirtazapine', 'save']
['son', 'crohn', "'s", 'disease', 'do', 'well', 'asacol', 'complaint', 'show', 'side', 'effect', 'take', 'many', 'nine', 'tablet', 'per', 'day', 'one', 'time', 'happy', 'result', 'reduce', 'bout', 'diarrhea', 'drastically']


# word2vec 모델 학습

In [39]:
## Train the Word2Vec model on the tokenized reviews.
#
# Hyperparameters
#   vector size : 300
#   window      : 5
#   min_count   : 5
#   workers     : 4
#   sg          : 0  (CBOW; 1 would select skip-gram)
import gensim

# gensim 4.0 renamed the `size` keyword to `vector_size`; passing `size` to a
# 4.x Word2Vec raises TypeError. Choose the keyword that matches the installed
# major version so the cell runs on both 3.x and 4.x.
_size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model = Word2Vec(
    sentences=tokenize_data,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
    **{_size_kwarg: 300},
)

In [49]:
# Persist the trained model to "reviewunit.model" (relative path, so it lands in the current working directory).
model.save("reviewunit.model")

# 모델 테스트

In [43]:
# Sanity-check the trained word2vec model.
print(type(model.wv.vectors))  # container type of the model's weight table

# Look up the embedding of 'side' once, then show its values and its shape.
side_vector = model.wv.get_vector('side')
print(side_vector)
print(side_vector.shape)

<class 'numpy.ndarray'>
[ 9.8201409e-02 -7.9777676e-01 -9.9643636e-01  1.5658544e+00
  1.6253623e+00 -8.1366825e-01  1.3394305e+00  1.3516600e+00
 -1.4351281e+00 -1.2797773e+00  1.9919756e-01 -8.4513271e-01
  5.8099866e-01  7.5499415e-02  2.0045714e+00  2.7587742e-01
 -4.9577966e-01  1.1168567e+00 -5.4723090e-01 -8.5726339e-01
 -1.4280276e-01  4.9019989e-01 -9.3238622e-01 -2.3033838e-01
  8.9625353e-01 -1.1134421e+00 -8.1651849e-01 -1.3427712e+00
  6.9832325e-01  5.9170914e-01  5.2460372e-01 -4.8411572e-01
  2.1508366e-01 -5.0861454e-01 -5.6738061e-01 -1.6511693e+00
  1.1079187e+00 -7.7671617e-01 -2.3573220e-01 -1.5581998e+00
  2.8272590e-01 -4.1950801e-01  8.9612281e-01 -1.8800481e-01
 -8.6894161e-01  6.3074791e-01  6.1592913e-01 -1.0326896e+00
  4.0035239e-01 -3.2331330e-01  3.0972663e-01  1.5684550e+00
 -2.1330318e+00 -1.3289324e+00  1.1481117e+00 -9.0026295e-01
  8.5818672e-01 -1.4753361e+00 -9.7495514e-01 -6.2327373e-01
  7.8418016e-02 -7.9003781e-02  1.6765706e-01  7.7329534e-01


In [47]:
# Count the distinct cleaned condition names.
distinct_conditions = df_train['condition_scdrop'].unique()
print(len(distinct_conditions))

663
