In [2]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [3]:
# Load the train/test review CSVs from the project's GitHub repository
# (presumably lemmatized text, judging by the `lem_` file names — confirm).
df_train = pd.read_csv("https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/3/lem_train.csv")
df_test = pd.read_csv("https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/3/lem_test.csv")

In [11]:
# Load the pretrained Word2Vec model; it is used below to build document vectors
# and to fill the Embedding layer's weight matrix.
# NOTE(review): gensim's `load` is pickle-based, so only load model files from a
# source you trust; consider downloading the file once and loading a local copy.
wv_model = Word2Vec.load("https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/model/ver1.0/reviewunit_word2vec.model")

## 필요없는 columns과 결측값을 가지고 있는 row 제거

In [4]:
# Remove the leftover index columns, then discard every row that still has a
# missing value. The same cleaning is applied to both splits.
df_train = (
    df_train
    .drop(columns=['Unnamed: 0', 'level_0'])
    .dropna(how='any')
)
df_test = (
    df_test
    .drop(columns=['Unnamed: 0', 'level_0'])
    .dropna(how='any')
)

## Functions

In [5]:
# Compute a document vector for each document as the mean of its word vectors.
def doc_vectors(document_list):
    """Average the Word2Vec vectors of each document's in-vocabulary words.

    Args:
        document_list: iterable of documents, each a whitespace-separated string.

    Returns:
        A tuple ``(document_embedding_list, not_embedding_list)``:
        ``document_embedding_list`` has one entry per document, either the mean
        word vector or ``None`` when no word of the document is in the model's
        vocabulary; ``not_embedding_list`` holds the positions of those
        ``None`` entries.
    """
    # Query the KeyedVectors (`.wv`) object rather than the Word2Vec model:
    # `model[word]` and `model.wv.vocab` are deprecated in gensim 3 and gone in
    # gensim 4, while `word in model.wv` / `model.wv[word]` work in both.
    keyed_vectors = wv_model.wv

    document_embedding_list = []
    not_embedding_list = []

    for index, line in enumerate(document_list):
        vector_sum = None
        count = 0
        for word in line.split():
            if word not in keyed_vectors:
                continue  # out-of-vocabulary words do not contribute
            word_vector = keyed_vectors[word]
            vector_sum = word_vector if vector_sum is None else vector_sum + word_vector
            count += 1

        if vector_sum is None:
            # Keep the output aligned with the input: placeholder plus its position.
            document_embedding_list.append(None)
            not_embedding_list.append(index)
        else:
            document_embedding_list.append(vector_sum / count)

    return document_embedding_list, not_embedding_list

In [6]:
# Build the one-hot condition vectors.
def con_vectors(condition_list):
    """Return one one-hot row vector per entry of `condition_list`.

    `condition_list` must provide `.unique()` (e.g. a pandas Series). Each
    returned item is an object-dtype array of shape (1, n_unique) holding 1 at
    the position of the entry's value within `condition_list.unique()` and 0
    everywhere else.
    """
    categories = condition_list.unique()
    width = len(categories)

    def _one_hot(label):
        # Fresh all-zero row, then switch on the slot(s) matching this label.
        row = np.zeros((1, width), dtype='object')
        row[0][np.where(categories == label)] = 1
        return row

    return [_one_hot(label) for label in condition_list]

## 정수인코딩 - 컨디션 입력

In [7]:
# Integer-encode the conditions: each condition gets the position of its first
# appearance in the training data.
condition_list = df_train['condition'].unique()

# Build the label -> index lookup once. The previous per-row
# `list(condition_list).index(condition)` rebuilt the list and rescanned it for
# every review, which is needlessly quadratic on a large frame.
condition_to_index = {condition: index for index, condition in enumerate(condition_list)}
condition_number = [condition_to_index[condition] for condition in df_train['condition']]

In [8]:
# cn is 'condition number': the integer id computed above for each row's condition.
df_train['cn'] = condition_number

In [9]:
# Sanity check of the encoding: every 'ADHD' row should carry the same cn value.
df_train.loc[df_train['condition']=='ADHD', ['condition', 'cn']]

Unnamed: 0,condition,cn
1,ADHD,1
27,ADHD,1
50,ADHD,1
114,ADHD,1
207,ADHD,1
...,...,...
159179,ADHD,1
159200,ADHD,1
159286,ADHD,1
159313,ADHD,1


## doc vectors 입력

In [12]:
# Compute the document vectors (mean word vector per review) and the one-hot
# condition vectors for the training frame.
document_embedding_list, not_embedding_list = doc_vectors(df_train['review'])
target_list = con_vectors(df_train['condition'])
# Report how many reviews were processed and how many had no in-vocabulary word.
print('문서 벡터의 수 :',len(document_embedding_list))
print('입력 안 된 index 수:', len(not_embedding_list))

  
  app.launch_new_instance()


문서 벡터의 수 : 159363
입력 안 된 index 수: 24


In [13]:
# Column legend:
#   dv is 'document vector'
#   cv is 'condition vector'
# Attach both columns, then drop rows with any missing value; this removes the
# reviews whose document vector is None (no in-vocabulary word).
df_train = (
    df_train
    .assign(dv=document_embedding_list, cv=target_list)
    .dropna(how='any')
)

## 정수 인코딩 - 리뷰 입력

In [14]:
import nltk
# Download the 'punkt' tokenizer data (presumably what word_tokenize below relies on).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
# Tokenize every review; str() makes sure word_tokenize always receives a string.
word_token_list = [word_tokenize(str(sentence)) for sentence in df_train['review']]

In [16]:
# wt is 'word tokenize': the list of tokens of each review.
df_train['wt'] = word_token_list

In [17]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [18]:
# Fit the Keras tokenizer on the tokenized reviews to build the word -> integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['wt'])

In [19]:
# Replace every token with its integer index (one list of ints per review).
integer_encoding = tokenizer.texts_to_sequences(df_train['wt'])

In [20]:
# ie is 'word integer encoding': the integer sequence of each review.
df_train['ie'] = integer_encoding

In [21]:
# Preview the first rows, including the derived cn / dv / cv / wt / ie columns.
df_train.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,cn,dv,cv,wt,ie
0,206461,Valsartan,Left Ventricular Dysfunction,side effect take combination bystolic mg fish oil,9,2012-05-20,27,0,"[-0.18059501, -0.26552644, -0.06564915, 0.3360...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[side, effect, take, combination, bystolic, mg...","[11, 10, 1, 600, 2216, 15, 2940, 1167]"
1,95260,Guanfacine,ADHD,son halfway fourth week intuniv become concern...,8,2010-04-27,192,1,"[0.061731283, 0.041840646, 0.087043226, 0.1238...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[son, halfway, fourth, week, intuniv, become, ...","[446, 2521, 1270, 8, 2505, 155, 970, 121, 46, ..."
2,92703,Lybrel,Birth Control,use take another oral contraceptive pill cycle...,5,2009-12-14,17,2,"[0.10675427, -0.1484156, 0.13022046, 0.1530636...","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[use, take, another, oral, contraceptive, pill...","[22, 1, 156, 928, 1242, 20, 337, 154, 177, 23,..."
3,138000,Ortho Evra,Birth Control,first time use form birth control glad go patc...,8,2015-11-03,10,2,"[-0.019708635, -0.09204713, 0.15514378, -0.007...","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[first, time, use, form, birth, control, glad,...","[16, 13, 22, 431, 63, 43, 428, 5, 228, 4, 16, ..."
4,35696,Buprenorphine / naloxone,Opiate Dependence,suboxone completely turn life around feel heal...,9,2016-11-27,37,3,"[0.16292676, 0.020405171, 0.12444847, -0.03782...","[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[suboxone, completely, turn, life, around, fee...","[719, 152, 344, 36, 117, 12, 1837, 6822, 302, ..."


## keras model preprocessing

In [22]:
# Add padding so that all integer encodings have the same size.

max_len = 100 # maximum review length (shorter reviews get 0-padding at the front / longer ones are cut)
# NOTE(review): pad_sequences returns a single 2-D array (one row of max_len ints
# per review), but it is assigned to ONE DataFrame column here. Depending on the
# pandas version this presumably either raises or keeps only part of the matrix,
# so df_train['ie'] should not be trusted as model input after this line — keep
# the padded matrix in its own variable instead. TODO confirm and fix together
# with the cell that builds X_train.
df_train['ie'] = pad_sequences(df_train['ie'], maxlen=max_len)

In [23]:
# Inspect the integer condition labels that will be one-hot encoded next.
df_train['cn'].values

array([ 0,  1,  2, ..., 18, 46, 73])

In [25]:
# One-hot encode the condition numbers; no num_classes is passed, so the width
# of `target` is inferred by to_categorical from the label values.

target = to_categorical(df_train['cn'].values)

In [57]:
# Build the model input directly from the integer-encoded reviews instead of
# reading it back from df_train['ie']: pad_sequences returns a 2-D
# (n_reviews, max_len) matrix, which a single DataFrame column cannot hold, so
# the column is not a reliable source for the padded sequences.
# `integer_encoding` was produced from the current rows of df_train, so it is
# row-aligned with `target`.
padded_reviews = pad_sequences(integer_encoding, maxlen=max_len)

# Hold out part of the data (default split size) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(padded_reviews, target, shuffle=True, random_state=16)

In [34]:
# Number of distinct conditions remaining in the training frame.
output_len = len(df_train['condition'].unique())

In [35]:
# Vocabulary size for the Embedding layer; +1 presumably accounts for index 0,
# which the padding uses and the tokenizer does not assign to any word.
vocab_size = len(tokenizer.word_index) + 1

## w2v model load


```python
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
 
print(word2vec_model.vectors.shape) # (3000000, 300)
 
embedding_matrix = np.zeros((vocab_size, 300))
# 단어 집합 크기의 행과 300개의 열을 가지는 행렬 생성. 값은 전부 0으로 채워진다.
print(np.shape(embedding_matrix)) # (16, 300)
 
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None
 
for word, i in t.word_index.items(): # 훈련 데이터의 단어 집합에서 단어와 정수 인덱스를 1개씩 꺼내온다.
    temp = get_vector(word) # 단어(key) 해당되는 임베딩 벡터의 300개의 값(value)를 임시 변수에 저장
    if temp is not None: # 만약 None이 아니라면 임베딩 벡터의 값을 리턴받은 것이므로
        embedding_matrix[i] = temp # 해당 단어 위치의 행에 벡터의 값을 저장한다.
 
 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
 
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)


출처: https://ebbnflow.tistory.com/154 [Dev Log : 삶은 확률의 구름]
```

In [30]:
print(wv_model.vector_size) # 300-dimensional vectors

300


In [36]:
# Create an all-zero matrix with one row per vocabulary index and one column per
# Word2Vec dimension. The width is read from the model rather than hard-coded,
# so the matrix always matches the vectors copied into it.
embedding_matrix = np.zeros((vocab_size, wv_model.vector_size))
print(np.shape(embedding_matrix)) # (56097, 300)

(56097, 300)


In [40]:
# Look a word up in the Word2Vec model.
def get_vector(word):
    """Return the Word2Vec vector of `word`, or None when it is out of vocabulary.

    The lookup goes through `wv_model.wv` (the KeyedVectors object):
    `word in model` / `model[word]` on the Word2Vec object itself are
    deprecated in gensim 3 and removed in gensim 4, while the `.wv` form works
    in both.
    """
    keyed_vectors = wv_model.wv
    if word in keyed_vectors:
        return keyed_vectors[word]
    return None

In [41]:
# Copy the pretrained vector of every known word into its row of the embedding matrix.
for token, row_index in tokenizer.word_index.items():  # (word, integer index) pairs of the training vocabulary
    vector = get_vector(token)  # embedding of the word, or None when the model does not know it
    if vector is None:
        continue  # nothing to copy: the row keeps its initial values
    embedding_matrix[row_index] = vector

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [53]:
# Frozen pretrained embeddings -> LSTM -> softmax over the conditions.
# The layer sizes that must agree with the data are derived from the arrays
# themselves instead of being hard-coded (previously 300 and 810), so the model
# cannot silently drift out of sync with the embedding matrix or the labels.
embedding_dim = embedding_matrix.shape[1]  # width of the pretrained vectors
num_classes = target.shape[1]              # width of the one-hot labels

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)) # vocab_size=number of words, output_dim=embedding size, input_length=input length
model.add(LSTM(300)) # size of the memory cell's hidden state
model.add(Dense(num_classes, activation='softmax')) # one output probability per condition class

In [54]:
# Stop training once val_loss has not improved for 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# Save the model to best_model.h5 only when val_acc improves.
# NOTE(review): stopping is driven by val_loss while checkpointing is driven by
# val_acc — confirm that monitoring two different metrics is intended.
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [55]:
# Multi-class setup: categorical cross-entropy on the one-hot targets, accuracy reported as 'acc'.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [58]:
# Train with early stopping and best-model checkpointing.
# NOTE(review): X_test/y_test are used as the validation data here, which drives
# both callbacks; the same split is evaluated again later as the "test" set, so
# that accuracy is not measured on data unseen during model selection.
history = model.fit(X_train, y_train, batch_size=128, epochs=30, callbacks=[es, mc], validation_data=(X_test, y_test))

Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.17994, saving model to best_model.h5
Epoch 2/30

Epoch 00002: val_acc improved from 0.17994 to 0.18007, saving model to best_model.h5
Epoch 3/30

Epoch 00003: val_acc improved from 0.18007 to 0.18012, saving model to best_model.h5
Epoch 4/30

Epoch 00004: val_acc improved from 0.18012 to 0.18019, saving model to best_model.h5
Epoch 5/30

Epoch 00005: val_acc improved from 0.18019 to 0.18022, saving model to best_model.h5
Epoch 6/30

Epoch 00006: val_acc improved from 0.18022 to 0.18034, saving model to best_model.h5
Epoch 7/30

Epoch 00007: val_acc improved from 0.18034 to 0.18044, saving model to best_model.h5
Epoch 8/30

Epoch 00008: val_acc did not improve from 0.18044
Epoch 9/30

Epoch 00009: val_acc did not improve from 0.18044
Epoch 10/30

Epoch 00010: val_acc improved from 0.18044 to 0.18047, saving model to best_model.h5
Epoch 00010: early stopping


In [59]:
# Reload the checkpoint saved by ModelCheckpoint and report its accuracy
# (index 1 of evaluate()'s result; index 0 is the loss).
loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.1805


In [60]:
# Full evaluation result as [loss, acc].
# NOTE(review): this repeats the evaluate() call already made when printing the accuracy.
result = loaded_model.evaluate(X_test, y_test)
result



[4.341958999633789, 0.18046943843364716]