# 전처리

In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
import re

In [3]:
df_train = pd.read_excel("https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/5/lem_train2.xlsx")
df_test = pd.read_excel("https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/dataset/5/lem_test2.xlsx")

In [4]:
df_train = df_train.drop(['Unnamed: 0', 'level_0'], axis=1)
df_test = df_test.drop(['Unnamed: 0', 'level_0'], axis=1)

df_train = df_train.dropna(how='any').reset_index()
df_test = df_test.dropna(how='any').reset_index()

In [5]:
df_train

Unnamed: 0,index,uniqueID,drugName,condition,review,rating,date,usefulCount
0,0,206461,Valsartan,leftventriculardysfunction,side effect take combination bystolic mg fish oil,9,20-May-12,27
1,1,95260,Guanfacine,adhd,son halfway fourth week intuniv . become conce...,8,27-Apr-10,192
2,2,92703,Lybrel,birthcontrol,use take another oral contraceptive pill cycle...,5,14-Dec-09,17
3,3,138000,Ortho Evra,birthcontrol,first time use form birthcontrol . glad go pat...,8,3-Nov-15,10
4,4,35696,Buprenorphine / naloxone,opiatedependence,suboxone completely turn life around . feel he...,9,27-Nov-16,37
...,...,...,...,...,...,...,...,...
159487,159493,191035,Campral,alcoholdependence,write first report mid october . alcohol since...,10,31-May-15,125
159488,159494,127085,Metoclopramide,nauseavomiting,give iv surgey . immediately become anxious co...,1,1-Nov-11,34
159489,159495,187382,Orencia,rheumatoidarthritis,limited improvement month developed bad rash m...,2,15-Mar-14,35
159490,159496,47128,Thyroid desiccated,underactivethyroid,thyroid medication year spent first synthroid ...,10,19-Sep-15,79


In [6]:
conditions = []
reviews = []
# review split
for i in range(0, len(df_train)):
  sentences = re.split('[.!?]', str(df_train.loc[i, 'review']))
  sentences = list(map(lambda x: x.strip(), sentences))
  sentences = list(filter(lambda x: x != '', sentences))
# make list for making new dataframe(this dataframe uses word2vec model)
  for sentence in sentences:
    conditions.append(df_train.loc[i, 'condition'])
    reviews.append(sentence)

In [7]:
conditions_test = []
reviews_test = []
# review split
for i in range(0, len(df_train)):
  sentences = re.split('[.!?]', str(df_train.loc[i, 'review']))
  sentences = list(map(lambda x: x.strip(), sentences))
  sentences = list(filter(lambda x: x != '', sentences))
# make list for making new dataframe(this dataframe uses word2vec model)
  for sentence in sentences:
    conditions_test.append(df_train.loc[i, 'condition'])
    reviews_test.append(sentence)

In [8]:
df_train_sentence = pd.DataFrame(data=list(zip(reviews, conditions)), columns=['review', 'condition'])

In [9]:
df_test_sentence = pd.DataFrame(data=list(zip(reviews_test, conditions_test)), columns=['review', 'condition'])

In [10]:
df_train_sentence

Unnamed: 0,review,condition
0,side effect take combination bystolic mg fish oil,leftventriculardysfunction
1,son halfway fourth week intuniv,adhd
2,become concerned begin last week start take hi...,adhd
3,two day could hardly get bed cranky slept near...,adhd
4,call doctor monday morning say stick day,adhd
...,...,...
987673,doctor start amitiza mg miracle,constipation chronic
987674,four month,constipation chronic
987675,life day bowel motion go comfortably time per ...,constipation chronic
987676,pain little gas,constipation chronic


In [11]:
len(df_train_sentence['condition'].unique())

809

In [12]:
class DatasetParam:
  def __init__(self):
    self.w2v_path = 'https://raw.githubusercontent.com/SNMHZ/Drug_Recommendation/master/model/ver1.0/sentenceunit_word2vec.model'
    self.max_len = 30
    self.batch_size = 4096

In [13]:
param = DatasetParam()

In [14]:
class ReviewConditionDataset:
    def __init__(self, args, condition_info):
        w2v = Word2Vec.load(args.w2v_path)
        self.word_id_dict = {word: i + 2 for i, word in enumerate(w2v.wv.index2word)}
        self.word_id_dict['<pad>'] = 0
        self.word_id_dict['<unknown_word>'] = 1
        self.vocab_size = len(self.word_id_dict)
        self.embedding_dim = w2v.wv.vectors.shape[1]
        self.embeddings = np.append(np.zeros((2, self.embedding_dim), dtype=float), w2v.wv.vectors, axis=0)
        self.pad_id = self.word_id_dict['<pad>']

        self.cat_id_dict = {str(cat_id): i + 1 for i, cat_id in enumerate(condition_info)}
        self.cat_id_dict['<unknown_cat_id>'] = 0

        self.max_len = args.max_len
        self.batch_size = args.batch_size

    def word2id(self, word):
        return self.word_id_dict.get(word) if self.word_id_dict.get(word) is not None else self.word_id_dict['<unknown_word>']

    def sentence2ids(self, sentence, separator=' '):
        words = sentence.split(separator)
        words = words[0:min(self.max_len, len(words))]
        ids = [self.word2id(word) for word in words]
        if len(ids) < self.max_len:
            ids.extend([self.word2id('<pad>')] * (self.max_len - len(ids)))
        return ids

    def cat_id2ids(self, cat_id):
        return self.cat_id_dict.get(cat_id) if self.cat_id_dict.get(cat_id) is not None else self.cat_id_dict['<unknown_cat_id>']

    def prepare_dataframe(self, df, include_cat_id=False):
        df['input_x'] = df['review'].apply(self.sentence2ids)

        if include_cat_id:
            df['input_y'] = df['condition'].apply(self.cat_id2ids)

        return df

    def get_torch_loader(self, _df, include_cat_id=True):
        df = self.prepare_dataframe(_df, include_cat_id=include_cat_id)

        input_x = torch.LongTensor(df['input_x'].to_list())

        if include_cat_id:
            input_y = torch.LongTensor(df['input_y'].to_list())
            dataset = TensorDataset(input_x, input_y)
        else:
            dataset = TensorDataset(input_x)

        loader = DataLoader(dataset, self.batch_size)
        return loader

In [15]:
dataset = ReviewConditionDataset(param, df_train_sentence['condition'].unique())

In [16]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla K80


In [17]:
class ModelParam:
  def __init__(self):
    self.kernel_depth = 100
    self.kernel_sizes = [3, 4]
    self.num_classes = 810
    self.dropout = 0.5

In [18]:
len(df_train_sentence['condition'].unique())

809

In [19]:
mparam = ModelParam()

In [20]:
class TextCNN(nn.Module):
    def __init__(self, args, embeddings=None, vocab_size=None):
        super(TextCNN, self).__init__()

        if embeddings is not None:
            self.embedding_dim = embeddings.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embeddings=embeddings,
                                                          freeze=0)
        else:
            self.embedding_dim = 300
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=300)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=self.embedding_dim,
                      out_channels=args.kernel_depth,
                      kernel_size=args.kernel_sizes[i])
            for i in range(len(args.kernel_sizes))
        ])

        self.fc = nn.Linear(len(args.kernel_sizes) * args.kernel_depth, args.num_classes)
        self.dropout = nn.Dropout(p=args.dropout)

    def forward(self, input_x):
        x_embed = self.embedding(input_x)
        x_embed = x_embed.permute(0, 2, 1)
        x_convs = [F.relu(conv1d(x_embed)) for conv1d in self.convs]
        x_pools = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_convs]
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pools], dim=1)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [21]:
model = TextCNN(mparam, embeddings=torch.FloatTensor(dataset.embeddings))

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
train_sentence, valid_sentence = train_test_split(df_train_sentence, test_size=0.2, random_state=777)

In [24]:
train_sentence

Unnamed: 0,review,condition
123164,past month month half sex drive virtually none...,birthcontrol
302450,well change since wound er last weekend terrib...,constipation chronic
495101,terrible sinus infection,sinusitis
531505,last week,acne
960046,doctor prescribed ambien help sleept hour,insomnia
...,...,...
106071,medication take acute asthma,asthmamaintenance
183206,would recommend treatment anyone plantar wart,humanpapillomavirus
474683,hope find solution work,musclespasm
899887,really hard figure next period,birthcontrol


In [25]:
train_loader = dataset.get_torch_loader(train_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [26]:
valid_loader = dataset.get_torch_loader(valid_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
# for batch in train_loader:
#   print(batch)

In [28]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla K80


In [29]:
def train(model, train_loader, valid_loader):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        print('Device name:', torch.cuda.get_device_name(0))

    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(10):
        model.train()

        train_loss = 0
        for batch in train_loader:
            batch_input_x, batch_input_y = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model(batch_input_x)

            loss = loss_fn(logits, batch_input_y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        model.eval()

        val_loss = 0
        val_accuracy = []
        for batch in valid_loader:
            batch_input_x, batch_input_y = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                logits = model(batch_input_x)

            loss = loss_fn(logits, batch_input_y)
            val_loss += loss.item()

            preds = torch.argmax(logits, dim=1).flatten()

            accuracy = (preds == batch_input_y).cpu().numpy().mean()
            val_accuracy.append(accuracy)

        print('epoch: ', epoch, 'train_loss: ', train_loss/len(train_loader), 'val_loss: ', val_loss/len(valid_loader), 'val_accuracy: ', np.mean(val_accuracy))

    return model

# train

## 리뷰단위 임베딩

In [175]:
train(model, train_loader, valid_loader)

There are 1 GPU(s) available.
Device name: Tesla T4


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


epoch:  0 train_loss:  3.769276152658199 val_loss:  3.162481777003554 val_accuracy:  0.34920378965870547
epoch:  1 train_loss:  3.1490750220599097 val_loss:  2.9424512581747084 val_accuracy:  0.38334505502755795
epoch:  2 train_loss:  2.966908146663265 val_loss:  2.819185374213047 val_accuracy:  0.4015516523724562
epoch:  3 train_loss:  2.8635351710556622 val_loss:  2.7510881463035206 val_accuracy:  0.411733379116026
epoch:  4 train_loss:  2.798826354643258 val_loss:  2.7114475711447295 val_accuracy:  0.41715399104366874
epoch:  5 train_loss:  2.7545140211094807 val_loss:  2.683942853427324 val_accuracy:  0.4209340442163652
epoch:  6 train_loss:  2.7204134595986886 val_loss:  2.6652769573399278 val_accuracy:  0.42340045797413794
epoch:  7 train_loss:  2.6944445707521387 val_loss:  2.650431179609455 val_accuracy:  0.4258644979508197
epoch:  8 train_loss:  2.671147885243537 val_loss:  2.639252776005229 val_accuracy:  0.4267379941881006
epoch:  9 train_loss:  2.6503506418091156 val_loss: 

TextCNN(
  (embedding): Embedding(8900, 300)
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=4260, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [30]:
train(model, train_loader, valid_loader)

There are 1 GPU(s) available.
Device name: Tesla T4


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


epoch:  0 train_loss:  3.7018832357436264 val_loss:  3.1773924000409184 val_accuracy:  0.3475504156403941
epoch:  1 train_loss:  3.177360592728452 val_loss:  2.955463166139564 val_accuracy:  0.382214260973786
epoch:  2 train_loss:  3.0023150629330178 val_loss:  2.8373393379912084 val_accuracy:  0.4004272203223962
epoch:  3 train_loss:  2.9058743733816197 val_loss:  2.7716620503639686 val_accuracy:  0.4109907264360486
epoch:  4 train_loss:  2.8478613900397107 val_loss:  2.7332024233681813 val_accuracy:  0.41576048859737863
epoch:  5 train_loss:  2.8066183273038714 val_loss:  2.7081340673018475 val_accuracy:  0.41879137161330043
epoch:  6 train_loss:  2.7784550745870167 val_loss:  2.689738998607713 val_accuracy:  0.4212850078070021
epoch:  7 train_loss:  2.7549744467661172 val_loss:  2.677372372880274 val_accuracy:  0.4230903770452146
epoch:  8 train_loss:  2.7355428322609225 val_loss:  2.6652971384476642 val_accuracy:  0.4246809850017593
epoch:  9 train_loss:  2.7182813506052286 val_los

TextCNN(
  (embedding): Embedding(8900, 300)
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
  )
  (fc): Linear(in_features=200, out_features=810, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

## 문장단위 임베딩

In [30]:
train(model, train_loader, valid_loader)

There are 1 GPU(s) available.
Device name: Tesla K80


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


epoch:  0 train_loss:  3.825959944354438 val_loss:  3.2371867530199947 val_accuracy:  0.33626204724885644
epoch:  1 train_loss:  3.190945390592585 val_loss:  2.9422106645545183 val_accuracy:  0.38806006306078467
epoch:  2 train_loss:  2.9869136872069206 val_loss:  2.817788537667722 val_accuracy:  0.40619502111189304
epoch:  3 train_loss:  2.888433101881353 val_loss:  2.7544552695994473 val_accuracy:  0.414724480174613
epoch:  4 train_loss:  2.828669913692178 val_loss:  2.7159814591310463 val_accuracy:  0.41981896139382474
epoch:  5 train_loss:  2.786704582253886 val_loss:  2.6900562023629946 val_accuracy:  0.42321906200519
epoch:  6 train_loss:  2.7546024730168477 val_loss:  2.6693669922497807 val_accuracy:  0.42595185636215693
epoch:  7 train_loss:  2.728056403639403 val_loss:  2.6559947753439146 val_accuracy:  0.42724437565974666
epoch:  8 train_loss:  2.7061315731681073 val_loss:  2.6429831251806144 val_accuracy:  0.42899115802032023
epoch:  9 train_loss:  2.686686040206277 val_loss

TextCNN(
  (embedding): Embedding(9193, 300)
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
  )
  (fc): Linear(in_features=200, out_features=810, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# test


In [31]:
def predict(model, test_loader):
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        print('Device name:', torch.cuda.get_device_name(0))

    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    model.to(device)

    model.eval()

    result = []
    for batch in test_loader:
        batch_input_x = batch[0].to(device)

        with torch.no_grad():
            logits = model(batch_input_x)

        preds = torch.argmax(logits, dim=1).flatten()
        result.extend(preds.cpu().numpy())

    return result

In [32]:
test_loader = dataset.get_torch_loader(df_test_sentence)

## 리뷰단위 임베딩 2

In [37]:
df_test_sentence['predict'] = predict(model, test_loader)

There are 1 GPU(s) available.
Device name: Tesla T4


In [40]:
len(df_test_sentence[ df_test_sentence['input_y']==df_test_sentence['predict']])

431067

In [41]:
len(df_test_sentence)

987678

In [42]:
431067/987678

0.43644487373415225

## 문장단위 임베딩

In [33]:
df_test_sentence['predict'] = predict(model, test_loader)

There are 1 GPU(s) available.
Device name: Tesla K80


In [34]:
len(df_test_sentence[ df_test_sentence['input_y']==df_test_sentence['predict']]) / len(df_test_sentence)

0.4432021367287719