<a href="https://colab.research.google.com/github/Rhcsky/Colab-pytorch/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install konlpy
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchtext.data import Field, TabularDataset, Iterator, BucketIterator

import numpy as np
import pandas
import copy
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score

if torch.cuda.is_available():
  print("You can use cuda GPU")
  device = "cuda"
else:
  device = "cpu"

tokenizer = 'mecab_'

You can use cuda GPU


# Dataset Declare

In [0]:
%%time

ID = Field(sequential=False,use_vocab=False,batch_first=True)
TEXT = Field(use_vocab=True,include_lengths=False,batch_first=True)
LABEL = Field(sequential=False,use_vocab=False, batch_first = True, preprocessing = lambda x : int(x))
data_fields = [('id',ID),('year_month',None),('text',TEXT),('smishing',LABEL)]

train, val = TabularDataset.splits(path = '/content/drive/My Drive/Colab Notebooks/data',
                                   train = tokenizer + 'train_split.csv',validation = tokenizer +'val_split.csv', format = 'csv', skip_header = True, fields = data_fields)

CPU times: user 8.48 s, sys: 1.1 s, total: 9.59 s
Wall time: 11 s


In [0]:
print(train[0].text)
print(val[0].text)

['은JX', '행시NNG', '화XSN', '점NNG', '팀장NNG', '전화NNG', ':SC', '-SY', '-SY', '팩스NNG', ':SC', '-SY', '-SY', '입니다VCP+EF', '!SF', '!SF']
['봄NNG', '햇살NNG', '가득MAG', '한XSA+ETM', '기분NNG', '좋VA', '은ETM', '월요일NNG', '입니다VCP+EF', '.SF', '활기찬VA+ETM', '한주NNG', '되XSV', '세요EP+EF', '국NNG', '올림VV+EC']


In [0]:
%%time
dict_freq = 10
TEXT.build_vocab(train,min_freq = dict_freq)
len(TEXT.vocab)
# train[0].__dict__.keys()

CPU times: user 2.87 s, sys: 7.46 ms, total: 2.88 s
Wall time: 2.89 s


In [0]:
%%time
train_loader, val_loader = BucketIterator.splits((train,val),batch_size = 512, device = 'cuda', sort_key = lambda x: len(x.text),sort_within_batch=False, repeat=False, shuffle=False)

# ex_batch = next(train_loader.__iter__())
# print(ex_batch.__dict__keys())

CPU times: user 69 µs, sys: 9 µs, total: 78 µs
Wall time: 82.3 µs


In [0]:
class LSTM(nn.Module):
    def __init__(self,input_size,embed_size,hidden_size,output_size,num_layers=1,bidirec=False, drop_out = 0.3):
        super(LSTM,self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        if bidirec:
            self.num_directions = 2
        else:
            self.num_directions = 1

        if torch.cuda.is_available():
          device = "cuda"
        else:
          device = "cpu"  
                
        self.embed = nn.Embedding(input_size,embed_size)
        self.lstm = nn.LSTM(embed_size,hidden_size,num_layers,batch_first=True, bidirectional=bidirec, dropout=drop_out)
        self.linear = nn.Linear(hidden_size*self.num_directions,output_size)
        self.dropout = nn.Dropout(drop_out)

        
    def init_hidden(self,batch_size):
        # (num_layers * num_directions, batch_size, hidden_size)
        hidden = torch.zeros(self.num_layers*self.num_directions,batch_size,self.hidden_size,device = device)
        cell = torch.zeros(self.num_layers*self.num_directions,batch_size,self.hidden_size,device = device)
        return hidden, cell
    
    def forward(self,inputs):
        """
        inputs : B,T
        """
        embed = self.embed(inputs) # word vector indexing
        hidden, cell = self.init_hidden(inputs.size(0)) # initial hidden,cell
        output, (hidden,cell) = self.lstm(embed,(hidden,cell))

        # Many-to-One
        hidden = hidden[-self.num_directions:] # (num_directions,B,H)
        hidden = torch.cat([h for h in hidden],1)
        hidden = self.dropout(hidden)
        output = self.linear(hidden) # last hidden

        return output

In [0]:
embed = 8
hidden = 16
output = 1
drop_out = 0.3

model = LSTM(len(TEXT.vocab),embed,hidden,output,bidirec=True, num_layers = 2, drop_out = drop_out)
model = model.to('cuda')
loss_function = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(),lr=0.01)

parameters = "-".join([str(embed),str(hidden),str(drop_out)])
log_dir = "/content/drive/My Drive/Colab Notebooks/logs/LSTM_" + tokenizer

writer = SummaryWriter(log_dir = log_dir)

print(model)

LSTM(
  (embed): Embedding(13704, 8)
  (lstm): LSTM(8, 16, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (linear): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [0]:
EPOCH = 8
best_loss = 100

for epoch in tqdm_notebook(range(EPOCH),desc='EPOCH'):
  model.train()
  pred_list = []
  target_list = []
  trn_loss = 0
  for i,batch in enumerate(tqdm_notebook(train_loader,desc = "Train")):
      x = batch.text
      y = batch.smishing.float().unsqueeze(1)
      
      optimizer.zero_grad()
      preds = model(x)
      loss = loss_function(preds,y)
      loss.backward()
      optimizer.step()
      trn_loss += loss.item()
      preds = torch.sigmoid(preds)
      for idx in range(len(preds)):
        pred_list.append(preds[idx].item())
        target_list.append(y[idx].item())
  #scoring
  pred_numpy = np.asarray(pred_list)
  target_numpy = np.asarray(target_list)
  trn_score = roc_auc_score(target_numpy,pred_numpy)
  print(pred_numpy,target_numpy)
  for idx, a in enumerate(pred_numpy):
    if a>0.5:
      pred_numpy[idx] = 1
    else:
      pred_numpy[idx] = 0
  trn_error = len(pred_list) - (pred_numpy == target_numpy).sum()
  print(f'TRAIN score = {trn_score} || loss = {trn_loss} || ACC = {trn_error}')
  
  model.eval()
  pred_list=[]
  target_list = []
  val_loss = 0
  with torch.no_grad():
    for i, (x,y) in enumerate(tqdm_notebook(val_loader,desc="Val")): 
      x = batch.text.to('cuda')
      y = batch.smishing.float().to('cuda').unsqueeze(1)
      preds = model(x)
      loss = loss_function(preds,y)
      val_loss += loss.item()
      preds = torch.sigmoid(preds)      
      for idx in range(len(preds)):
        pred_list.append(preds[idx].item())
        target_list.append(y[idx].item())
  #scoring
  pred_numpy = np.asarray(pred_list)
  target_numpy = np.asarray(target_list)
  val_score = roc_auc_score(target_numpy,pred_numpy)
  print(pred_numpy,target_numpy)
  for idx, a in enumerate(pred_numpy):
    if a>0.5:
      pred_numpy[idx] = 1
    else:
      pred_numpy[idx] = 0
  val_error = len(pred_list) - (pred_numpy == target_numpy).sum()
  print(f'EVAL score = {val_score} || loss = {val_loss} || Error = {val_error}')
  
  writer.add_scalars(f'Loss/{dict_freq}',{'TRAIN' : trn_loss, 'VAL' : val_loss}, epoch)
  writer.add_scalars(f'AUC/{dict_freq}',{'TRAIN' : trn_score, 'VAL' : val_score}, epoch)
  writer.add_scalars(f'Error/{dict_freq}',{'TRAIN' : trn_error, 'VAL' : val_error}, epoch)

  if best_loss > val_loss:
    best_loss = val_loss
    best_model_wts = copy.deepcopy(model.state_dict())
    print("Best model is saved!!")
  if val_loss == 0:
    print("Optimization has been completed and will be terminated in advance.")
    break

model.load_state_dict(best_model_wts)
writer.close()

HBox(children=(IntProgress(value=0, description='EPOCH', max=8, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Train', max=405, style=ProgressStyle(description_width='initi…

[5.13644516e-01 5.23386002e-01 5.28489828e-01 ... 5.32072918e-05
 9.99400377e-01 4.18072479e-04] [0. 0. 1. ... 0. 1. 0.]
TRAIN score = 0.988044724355328 || loss = 21.096782766864635 || ACC = 3312


HBox(children=(IntProgress(value=0, description='Val', max=174, style=ProgressStyle(description_width='initial…

[9.49735695e-05 9.18879814e-05 1.00959878e-04 ... 9.14639022e-05
 9.97096777e-01 9.17978614e-05] [0. 0. 0. ... 0. 1. 0.]
EVAL score = 1.0 || loss = 0.04541989602148533 || Error = 0
Best model is saved!!


HBox(children=(IntProgress(value=0, description='Train', max=405, style=ProgressStyle(description_width='initi…

[3.59823898e-04 1.36797549e-04 9.97659922e-01 ... 1.91582515e-07
 9.96004403e-01 2.69187349e-05] [0. 0. 1. ... 0. 1. 0.]
TRAIN score = 0.9992156246406392 || loss = 1.276079312330694 || ACC = 111


HBox(children=(IntProgress(value=0, description='Val', max=174, style=ProgressStyle(description_width='initial…

[2.33624942e-05 2.14692172e-05 2.42774695e-05 ... 2.19549147e-05
 9.99755323e-01 2.30296882e-05] [0. 0. 0. ... 0. 1. 0.]
EVAL score = 1.0 || loss = 0.006718795171764214 || Error = 0
Best model is saved!!


HBox(children=(IntProgress(value=0, description='Train', max=405, style=ProgressStyle(description_width='initi…

[3.30103976e-05 1.87991609e-05 9.97363031e-01 ... 8.13673523e-06
 9.99159217e-01 6.02771979e-05] [0. 0. 1. ... 0. 1. 0.]
TRAIN score = 0.9995664464388412 || loss = 0.932219497175538 || ACC = 91


HBox(children=(IntProgress(value=0, description='Val', max=174, style=ProgressStyle(description_width='initial…

[1.60505879e-05 1.50437018e-05 1.69883551e-05 ... 1.51465001e-05
 9.99747336e-01 1.67754260e-05] [0. 0. 0. ... 0. 1. 0.]
EVAL score = 1.0 || loss = 0.005833290604641661 || Error = 0
Best model is saved!!


HBox(children=(IntProgress(value=0, description='Train', max=405, style=ProgressStyle(description_width='initi…

KeyboardInterrupt: ignored

# For TEST

In [0]:
#For overall training
%%time
alltrain = TabularDataset("/content/drive/My Drive/Colab Notebooks/data/" + tokenizer + "train.csv",format='csv',fields=data_fields, skip_header=True)
val = TabularDataset("/content/drive/My Drive/Colab Notebooks/data/" + tokenizer + "val_split.csv",format='csv',fields=data_fields, skip_header=True)

TEXT.build_vocab(alltrain,min_freq = 10)

alltrain_loader = Iterator(alltrain, batch_size=512, device=device, repeat=False, sort_key=lambda x: len(x.text),sort_within_batch=True, shuffle=False)
val_loader = Iterator(val, batch_size=512, device=device, repeat=False, sort_key=lambda x: len(x.text),sort_within_batch=True,shuffle=False)

In [0]:
EPOCH = 5
best_loss = 100

for epoch in tqdm_notebook(range(EPOCH),desc='EPOCH'):
  model.train()
  pred_list = []
  target_list = []
  trn_loss = 0
  for i,batch in enumerate(tqdm_notebook(train_loader,desc = "Train")):
      x = batch.text.to('cuda')
      y = batch.smishing.float().to('cuda').unsqueeze(1)
      
      optimizer.zero_grad()
      preds = model(x)
      loss = loss_function(preds,y)
      loss.backward()
      optimizer.step()
      trn_loss += loss.item()
      preds = torch.sigmoid(preds)
      for idx in range(len(preds)):
        pred_list.append(preds[idx].item())
        target_list.append(y[idx].item())

  #scoring
  pred_numpy = np.asarray(pred_list)
  target_numpy = np.asarray(target_list)
  trn_score = roc_auc_score(target_numpy,pred_numpy)
  print(pred_numpy,target_numpy)
  for idx, a in enumerate(pred_numpy):
    if a>0.5:
      pred_numpy[idx] = 1
    else:
      pred_numpy[idx] = 0
  trn_error = len(pred_list) - (pred_numpy == target_numpy).sum()
  print(f'TRAIN score = {trn_score} || loss = {trn_loss} || ACC = {trn_error}')
  
  model.eval()
  pred_list=[]
  target_list = []
  val_loss = 0
  with torch.no_grad():
    for i, (x,y) in enumerate(tqdm_notebook(val_dl,desc="Val")): 
      x = batch.text.to('cuda')
      y = batch.smishing.float().to('cuda').unsqueeze(1)
      preds = model(x)
      loss = loss_function(preds,y)
      val_loss += loss.item()
      preds = torch.sigmoid(preds)      
      for idx in range(len(preds)):
        pred_list.append(preds[idx].item())
        target_list.append(y[idx].item())
  #scoring
  pred_numpy = np.asarray(pred_list)
  target_numpy = np.asarray(target_list)
  val_score = roc_auc_score(target_numpy,pred_numpy)
  print(pred_numpy,target_numpy)
  for idx, a in enumerate(pred_numpy):
    if a>0.5:
      pred_numpy[idx] = 1
    else:
      pred_numpy[idx] = 0
  val_error = len(pred_list) - (pred_numpy == target_numpy).sum()
  print(f'EVAL score = {val_score} || loss = {val_loss} || Error = {val_error}')
  
  # writer.add_scalars('Loss/trn-val',{'TRAIN' : trn_loss, 'VAL' : val_loss}, epoch)
  # writer.add_scalars('AUC/trn-val',{'TRAIN' : trn_score, 'VAL' : val_score}, epoch)
  # writer.add_scalars('Error/trn-val',{'TRAIN' : trn_error, 'VAL' : val_error}, epoch)

  if best_loss > val_loss:
    best_loss = val_loss
    best_model_wts = copy.deepcopy(model.state_dict())
    print("Best model is saved!!")
  if val_loss == 0:
    print("Optimization has been completed and will be terminated in advance.")
    break

model.load_state_dict(best_model_wts)

In [0]:
model.load_state_dict(best_model_wts)

In [0]:
test_fields = [('id',ID),('year',None),('text',TEXT)]
test = TabularDataset("/content/drive/My Drive/Colab Notebooks/data/" + tokenizer + "public_test.csv",format='csv',fields=test_fields, skip_header=True)
test_loader = Iterator(test, batch_size=1000, device=device, repeat=False, shuffle=False)

In [0]:
ids = []
labels = []
pred_list = []

model.eval()
for i, batch in enumerate(tqdm_notebook(test_loader)):
    pred = model(batch.text)
    pred = torch.sigmoid(pred)

    for idx in range(len(pred)):
      ids.append(batch.id[idx].item())
      pred_list.append(pred[idx].item())
      labels.append(torch.round(pred[idx]).item())


result_data = pandas.DataFrame({'id':ids,'smishing':pred_list})
result_dataL = pandas.DataFrame({'id':ids,'smishing':pred_list, 'label':labels})
print(result_data)
# result_data.to_csv('/content/drive/My Drive/Colab Notebooks/data/result_data.csv',index=False)

In [0]:
cnt = 0

for idx, a in enumerate(labels):
  if a == 1:
    cnt += 1

print(cnt)

In [0]:
eouchan = pandas.read_csv("/content/drive/My Drive/Colab Notebooks/data/submission.csv")

you = []
my = []
co = []

for e,m,id in zip(eouchan.label,labels,ids):
  if e == m:
    co.append(id)
  elif e>m:
    you.append(id)
  else:
    my.append(id)

print(f'my = {len(my)}\n{my}')
print(f'you = {len(you)}\n{you}')
print(f'co = {len(co)}\n{co}')

print(len(my)/len(ids))

In [0]:
Path = '/content/drive/My Drive/Colab Notebooks/data/model/model.pt'
Path2 = '/content/drive/My Drive/Colab Notebooks/data/model/remodel.pt'
Path3 = '/content/drive/My Drive/Colab Notebooks/data/model/allmodel.pt'

state = {
    'epoch' : EPOCH,
    'state_dict' : model.state_dict(),
    'optimizer' : optimizer.state_dict(),
    'loss' : loss_function
}
torch.save(model.state_dict(),Path)
torch.save(state,Path2)
torch.save(model,Path3)