<a href="https://colab.research.google.com/github/Rhcsky/Colab-pytorch/blob/master/CNN_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Preparing Data

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchtext.data import Field, LabelField, TabularDataset, Iterator, BucketIterator
from torchtext import datasets

import numpy as np
import pandas
import copy
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

tokenizer = 'soy_'

cuda


In [0]:
from collections import Counter
min_freq = 10
morphs = ['/OL']
X = pandas.read_csv('/content/drive/My Drive/Colab Notebooks/data/soy_train.csv').text.tolist()
X_text = [word for sent in X for word in sent.split()]
word_counter = sorted(Counter(X_text).items(), key = lambda x: -x[1])
word_dictionary = [word for word, freq in word_counter if freq > min_freq and all([morph in word for morph in morphs])]

In [0]:
def custom_tokenizer(sent):
  splited = [word for word in sent.split() if word in word_dictionary]
  return splited

In [0]:
%%time

ID = Field(sequential=False,use_vocab=False,batch_first=True)
# TEXT = Field(tokenize = lambda x: custom_tokenizer(x), use_vocab=True,include_lengths=False,batch_first=True)
TEXT = Field(use_vocab=True,include_lengths=False,batch_first=True)
LABEL = LabelField(sequential=False,use_vocab=False, batch_first = True, preprocessing = lambda x : int(x),dtype = torch.float)
data_fields = [('id',ID),('year_month',None),('text',TEXT),('smishing',LABEL)]

In [0]:
train_data = TabularDataset('/content/drive/My Drive/Colab Notebooks/rm_duplicate/' + tokenizer + 'train.csv','csv',data_fields,skip_header = True)

In [0]:
import random
train_data,valid_data = train_data.split(split_ratio=0.6,random_state=random.seed(1234),stratified=True, strata_field='smishing')

In [0]:
valid_data, test_data = valid_data.split(split_ratio=0.6,random_state=random.seed(1234),stratified=True, strata_field='smishing')

Build the vocab and load the pre-trained word embeddings.

In [0]:
%%time
dict_freq = 10
TEXT.build_vocab(train_data, min_freq = dict_freq)
print(len(TEXT.vocab))
# train[0].__dict__.keys()

As before, we create the iterators.

In [0]:
BATCH_SIZE = 1024

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=False
    )

## Build the Model

In [0]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [0]:
torch.cuda.empty_cache()

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 128
N_FILTERS = 128
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

optimizer = optim.Adam(model.parameters(),0.001)
criterion = nn.BCEWithLogitsLoss(reduction='sum')

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

## Train the Model

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    error = len(correct) - correct.sum()
    return acc, error

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_error = 0
    
    model.train()
    
    for batch in tqdm_notebook(iterator,desc='TRAIN'):
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.smishing)
        
        acc, error = binary_accuracy(predictions, batch.smishing)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_error += error.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_error

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_error = 0
    error_list = []

    model.eval()
    with torch.no_grad():
    
        for batch in tqdm_notebook(iterator,desc='EVAL '):

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.smishing)
            
            acc,error = binary_accuracy(predictions, batch.smishing)

            rounded_preds = torch.round(torch.sigmoid(predictions))

            noncorrect = (rounded_preds != batch.smishing).float()

            for non, idx in zip(noncorrect,batch.id):
              if non == 1:
                error_list.append(idx.item())

            # for label,smishing,idx in zip(label,batch.smishing,batch.id):
            #   if label != smishing:
            #     error_list.append(idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_error += error.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_error, error_list

In [0]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in tqdm_notebook(range(N_EPOCHS),desc='EPOCH'):

    train_loss, train_acc ,train_error= train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_error, valid_error_list = evaluate(model, valid_iterator, criterion)
    
    print(f'\tTrain. Error: {train_error:.1f} || Loss: {train_loss:.3f} || Acc: {train_acc*100:.2f}%')
    print(f'\tVal. Error: {valid_error:.1f} || Loss: {valid_loss:.3f} || Acc: {valid_acc*100:.2f}%')
          
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_error_list = valid_error_list
        print(f'\t{epoch+1}. Best model is saved!!')
        torch.save(model.state_dict(), 'model.pt')

model.load_state_dict(torch.load('model.pt'))
test_loss, test_acc, test_error, test_error_list = evaluate(model, test_iterator, criterion)
print(f'\nTest. Error: {test_error:.1f} || Loss: {test_loss:.3f} || Acc: {test_acc*100:.2f}%')
print("\nbest_error_list ", best_error_list)
print("test_error_list ",test_error_list)

In [0]:
ids = []
labels = []
pred_list = []

model.load_state_dict(torch.load('model.pt'))
model.eval()

for data in test_data.:
  if len(data.text) < 5:
    data.text += ['<pad>'] * (5 - len(data.text))
  indexed = [TEXT.vocab.stoi[t] for t in data.text]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(0)
  pred = model(tensor)
  predict = torch.sigmoid(model(tensor))

  labels.append(torch.round(predict).item())
  ids.append(data.id)
  pred_list.append(predict.item())

result_data = pandas.DataFrame({'id' : ids, 'smishing':pred_list})
result_data.to_csv( str(N_EPOCHS) + 'over.csv',index=False)

print(result_data)
print(sum(labels))

In [0]:
pandas.set_option('display.max_rows', 20)
pandas.set_option('display.width',3000)
# text_data = pandas.read_csv('/content/drive/My Drive/Colab Notebooks/data/' + tokenizer + 'train.csv')
a = text_data.loc[text_data['id'].isin(best_error_list)]

a

In [0]:
from sklearn.manifold import TSNE
import plotly.graph_objs as go
import plotly.offline as py
## Get weights
conv_embds = model.embedding.weight.cpu().detach().numpy()
Visualizing_word_embedding = text_data
print(Visualizing_word_embedding.head())
aa = Visualizing_word_embedding.loc[Visualizing_word_embedding['id'].isin(best_error_list)].to_dict('records')[3:4]
word_list = list({word for row in aa for word in row['text'].split()})

word2index = TEXT.vocab.stoi

conv_embds = [conv_embds[word2index[word]] for word in word_list]
## Plotting function
def plot_words(data, start, stop, step):
    trace = go.Scatter(
        x = data[start:stop:step,0], 
        y = data[start:stop:step, 1],
        mode = 'markers',
        text= word_list[start:stop:step]
    )
    layout = dict(title= 't-SNE 1 vs t-SNE 2',
                  yaxis = dict(title='t-SNE 2'),
                  xaxis = dict(title='t-SNE 1'),
                  hovermode= 'closest')
    fig = dict(data = [trace], layout= layout)
    py.iplot(fig)
## Visualize words in two dimensions 
conv_tsne_embds = TSNE(n_components=2).fit_transform(conv_embds)
plot_words(conv_tsne_embds, 0, 2000, 1)

In [0]:
from sklearn.manifold import TSNE
import plotly.graph_objs as go
import plotly.offline as py
## Get weights
conv_embds = model.embedding.weight.cpu().detach().numpy()
Visualizing_word_embedding = text_data
aa = Visualizing_word_embedding.loc[Visualizing_word_embedding['id'].isin(best_error_list)].to_dict('records')
words = {word for row in aa for word in row['text'].split()}

word2index = TEXT.vocab.stoi

word_index = [conv_embds[word2index[word]] for word in words]
wegits = torch.empty((128,128))
for idx, word in enumerate(word_index):
  wegits[idx] = model.embedding.weight[word]


RuntimeError: ignored

In [0]:
writer = SummaryWriter('/content/drive/My Drive/Colab Notebooks/logs/embedding')

writer.add_embedding(model.embedding.weight,metadata=TEXT.vocab.itos)

In [0]:
# vocab = TEXT.vocab.itos
# len(model.embedding.weight)
weights_map = {}
for voc,weight in zip(vocab,model.embedding.weight):
  print(weight)
  break
  weights_map[voc] = 

dict_keys([])

In [0]:
# model.load_state_dict(torch.load('model.pt'))

# test_loss, test_acc = evaluate(model, test_iterator, criterion)

# print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

## Public Test

In [0]:
alltrain_data = TabularDataset('/content/drive/My Drive/Colab Notebooks/data/soy_train.csv','csv',data_fields,skip_header=True,)

dict_freq = 350
TEXT.build_vocab(alltrain_data,min_freq = dict_freq)
print(len(TEXT.vocab))

BATCH_SIZE = 1024

alltrain_iterator = BucketIterator(
    (alltrain_data),
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=False
    )

3530


In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 128
N_FILTERS = 128
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

optimizer = optim.Adam(model.parameters(),0.001)
criterion = nn.BCEWithLogitsLoss(reduction='sum')

model = model.to(device)
criterion = criterion.to(device)

In [0]:
torch.cuda.empty_cache()

In [0]:
N_EPOCHS = 1

best_train_loss = 0.086

for epoch in tqdm_notebook(range(N_EPOCHS),desc='EPOCH'):

    train_loss, train_acc ,train_error= train(model, alltrain_iterator, optimizer, criterion)
    
    print(f'\tTrain. Error: {train_error:.1f} || Loss: {train_loss:.3f} || Acc: {train_acc*100:.2f}%')
  
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        print('Best model is saved!!')
        torch.save(model.state_dict(), str(N_EPOCHS) + '_model.pt')
    if train_loss < 0.05:
      print('Overfitting')
      # torch.save(model.state_dict(), str(N_EPOCHS) + 'over_model.pt')
      break

    if train_error == 0.0:
      torch.save(model.state_dict(), N_EPOCHS + '_model.pt')


HBox(children=(IntProgress(value=0, description='EPOCH', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='TRAIN', max=290, style=ProgressStyle(description_width='initi…

	Train. Error: 6.0 || Loss: 0.063 || Acc: 100.00%
Best model is saved!!


In [0]:
test_fields = [('id',ID),('year',None),('text',TEXT)]
test_set = TabularDataset("/content/drive/My Drive/Colab Notebooks/data/" + tokenizer + "public_test.csv",format='csv',fields=test_fields, skip_header=True)

In [0]:
ids = []
labels = []
pred_list = []
print(N_EPOCHS)
model.load_state_dict(torch.load(str(N_EPOCHS) + '_model.pt'))
model.eval()

for data in test_set:
  if len(data.text) < 5:
    data.text += ['<pad>'] * (5 - len(data.text))
  indexed = [TEXT.vocab.stoi[t] for t in data.text]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(0)
  pred = model(tensor)
  predict = torch.sigmoid(model(tensor))

  labels.append(torch.round(predict).item())
  ids.append(data.id)
  pred_list.append(predict.item())

result_data = pandas.DataFrame({'id' : ids, 'smishing':pred_list})
result_data.to_csv(str(10) + 'result_data.csv',index=False)

print(result_data)
print(sum(labels))

1
          id      smishing
0     340000  3.526019e-12
1     340001  5.499422e-07
2     340002  9.148106e-11
3     340003  3.722385e-13
4     340004  1.230075e-12
...      ...           ...
1621  341621  4.544904e-13
1622  341622  1.806402e-12
1623  341623  5.453318e-15
1624  341624  3.837799e-07
1625  341625  1.120625e-10

[1626 rows x 2 columns]
94.0


In [0]:
high = pandas.read_csv('/content/high.csv')
present = pandas.read_csv('/content/1result_data.csv')

pre_label = []
high_label = []
for score,score2 in zip(high['smishing'],present['smishing']):
  if score >=0.5:
    high_label.append(1)
  else:
    high_label.append(0)

  if score2 >=0.5:
    pre_label.append(1)
  else:
    pre_label.append(0)

In [0]:
sum(high_label)

106

In [0]:
cnt = 0

for a, b in zip(high_label,pre_label):
  if a != b:
    if a == 1:
      print('a is 1')
    else:
      print('a is 0')
    cnt += 1

cnt

a is 1
a is 0
a is 1
a is 1
a is 1
a is 0
a is 1
a is 1


8

In [0]:
# high = pandas.read_csv('/content/high.csv')
# df = pandas.read_csv('/content/5.csv')

diff = 0

for a, b in zip(high['smishing'],result_data['smishing']):
  diff += abs(a-b)

diff

15.045782360517876

In [0]:
b = pandas.read_csv('/content/101result_data.csv')
b

Unnamed: 0,id,smishing
0,340000,5.326150e-11
1,340001,1.128968e-06
2,340002,4.772320e-10
3,340003,2.982476e-12
4,340004,1.590194e-11
...,...,...
1621,341621,6.316974e-12
1622,341622,1.327633e-11
1623,341623,5.014807e-14
1624,341624,4.458320e-06


In [0]:
a = pandas.read_csv('/content/Soy_df350_128128_E82.csv')

a

Unnamed: 0,id,smishing
0,340000,4.900000e-11
1,340001,1.540000e-06
2,340002,7.000000e-10
3,340003,1.660000e-12
4,340004,2.060000e-11
...,...,...
1621,341621,4.280000e-12
1622,341622,1.540000e-11
1623,341623,1.120000e-13
1624,341624,2.050000e-05


In [0]:
for col in ['Unnamed: 2','Unnamed: 3','Unnamed: 4','Unnamed: 5','Unnamed: 6','Unnamed: 7']:
  del a[col]

In [0]:
a.to_csv('/content/Soy_df350_128128_E82.csv',index=False)