In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

import torch
from torchtext import data
import torch.nn as nn

In [5]:
# konlpy 설치 경로로 이동
os.chdir('/usr/local/lib/python3.10/dist-packages/konlpy/java')
os.getcwd()

!pwd

/usr/local/lib/python3.10/dist-packages/konlpy/java


In [6]:
import shutil
# 따로 등록해논 jar파일 교체
filename = 'open-korean-text-2.1.0.jar'
src = '/content/drive/MyDrive/data/'
dir = '/usr/local/lib/python3.10/dist-packages/konlpy/java/'
shutil.copy(src + filename, dir + filename)

'/usr/local/lib/python3.10/dist-packages/konlpy/java/open-korean-text-2.1.0.jar'

In [7]:
path = '/content/drive/MyDrive/data/programmers_ai/yujatea/comp_2/'

In [8]:
# Import Data
train = pd.read_csv(os.path.join(path, 'train.csv'))
test = pd.read_csv(os.path.join(path, 'test.csv'))

In [9]:
train.head()

Unnamed: 0,ID,text,label
0,0,유소영비호감 성형아줌마,1
1,1,나오지마라 썅,3
2,2,식상하고 지긋지긋했는데 잘 끝나네 오예 소리벗고 빤스질러~~!!!,6
3,3,성희롱 당할 얼굴이 아닌데?ㅋㅋㅋ,5
4,4,"""끝까지 해보자~쪽파리 원숭이 자한 쓰레기당""",0


In [10]:
test.head()

Unnamed: 0,ID,text,label
0,0,"""솔직히 우리나라 청년들도 불쌍하고 아재들도 불쌍하고 노인들도 불쌍하다. 나라가 참""",
1,1,그만 보고싶네요 .늙은애들은.,
2,2,"""더러운 개신교벌레 새퀴""",
3,3,근데전태수씨 사망이유가뭔가요그어떤기사에도 나오질않네요,
4,4,"""태극기부대와 틀닭바퀴충들에게 순시리는 국모다. ㅉㅉ""",


In [11]:
train.drop(columns=['ID'], inplace=True)
test.drop(columns=['ID'], inplace=True)

In [12]:
vocab = pd.read_csv(os.path.join(path, 'vocab.csv'))

In [13]:
def normalize_text (text):
    # 사용자 정의 지정 표현 수동으로 변환시켜주기
    for key, replace in zip(vocab['key'].values, vocab['replace'].values):
      try:
        text = text.str.replace(key, replace)
      except:
        continue
    # ') 깨문', '병@(#%^신' 처리
    text = text.str.replace(r'\) 깨문', '대깨문')
    text = text.str.replace(r'병[@#%^]?\(*\)?.*?신', "병신")
    text = text.str.replace(r"[^가-힣ㄱ-ㅎㅏ-ㅣ()!?\'\`\"]", " ")
    text = text.str.replace("\s{2,}", " ")
    return text

In [14]:
train["text"]=normalize_text(train["text"])
test['text'] = normalize_text(test['text'])

  text = text.str.replace(key, replace)
  text = text.str.replace(r'\) 깨문', '대깨문')
  text = text.str.replace(r'병[@#%^]?\(*\)?.*?신', "병신")
  text = text.str.replace(r"[^가-힣ㄱ-ㅎㅏ-ㅣ()!?\'\`\"]", " ")
  text = text.str.replace("\s{2,}", " ")


In [15]:
train['text'].head()

0                           유소영비호감 성형아줌마
1                                나오지마라 썅
2    식상하고 지긋지긋했는데 잘 끝나네 오예 소리벗고 빤스질러 !!!
3                     성희롱 당할 얼굴이 아닌데?ㅋㅋㅋ
4            "끝까지 해보자 쪽바리 원숭이 자유한국당 쓰레기"
Name: text, dtype: object

In [16]:
test['text'].head()

0    "솔직히 우리나라 청년들도 불쌍하고 아재들도 불쌍하고 노인들도 불쌍하다 나라가 참"
1                                   그만 보고싶네요 늙은애들은 
2                                    "더러운 개신교벌레 새끼"
3                     근데전태수씨 사망이유가뭔가요그어떤기사에도 나오질않네요
4                     "태극기부대와 틀닭바퀴충들에게 순시리는 국모다 ㅉㅉ"
Name: text, dtype: object

In [17]:
train_df, valid_df = train_test_split(train, test_size=0.2, stratify=train['label'])

In [18]:
train_df.head()

Unnamed: 0,text,label
60732,기레기빨갱이 새끼들 기사들에 맨날 낚이니 계속 쓰는거겠지,3
19268,"""중국은 무슨 온갖 바이러스를 다 껴안고 있냐 """,6
53994,"""진짜 신천지통합당 개틀딱들 못한건 대통령탓 잘한건 남탓ㅋㅋ 논리한번 오지네ㅋㅋㅋ""",2
44881,야 이 계집들아 꼽냐? ㅋㅋㅋ 계집주제에 주는데로 받아들여라 ㅋㅋㅋ,5
43740,못생겼다 이런애가 어케 연예인이냐,1


In [19]:
SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [20]:
from konlpy.tag import Okt

okt = Okt()

In [21]:
TEXT = data.Field(tokenize = okt.morphs, include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [22]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# 데이터프레임으로 customDataset 만들기

class CustomDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.label if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [23]:
fields = [('text',TEXT), ('label',LABEL)]

train_ds, val_ds = CustomDataset.splits(fields, train_df=train_df, val_df=valid_df)

In [24]:
print(vars(train_ds[10]))
print(type(train_ds[10]))

{'text': ['아이유', '워', '너', '원', '이', '탑', '이네', 'ㅎㅎ'], 'label': 6}
<class 'torchtext.data.example.Example'>


In [25]:
TEXT.build_vocab(train_ds, min_freq=1)

In [26]:
LABEL.build_vocab(train_ds)

In [27]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [28]:
print(len(TEXT.vocab))

61821


In [29]:
TEXT.vocab.stoi[TEXT.pad_token]

1

In [30]:
# Hyperparameters
num_epochs = 25
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 7
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding

In [41]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        # text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        # embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))
                
        #hidden = [batch size, hid dim * num directions]
            
        return output

In [42]:
#creating instance of our LSTM_net class

model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [43]:
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[-1.4401, -0.9175, -1.4016,  ..., -0.5860, -1.1422, -0.2999],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1668, -1.0666, -0.3621,  ...,  0.5194,  0.3634,  0.8123],
        ...,
        [ 0.2520,  0.0369,  1.1507,  ...,  0.8470,  1.1519, -0.6965],
        [-0.2630,  1.1507, -0.0878,  ...,  0.4679,  0.1099, -0.9821],
        [-0.7872,  0.1359,  0.7154,  ..., -0.1812, -0.8734, -1.1043]])


In [44]:
model.to(device) #CNN to GPU

LSTM_net(
  (embedding): Embedding(61821, 200, padding_idx=1)
  (rnn): LSTM(200, 256, num_layers=7, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [35]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
batch = next(iter(train_iterator))

In [37]:
for num, item in enumerate(batch):
  print(num, item)
  break

0 (tensor([[    2, 11022,    58,     2,     2,  2732,   377,     2,   883,     2,
          4909,    90,    33,  5251,     2,     2,   166,     2,     2,     2,
             2, 16284,     2,  6151,     2,     2,  1058,   166,     2,     2,
           122,   226,  2702,     2, 15659,     2,     2,     4,     2,     2,
             2,   197,    24,    35, 10605,  4803,     2,     2,     2,     2,
           882,     2,     2,  4638,     2,     4,     2,    14,  1190,     2,
             2,     2,  2627,     2],
        [   90,   720,   813,   201,    34,   122, 11493,   950,    31,  1424,
          1300,   214,  1593,  7222,    27,  2428,    19,  2646,   702, 53469,
            84,    10,   623,    75,    46,  2100,    39,    17,  3394, 13631,
          2496,  1284,  2059,  1609,    15,   888,    34,   467,   247,   235,
            42,   965,  1009,    88,  7348,  8812,  9835,    91,    91,    84,
           476,  2409,   785,     9,    77,    50,   181,     5,    38,   158,
         54

In [45]:
def train(model, iterator) :
    epoch_loss  = 0.
    epoch_acc  = 0.
    model.train()

    for batch in iterator:
      text, text_lengths = batch.text
      text_lengths = torch.tensor(text_lengths, dtype = torch.int64,device =device)
      optimizer.zero_grad()
      predictions = model(text, text_lengths)
      print(predictions)
      break
      # print(predictions)
      # break
    #   loss = criterion(predictions, batch.label)
    #   epoch_loss  += loss.item()
    #   epoch_acc  += torch.sum( predictions == batch.label.data).item()

    #   loss.backward()
    #   optimizer.step()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [46]:
train(model, train_iterator)

  text_lengths = torch.tensor(text_lengths, dtype = torch.int64,device =device)


RuntimeError: ignored