- 형태소 분석기는 품사를 태깅(tagger)해주는 라이브러리

# 데이터 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 재현성 구현

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = "/content/drive/MyDrive/data/"
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# nltk(Natural Language Toolkit)
- python에서 가장 오래되고 유명한 자연어 처리 라이브러리

In [3]:
import nltk
nltk.download('punkt_tab') # 토크나이저 모델 > 함수 or 클래스 실행가능
nltk.download('stopwords') # 불용어 리스트
nltk.download('averaged_perceptron_tagger_eng') # 품사정보

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [4]:
train = pd.read_csv(f'{DATA_PATH}train_news.csv')
test = pd.read_csv(f'{DATA_PATH}test_news.csv')

train.shape, test.shape

((89320, 3), (38280, 2))

In [5]:
train.head()

Unnamed: 0,title,desc,target
0,Sudan Postpones Decision to Expel Oxfam and Sa...,Sudan has decided to postpone a decision to ex...,0
1,Coming Soon: Mobile TV,Cell phone manufacturers are teaming up to bri...,2
2,Experts warn of Internet flu vaccine scam,Although the United States is experiencing a s...,3
3,Bollor ups Havas stake to 20.2,Corporate raider Vincent Bollor said yesterday...,2
4,"Hurricane Ivan Kills 20 in Grenada, Heads West...",Reuters - Hurricane Ivan killed at least 20 pe...,0


In [6]:
text = train['desc'].loc[0]
text

'Sudan has decided to postpone a decision to expel the heads of two British aid agencies - Oxfam and Save the Children - citing administrative difficulties and humanitarian grounds.'

## 토큰화(tokenize)

In [7]:
from nltk.tokenize import word_tokenize
word_tokenize(text)

['Sudan',
 'has',
 'decided',
 'to',
 'postpone',
 'a',
 'decision',
 'to',
 'expel',
 'the',
 'heads',
 'of',
 'two',
 'British',
 'aid',
 'agencies',
 '-',
 'Oxfam',
 'and',
 'Save',
 'the',
 'Children',
 '-',
 'citing',
 'administrative',
 'difficulties',
 'and',
 'humanitarian',
 'grounds',
 '.']

In [10]:
type(word_tokenize(text))

list

## 불용어(stopwords)

In [8]:
from nltk.corpus import stopwords
stopwords.words('english') # 영어 불용어: 노이즈 제거

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
len(stopwords.words('english'))

179

## 품사 태깅(Part-of-Speech Tagging)

In [11]:
tokens = word_tokenize(text)
nltk.tag.pos_tag(tokens) # 토큰화된 객체 필요

[('Sudan', 'NNP'),
 ('has', 'VBZ'),
 ('decided', 'VBN'),
 ('to', 'TO'),
 ('postpone', 'VB'),
 ('a', 'DT'),
 ('decision', 'NN'),
 ('to', 'TO'),
 ('expel', 'VB'),
 ('the', 'DT'),
 ('heads', 'NNS'),
 ('of', 'IN'),
 ('two', 'CD'),
 ('British', 'JJ'),
 ('aid', 'NN'),
 ('agencies', 'NNS'),
 ('-', ':'),
 ('Oxfam', 'NNP'),
 ('and', 'CC'),
 ('Save', 'NNP'),
 ('the', 'DT'),
 ('Children', 'NNP'),
 ('-', ':'),
 ('citing', 'VBG'),
 ('administrative', 'JJ'),
 ('difficulties', 'NNS'),
 ('and', 'CC'),
 ('humanitarian', 'JJ'),
 ('grounds', 'NNS'),
 ('.', '.')]

- N or V or J로 시작하는 품사들의 단어만 다시 새로운 리스트로 담기

In [12]:
# 품사 태깅
pos_tags = nltk.pos_tag(tokens)

# N, V, J로 시작하는 품사의 단어만 필터링
filtered_words = [word for word, pos in pos_tags if pos.startswith(('N', 'V', 'J'))]
filtered_words

['Sudan',
 'has',
 'decided',
 'postpone',
 'decision',
 'expel',
 'heads',
 'British',
 'aid',
 'agencies',
 'Oxfam',
 'Save',
 'Children',
 'citing',
 'administrative',
 'difficulties',
 'humanitarian',
 'grounds']

- 정규표현식
  - \w: 알파벳, 숫자, _

In [13]:
train['clean'] = train['desc'].str.replace('[^\w] +', '', regex = True).str.lower() # 특정 문자 매칭 시, 삭제
test['clean'] = test['desc'].str.replace('[^\w] +', '', regex = True).str.lower()

train['clean']

Unnamed: 0,clean
0,sudan has decided to postpone a decision to ex...
1,cell phone manufacturers are teaming up to bri...
2,although the united states is experiencing a s...
3,corporate raider vincent bollor said yesterday...
4,reuters hurricane ivan killed at least 20 peop...
...,...
89315,vodafone uk has introduced a service that will...
89316,pullman last weekin studying usc game filmcoug...
89317,the former chairman of us software firm comput...
89318,an australian detainee being arraigned wednesd...


## Lemmatiztion(표제어 추출)
- 사전에 등재된 형태(lemma)로 바꾸는 것
- lemmatization는 문장에서 단어의 원형을 추출하는 과정
    - is, are -> be
    - having -> have


In [14]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

- .lemmatize('단어')
  - 해당 단어의 원형

In [15]:
wnl = WordNetLemmatizer() # 표제어 객체 생성
wnl.lemmatize('dogs')

'dog'

In [16]:
wnl = WordNetLemmatizer()
wnl.lemmatize('us')

'u'

In [18]:
wnl = WordNetLemmatizer()
wnl.lemmatize('is') # 오류도 많음

'is'

## spacy
- 딥러닝 기반의 형태소 분석 라이브러리

In [19]:
import spacy

In [20]:
nlp = spacy.load('en_core_web_sm') # 영어모델 load
nlp

<spacy.lang.en.English at 0x78755347d7b0>

In [21]:
text = train['clean'].iloc[0]
text

'sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.'

In [24]:
doc = nlp(text) # 토큰 클래스 객체, iterable
display(type(doc)) # 데이터타입: .doc.Doc > 문자열 형태
doc # 품사 태깅 결과

spacy.tokens.doc.Doc

sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.

In [25]:
len(doc)

28

In [26]:
doc[0]

sudan

In [30]:
type(doc[0]) # 토큰 객체, 데이터타입: .token.Token

spacy.tokens.token.Token

In [31]:
doc[0].text # 토큰화된 텍스트의 원문

'sudan'

In [32]:
doc[0].lemma_ # 표제어: 단어 원형

'sudan'

In [33]:
doc[0].tag_ # 품사

'NNP'

In [34]:
doc[0].is_alpha # 알파벳 여부

True

In [36]:
doc[0].is_stop # 불용어 여부, sudan > 불용어 x

False

In [37]:
cols = ['단어', '표제어', '품사','알파벳여부','불용어여부']
data = [ [token.text, token.lemma_, token.tag_, token.is_alpha, token.is_stop] for token in doc]
data

[['sudan', 'sudan', 'NNP', True, False],
 ['has', 'have', 'VBZ', True, True],
 ['decided', 'decide', 'VBN', True, False],
 ['to', 'to', 'TO', True, True],
 ['postpone', 'postpone', 'VB', True, False],
 ['a', 'a', 'DT', True, True],
 ['decision', 'decision', 'NN', True, False],
 ['to', 'to', 'TO', True, True],
 ['expel', 'expel', 'VB', True, False],
 ['the', 'the', 'DT', True, True],
 ['heads', 'head', 'NNS', True, False],
 ['of', 'of', 'IN', True, True],
 ['two', 'two', 'CD', True, True],
 ['british', 'british', 'JJ', True, False],
 ['aid', 'aid', 'NN', True, False],
 ['agencies', 'agency', 'NNS', True, False],
 ['oxfam', 'oxfam', 'NNS', True, False],
 ['and', 'and', 'CC', True, True],
 ['save', 'save', 'VB', True, False],
 ['the', 'the', 'DT', True, True],
 ['children', 'child', 'NNS', True, False],
 ['citing', 'cite', 'VBG', True, False],
 ['administrative', 'administrative', 'JJ', True, False],
 ['difficulties', 'difficulty', 'NNS', True, False],
 ['and', 'and', 'CC', True, True],
 

In [38]:
pd.DataFrame(data, columns = cols)

Unnamed: 0,단어,표제어,품사,알파벳여부,불용어여부
0,sudan,sudan,NNP,True,False
1,has,have,VBZ,True,True
2,decided,decide,VBN,True,False
3,to,to,TO,True,True
4,postpone,postpone,VB,True,False
5,a,a,DT,True,True
6,decision,decision,NN,True,False
7,to,to,TO,True,True
8,expel,expel,VB,True,False
9,the,the,DT,True,True


In [39]:
doc = nlp.tokenizer(text)
doc

sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.

In [40]:
doc[1].tag_ # 품사 정보추출 불가

''

In [41]:
doc[1].lemma_ # 표제어 추출 불가

''

- 품사 N,V,J,R로 시작하는 토큰들만 토큰화하기
  - - nlp.tokenizer 사용

In [42]:
train_list = []

for text in tqdm(train['clean']):
  doc = nlp.tokenizer(text)
  # tmp = list(doc)
  tmp = [ t for t in doc if not t.is_alpha]
  train_list.append(tmp)

  0%|          | 0/89320 [00:00<?, ?it/s]

In [43]:
train_list

[[.],
 [.],
 [falldon't, 're, .],
 [20.2, #, 39;s, .],
 [20, people\as, grenadawhere\looting, swept\through, .],
 [1, -, 9, titans15, -, 12moving, 3, -, 0, 35, .],
 [-, #, 39lives, #, 39demands, .],
 [-, -, .],
 [(, filed26/11/2004)increasing, .],
 [#, 39;s, 19, -, -],
 [ , (, as2,000, (, 1,250, miles)a, 's, .],
 [.],
 [3, -, 1, .],
 [-, 30, -],
 [(, #, 39;s, (, ), .],
 [-you, .],
 [400, .],
 [65, .],
 [-at, .],
 [", ...],
 ['s, -, 's, \$10, .],
 [28, o'neal, 19, 100, -, 94, .],
 [-, .],
 [-, -, #, 39;s, .],
 [ ,
  (,
  jan8,
  \$1.1,
  corp&lt;a,
  href="http://www.reuters.co.uk,
  /,
  financequotelookup.jhtml?ticker,
  =,
  msft.o,
  =,
  =,
  =,
  news"&gt;msft.o&lt;/a&gtafter,
  extensionplaintiff'sattorneys,
  .],
 [ , (, 's, -, 5, -, 4, .],
 [2003, .],
 [.],
 [-, .],
 [otsegominn.isquot;crunch, #, 39housecereal, 49, 7, -, -, 6, -, -, .],
 [-a, -, 1, 2, 57, .],
 [-, 20, .],
 [3.5, 2005but, 175,000the],
 [(, #, 39;s, #, 39;t, #, 39;s, -, .],
 [.],
 [100, .],
 [corp.which, 11, .],


- nltk 활용해 모든 문서에 대해 불용어 제거와 명사+동사+형용사+부사만 토큰화해서 train_list에 담기
- test_list 동일 작업

In [44]:
train_list = []
stop_words = stopwords.words('english') # 불용어

for text in tqdm(train['clean']):
  tokens = word_tokenize(text) # 토큰화
  tokens = nltk.tag.pos_tag(tokens) # 품사 태깅
  tokens = [ t for t,p in tokens if t not in stop_words and p[0] in "NVJR"] # 특정품사
  # print(tokens)
  train_list.append(' '.join(tokens))


  0%|          | 0/89320 [00:00<?, ?it/s]

In [45]:
test_list = []
stop_words = stopwords.words('english') # 불용어

for text in tqdm(test['clean']):
  tokens = word_tokenize(text) # 토큰화
  tokens = nltk.tag.pos_tag(tokens) # 품사 태깅
  tokens = [ t for t,p in tokens if t not in stop_words and p[0] in "NVJR"] # 특정품사
  # print(tokens)
  test_list.append(' '.join(tokens))


  0%|          | 0/38280 [00:00<?, ?it/s]

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(max_features=500)
train_data = vec.fit_transform(train_list).A

In [47]:
train_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])