# Chapter 7 Sentiment Analysis
감정분석 : Gensim을 활용한 한글 긍/부정 분석

https://www.lucypark.kr/slides/2015-pyconkr/#1 

In [1]:
# 문장 시퀀스 뒤에 감정을 결정하는 과정으로 정의한다
# Speaker 혹은 Text사고를 표현하는 사람의 감정을 판단하는데 사용

    # 1 감정분석 소개
    # 2 NER을 사용한 감정분석
    # 3 기계학습을 활용한 감정분석
    # 4 NER 시스템의 평가

<br></br>
## 1 Introduction
감정분석 소개

In [2]:
# target : 이진분류(긍정, 부정), 멀티분류 (긍정, 부정, 중립)
# 감정과 토픽 마이닝을 결합한 '토픽-감정분석'을 시행한다

In [3]:
# 감정분석 : lexicon (어휘목록) 을 사용해서 수행할 수 있다
# 1 labMT (10,000단어 분석)
# 2 Warringer (13,915단어 분석)
# 3 OpinionFinder's Subjectivity Lexic (8221단어 분석)
# 4 ANEW  (1034단어 분석)    : Affective Norms for English Words
# 5 AFINN (2477단어 분석)    : Finn Arup Nielson 에 의한 분류
# 6 Balance Affective (277 단어) : 1(긍정), 2(부정), 3(불안정), 4(중립)
# 7 BAWL  (2200단어 분석)    : Berlin Affective Word List Reloaded
# 8 BFAN  (210단어로 구성)    : Bilingual Finnish Affective Norms
# 9 CDGE  : Compass DeRose Guide to Emotion Words
# 10 DAL  : Dictionary of Affect in Language
# 11 WDAL : Whissell's Dictionary of Affect in Language
# 등등...

<br></br>
## 2 영화 리뷰의 감정분석
sentiment analysis for movie review

http://www.nltk.org/book/ch06.html

### 01 학습을 위한 Train 데이터 생성하기
nltk의 movie_review 데이터를 활용

In [1]:
from nltk.corpus import movie_reviews
movie_reviews.categories()

['neg', 'pos']

In [2]:
movie_reviews.fileids(movie_reviews.categories()[0])[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [3]:
! cat /home/markbaum/nltk_data/corpora/movie_reviews/neg/cv435_24355.txt

a couple of criminals ( mario van peebles and loretta devine ) move into a rich family's house in hopes of conning them out of their jewels . 
however , someone else steals the jewels before they are able to get to them . 
writer mario van peebles delivers a clever script with several unexpected plot twists , but director mario van peebles undermines his own high points with haphazard camera work , editing and pacing . 
it felt as though the film should have been wrapping up at the hour mark , but alas there was still 35 more minutes to go . 
daniel baldwin ( i can't believe i'm about to type this ) gives the best performance in the film , outshining the other talented members of the cast . 
[r] 


In [4]:
docs = [(list(movie_reviews.words(fid)), cat) 
        for cat in movie_reviews.categories()   # ['neg', 'pos']
        for fid in movie_reviews.fileids(cat)]  # 'neg/cv000_29416.txt', ....

print('docs        :', len(docs))
print('docs[13][1] :', docs[13][1])
print('docs[13][0] :', len(docs[13][0]), docs[13][0][::80])

docs        : 2000
docs[13][1] : neg
docs[13][0] : 1144 ['a', 'all', 'fate', 'like', 'intersperesed', 'the', '.', 'who', 'like', 'characters', '.', 'looks', 'us', 'obviously', 'give']


In [6]:
# 긍/부정 분류된 textdml token을 뒤섞어서 1개로 합친다
import nltk, random
random.shuffle(docs)
all_tokens = nltk.FreqDist(x.lower()    for  x  in  movie_reviews.words())
print('영화리뷰 token의 총 합 :', len(all_tokens.keys()))

# 39,768개 중  2,000개  Train 데이터 추출
token_features = list(all_tokens.keys())[:2000]
token_features[::800]

영화리뷰 token의 총 합 : 39768


['plot', 'seymour', 'strives']

In [None]:
# 중간결과
# doc            : neg, pos target을 표시한, text의 token list를 수집
# token_features : all_tokens.keys())[:2000]의    token list를 수집

### 02 all.tokens [:2000]표본을 추출하여 학습모델을 생성
'pos/cv957_8737.txt' 리뷰가 긍정/ 부정 여부를 

<strong>나이브 베이즈 분류기</strong>를 활용하여 판단

In [7]:
token_file = 'pos/cv957_8737.txt' 
print(token_file + 's Token :', len(movie_reviews.words(token_file)))

# 리뷰 데이터의 Token이 표본DB 포함여부 판단
def doc_features(docs):
    doc_words = set(docs)  # 리뷰 txt의 token을 집합으로 추출
    features = {}
    
    # all_tokens[:2000] 표본에 대해, 리뷰 token의 포함여부를 판단 
    for i, word in enumerate(token_features):
        features['contains(%s)' % word] = (word in doc_words)
        if i == 5 : break
    return features

doc_features(movie_reviews.words( token_file ))

pos/cv957_8737.txts Token : 597


{'contains(:)': True,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(plot)': True,
 'contains(teen)': False,
 'contains(two)': True}

In [9]:
# docs (긍/부정리뷰의 token 모음) 데이터의 
# Train(90%), Test(10%)로 나눈다 
feature_sets = [(doc_features(d), c) for (d,c) in docs]
train_sets, test_sets = feature_sets[100:], feature_sets[:100]

# 나이브 베이즈 분류기로 정확도를 판단
classifiers = nltk.NaiveBayesClassifier.train(train_sets)
print('Accuracy :', nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features() 

Accuracy : 0.6
Most Informative Features
       contains(couples) = True              neg : pos    =      1.5 : 1.0
          contains(plot) = True              neg : pos    =      1.4 : 1.0
          contains(plot) = False             pos : neg    =      1.3 : 1.0
          contains(teen) = True              neg : pos    =      1.2 : 1.0
             contains(:) = False             pos : neg    =      1.0 : 1.0
             contains(:) = True              neg : pos    =      1.0 : 1.0
           contains(two) = False             neg : pos    =      1.0 : 1.0
           contains(two) = True              pos : neg    =      1.0 : 1.0
          contains(teen) = False             pos : neg    =      1.0 : 1.0
       contains(couples) = False             pos : neg    =      1.0 : 1.0


In [11]:
# 결과
# 정확도가 60%를 살짝 넘김.. 그리고 token이 많다고 성능이 높진 않음
# 책과는 결과가 다르다.... GitHub의 결과도 위와 동일.....
# neg : pos 분류차이값이 많이 나던데, 여기선 별로 차이가 덜하다

### 02 Document Classification

http://www.nltk.org/book/ch06.html

In [12]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [13]:
# 아래 2줄을 함수에 넣으면 loop에 걸리더라.. (일부러 밖에서 처리를 해야만 작동되더라..)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document, word_features = word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

document_features(movie_reviews.words('pos/cv957_8737.txt'), word_features[:10])

{'contains(:)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(party)': False,
 'contains(plot)': True,
 'contains(teen)': False,
 'contains(to)': True,
 'contains(two)': True}

In [14]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.78
Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.4 : 1.0
        contains(welles) = True              neg : pos    =      8.4 : 1.0
        contains(sexist) = True              neg : pos    =      7.7 : 1.0
    contains(schumacher) = True              neg : pos    =      7.5 : 1.0
        contains(shoddy) = True              neg : pos    =      7.1 : 1.0


In [15]:
# unimaginative : 상상력이 없는
# shoddy        : 겉만 번지르르한 싸구려,  재생 털실
# atrocious     : 극악 무도한
# 추출한 단어 자체가 선명해서, 결과도 분명하고 정확도도 높은 결과를 도출한다

<br></br>
## 3 텍스트 전처리
text를 
1. 단어(word)
2. 표제어/주제(lemma)
3. 태그(tag) 

를 포함한 데이터로 추출한다

In [16]:
# text --> 문장 --> token
import nltk
class Splitter(object):
    
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        sentences = self.nltk_splitter.tokenize(text)             # txt --> 문장
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) # 문장 --> token
                               for sent in sentences]
        return tokenized_sentences

In [17]:
# Txt --> 문장 --> token
text = """Why are you looking disappointed. 
We will go to restaurant for dinner."""

splitter = Splitter()
splitted_sentences = splitter.split(text)
splitted_sentences

[['Why', 'are', 'you', 'looking', 'disappointed', '.'],
 ['We', 'will', 'go', 'to', 'restaurant', 'for', 'dinner', '.']]

In [18]:
# 3개 성분을 추출 후, 묶어서 정리
# 단어(word/token), 표제어/주제(lemme), 태그(tag)
class POSTagger(object):
    def __init__(self): 
        pass
    def pos_tag(self, sentences):
        # 입력 sentence 에 token을 첨부한다
        pos = [nltk.pos_tag(sentence)       for sentence in sentences]

        # 첨부된 token을 tuple 형식으로 묶어서, list에 정리한다
        pos = [[(word, [postag]) 
                for (word, postag) in sentence] 
                for sentence in pos]
        return pos

postagger = POSTagger()
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
pos_tagged_sentences

[[('Why', ['WRB']),
  ('are', ['VBP']),
  ('you', ['PRP']),
  ('looking', ['VBG']),
  ('disappointed', ['VBN']),
  ('.', ['.'])],
 [('We', ['PRP']),
  ('will', ['MD']),
  ('go', ['VB']),
  ('to', ['TO']),
  ('restaurant', ['VB']),
  ('for', ['IN']),
  ('dinner', ['NN']),
  ('.', ['.'])]]

In [10]:
# https://github.com/sujithvm/nlp-modules/blob/master/sentiment%20analysis/sentiment_analyzer.py
# 폴더내 파일들의 text에 대해 긍정/ 부정 tag 작업 시행
# dictionary를 활용해서 tag를 생성
import nltk, yaml, sys, re ,os 
class DictionaryTagger(object):

    def __init__(self, dictionary_paths):
        # 해당 폴더에 있는 text 파일 목록을 수집한다
        files = [open(path, 'r')     for path in dictionary_paths]      
        map(lambda x: x.close(),     files)

        # http://egloos.zum.com/sweeper/v/3042272
        # yaml : 텍스트 파일의 내용을 파싱한다 
        dictionaries = [yaml.load(dict_file) 
                                     for dict_file in files]

        self.dictionary ,self.max_key_size = {}, 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                # key 값으로 존재하는 경우, dict에 내용을 추가한다
                if key in self.dictionary:             
                    self.dictionary[key].extend(curr_dict[key])                    

                # key 값이 없는경우, 새로운 dictionary 목록을 생성한다
                else:
                    self.dictionary[key] = curr_dict[key]                   
                    self.max_key_size = max(self.max_key_size, len(key))

    # 문장의 token의, tag를 list를 추출한다            
    def tag(self, pos_tagged_sentences):
        return [self.tag_sentence(sentence) 
                          for sentence in pos_tagged_sentences]

    # token의 tag추출 함수 (주제도 함께 추출할지를 옵션으로 설정)
    def tag_sentence(self, sentence, tag_with_lemmas = False):
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0: 
            self.max_key_size = N
        i = 0
        while (i < N): # 문장 내 token의 갯수만큼 반복한다
            j = min(i + self.max_key_size, N)
            
            tagged = False
            while (j > i):
                expression_form  = ' '.join([word[0]   for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1]   for word in sentence[i:j]]).lower()

                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form

                if literal in self.dictionary:
                    is_single_token = j - i == 1  # j-1 이 1이 맞는지를  True/ False 로 입력
                    original_position = i         # 시작값
                    i = j
                    taggings = [tag     for tag in self.dictionary[literal]] # value 추출
                    tagged_expression = (expression_form, expression_lemma, taggings)

                    if is_single_token:  # 위 판단이 True인 경우 
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging) # tag 값을 덧붙인다
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1   # j 값을 1씩 줄이면서 위의 작업에 적합한 조건으로 조절을 해 나아간다 

            # tagged 값이 기존에 없는경우, 새로 추가한다
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

In [20]:
# 긍/부정 정리된 dictionary 목록의 표현을 Counting
def value_of(sentiment):
    if sentiment == 'positive': return 1
    if sentiment == 'negative': return -1
    return 0

def sentiment_score(review):
    return sum([sentence_score(sentence, None, 0.0) for sentence in review])

## 4 NER를 사용한 감정분석 -  176 p (on_tology)
개체명(고유명사) 인식 Named-entity recognition (NER) - 감정식별을 위한 전처리 

    token의 개체명을 별도의 기준으로 식별 후, 클래스로 분류하는 과정으로 
    히든마르코프, 최대엔트로피 마르코프, SVM, 의사결정나무 등을 활용한다
    개체명으로 인식되면, 감정분석에 기여하지 않으므로, 제외한 나머지들로 감정분석을 수행

## 5 기계학습을 사용한 감정분석
Twitter text 데이터에 대한 통계, 자동화, 기계학습 분류기

출처 : http://www.nltk.org/book/ch06.html

data : https://github.com/ravikiranj/twitter-sentiment-analyzer

<img src = "http://www.nltk.org/images/supervised-classification.png" align='left' width = '500'>

### 01 nltk의 모듈
nltk.sentiment.sentiment_analyzer

In [21]:
# nltk.sentiment.sentiment_analyzer() 는 기계학습 기반의 감정분석 모듈
import nltk.sentiment.sentiment_analyzer
from nltk.sentiment import SentimentAnalyzer

### 02 트위터 예제 분석
https://gist.github.com/ravikiranj/2639121

In [22]:
# 중복되는 문자를 단일로 처리
def replaceTwoOrMore(s):
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

In [23]:
# 불용어 목록을 파일에서 읽어온다
def getStopWordList(stopWordListFileName):
    stopWords = []   # 불용어 목록 파일의 text를 '공백'을 기준으로 token생성 뒤, list로 출력
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

In [24]:
# 여러번 반복된 단어들의 전처리
def getFeatureVector(tweet):
    featureVector = []
    words = tweet.split()       # 공백을 기분으로 token 단어를 생성
    for w in words:             
        w = replaceTwoOrMore(w) # 2번 이상 반복된 단어를 단일로 전처리 (사용자함수)
        w = w.strip('\'"?,.')   # 문장부호를 제거
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w) # 알파벳 여부 확인

    # 해당 단어가 숫자/영어가 아니거나, 불용어에 해당하면 무시
    if(w in stopWords or val is None):
        pass # continue 는 '반복문'에서 가능
    else:
        featureVector.append(w.lower()) # 소문자로 변환한다
    return featureVector 

In [25]:
# https://gist.github.com/ravikiranj/2639031
# 트위터 데이터 전처리 함수
def processTweet(tweet):
    tweet = tweet.lower()  # 데이터를 소문자로 바꾼다

    # www.* 또는 https?://* ==> URL 로 변환
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', tweet)
    tweet = re.sub('@[^\s]+','AT_USER',tweet)   # @username ==> AT_USER로 변환
    tweet = re.sub('[\s]+', ' ', tweet)         # 공백을 제거   
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # 해시태그 '#' 제거
    tweet = tweet.strip('\'"')                  # triming
    return tweet

In [26]:
# 트위터 데이터가 .txt인 경우
import re, csv
stopWords = []

# Tweets 데이터는 1줄씩 모든 프로세스를 진행한다
fp = open('data/sampleTweets.txt', 'r') # Data text 파일
line = fp.readline()
line

'@PrincessSuperC Hey Cici sweetheart! Just wanted to let u know I luv u! OH! and will the mixtape drop soon? FANTASY RIDE MAY 5TH!!!!  \n'

In [27]:
st = open('data/stopwords.txt', 'r')    # Stop word text 파일 
stopWords = getStopWordList('./data/stopwords.txt')
print(len(stopWords), stopWords[:10])

426 ['AT_USER', 'URL', '', 'a', 'about', 'above', 'across', 'after', 'again', 'against']


In [28]:
while line:  # 트위터 데이터 1줄씩 처리
    processedTweet = processTweet(line)
    featureVector = getFeatureVector(processedTweet)
    print(featureVector)
    line = fp.readline()
fp.close() 

[]
[]
[]
['twitter']
[]
['makeitcount']
['sigh']
['hurts']
['hurts']


In [29]:
# 트위터 데이터가 .CSV 인 경우
# Tweets are read one by one and then processed.
inpTweets = csv.reader(open('./data/sampleTweets.csv', 'r'), delimiter=',', quotechar='|')
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet) #, stopWords)
    tweets.append((featureVector, sentiment));
tweets

[([], 'positive'),
 ([], 'positive'),
 ([], 'positive'),
 (['twitter'], 'neutral'),
 ([], 'neutral'),
 (['makeitcount'], 'neutral'),
 (['sigh'], 'negative'),
 (['hurts'], 'negative'),
 (['hurts'], 'negative')]

In [30]:
# 특징 추출하는 메서드
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

In [31]:
# 나이브베이즈 분류기로 감정분석
NaiveBClassifier = nltk.NaiveBayesClassifier.train(train_sets)
#processedTestTweet = processTweet(test_sets)
# Testing the classifiertestTweet = 'I liked this book on Sentiment Analysis a lot.'
# processedTestTweet = processTweet(test_sets)
# NaiveBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))

In [32]:
testTweet = 'I am so badly hurt'
processedTestTweet = processTweet(testTweet)
processedTestTweet
# NaiveBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))

'i am so badly hurt'

In [33]:
featureVector = getFeatureVector(processedTweet)
print(featureVector)
# line = fp.readline()

['hurts']


### 03  nltk.sentiment.sentiment_analyzer 예제
http://www.nltk.org/howto/probability.html

In [34]:
from nltk.corpus import subjectivity
subjectivity.raw()[:500]

"the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \nemerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . \nspurning her mother's insistence that she get on with her life , mary is thrown out of the house , rejected by joe , and e"

In [35]:
from nltk.classify import NaiveBayesClassifier
n_instances = 100

# Each document is represented by a tuple (sentence, label)
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
print('subj_docs :',len(subj_docs), subj_docs[0])
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
print('obj_docs  :',len(obj_docs), obj_docs[0])

subj_docs : 100 (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')
obj_docs  : 100 (['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.'], 'obj')


In [36]:
# 학습, test 데이터를 생성한다
train_subj_docs = subj_docs[:80]
test_subj_docs  = subj_docs[80:100]
train_obj_docs  =  obj_docs[:80]
test_obj_docs   =  obj_docs[80:100]

training_docs = train_subj_docs + train_obj_docs
testing_docs  = test_subj_docs + test_obj_docs

In [37]:
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation, extract_unigram_feats

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
len(all_words_neg), all_words_neg[:7]

(3799, ['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about'])

In [38]:
# We use simple unigram word features, handling negation:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigram_feats)
len(unigram_feats), unigram_feats[:7]

(83, ['.', 'the', ',', 'a', 'and', 'of', 'to'])

In [39]:
# We apply features to obtain a feature-value representation of our datasets:
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

# We can now train our classifier on the training set, and subsequently output the evaluation results:
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
# `Training classifier
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8


### 04  nltk.sentiment.sentiment_analyzer  모듈 살펴보기
http://www.nltk.org/howto/probability.html

In [40]:
#from __future__ import print_function
from collections import defaultdict
from nltk.classify.util import apply_features, accuracy as eval_accuracy
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import (BigramAssocMeasures, precision as eval_precision, 
                          recall as eval_recall, f_measure as eval_f_measure)
from nltk.probability import FreqDist
from nltk.sentiment.util import save_file, timer

# 기계학습에 기반한 감정분석도구
class SentimentAnalyzer(object):
    def __init__(self, classifier=None):
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier

In [41]:
    # 텍스트에서 모든 (중복)단어를 반환
    def all_words(self, documents, labeled=None):
        all_words = []
        if labeled is None:
            labeled = documents and isinstance(documents[0], tuple)
        if labeled == True:
            for words, sentiment in documents:
                all_words.extend(words)
        elif labeled == False:
            for words in documents:
                all_words.extend(words)
        return all_words

In [42]:
    # 특징 추출 함수 feature extraction function
    def apply_features(self, documents, labeled=None):
        return apply_features(self.extract_features, documents,labeled)

In [43]:
    # 단어의 특징을 반환하는 코드
    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        unigram_feats_freqs = FreqDist(word for word in words)
        return [w    for   w, f   in   unigram_feats_freqs.most_common(top_n) 
                     if   unigram_feats_freqs[w]  >  min_freq ]

In [44]:
    # bi-gram의 특징을 반환하는 코드
    def bigram_collocation_feats(self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi):
        finder = BigramCollocationFinder.from_documents(documents)
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)

In [45]:
    # 사용 가능한 특징세트를 사용하여, 주어진 인스턴스를 분류
    def classify(self, instance):
        instance_feats = self.apply_features([instance],labeled=False)
        return self.classifier.classify(instance_feats[0])

In [46]:
    # 텍스트에서 특징 추출을 위해 사용
    def add_feat_extractor(self, function, **kwargs):
        self.feat_extractors[function].append(kwargs)

    def extract_features(self, document):
        all_features = {}
        for extractor in self.feat_extractors:
            for param_set in self.feat_extractors[extractor]:
                feats = extractor(document, **param_set)
            all_features.update(feats)
        return all_features

In [47]:
    # 훈련데이터를 훈련시키는 함수
    def train(self, trainer, training_set, save_classifier = None, **kwargs):
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            save_file(self.classifier, save_classifier)
        return self.classifier

In [48]:
    # 테스트데이터를 사용한 분류기의 테스트 및 성능평가
    def evaluate(self, test_set, classifier = None, accuracy = True, 
                 f_measure = True, precision = True, recall = True, verbose = False):

        if classifier is None:  # 분류기에 아무것도 지정하지 않은 경우
            classifier = self.classifier  # __init__의 초깃값을 불러와서 작업을 시작
        print("Evaluating {0} results...".format(type(classifier).__name__))
        metrics_results = {}              # 출력 report dictionary를 생성

        if accuracy == True:   # 정확도 측정옵션에 True를 입력시
            accuracy_score = eval_accuracy(classifier, test_set) # 분류기 기준, 데이터를 test한다
            metrics_results['Accuracy'] = accuracy_score         # report dictionary에 기록 

        # 출처 : https://dongyeopblog.wordpress.com/2016/04/08/python-defaultdict-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0/
        gold_results = defaultdict(set)   # key값 지정없어도, default로 key를 자동으로 지정한다
        test_results = defaultdict(set)   
        labels = set()
        for i, (feats, label) in enumerate(test_set): # test 상세내용을 기록한다
            labels.add(label)
            gold_results[label].add(i)
            observed = classifier.classify(feats)
            test_results[observed].add(i)

        for label in labels:  # test 결과 수집된 labels 에 따라, 평가함수로 계산을 한다
            if precision == True:  # 정확도 측정
                precision_score = eval_precision(gold_results[label], test_results[label])
                metrics_results['Precision [{0}]'.format(label)] = precision_score
            if recall == True:     # recall 측정
                recall_score = eval_recall(gold_results[label], test_results[label])
                metrics_results['Recall [{0}]'.format(label)] = recall_score
            if f_measure == True:  # f-measure 측정
                f_measure_score = eval_f_measure(gold_results[label], test_results[label])
                metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
            if verbose == True:    # Data를 정렬
                for result in sorted(metrics_results):
                    print('{0}: {1}'.format(result, metrics_results[result]))
        return metrics_results

## 7 NER 시스템의 평가
Evaluation of the NER system