# Chapter 3 - Morphology 시작하기
행태학 Getting Our Feet Wet

<br></br>
## 1 행태학의 이해
Morphology: 어간과 접사(접미사, 접두사, 삽입/외접사)

### 01 Morphology
어간과 접사(접미사, 접두사, 삽입/외접사)

In [1]:
# 어간 (자립형태소) : 접사를 추가하지 않아도 존재가능  ex) believe
# 접사 (의존형태소) : 항상 자립형태소와 함깨 존재     ex) anti-, un-, -able, -ly

### 02 Stemming의 이해
스테머 : 단어의 접사제거

In [2]:
# PorterStemmer() : 단어들의 어간 지식을 보유한다

from nltk.stem import PorterStemmer
stemmerporter = PorterStemmer()
print(stemmerporter.stem('working'))
print(stemmerporter.stem('happiness'))

work
happi


In [3]:
# LancasterStemmer() : 랭커스터대학교 개발, 감정단어에 특화

from nltk.stem import LancasterStemmer
stemmerlan = LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))

work
happy


In [4]:
# RegexpStemmer('ing') : 'ing' 어근을 찾아서 삭제 (1개만 가능)

from nltk.stem import RegexpStemmer
stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))

work
happiness
pair


In [5]:
# SnowballStemmer() : 스페인어, 불어등의 외국어 어근처리

from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
spanishstemmer=SnowballStemmer('spanish')
print(spanishstemmer.stem('comiendo'))
frenchstemmer=SnowballStemmer('french')
print(frenchstemmer.stem('manger'))

('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
com
mang


In [6]:
""" It is an interface that helps to eliminate morphological affixes from
the tokens and the process is known as stemming. """
class StemmerI(object):
    # Eliminate affixes from token and stem is returned.
    def stem(self, token):
        raise NotImplementedError()

In [7]:
import nltk
def obtain_tokens():
    with open('./data/adam_smith.txt') as stem: 
        tokens = nltk.word_tokenize(stem.read())
    return tokens

tok = obtain_tokens()
print('tok length :' ,len(tok), '\ntokens is  :', tok[:10])

tok length : 435806 
tokens is  : ['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'An', 'Inquiry', 'into', 'the', 'Nature']


In [8]:
from nltk.stem.porter import PorterStemmer
def stemming(filtered):
    stem=[]
    for x in filtered:
        stem.append(PorterStemmer().stem(x))
    return stem

stem_tokens = stemming(tok)
# result = dict(zip(tok,stem_tokens))  
print("After stemming is :",stem_tokens[:10])

After stemming is : ['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'An', 'inquiri', 'into', 'the', 'natur']


<br></br>
## 2  원형복원 이해
lemmatization : 다른 단어범주의 형태로 단어를 변환

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer_output = WordNetLemmatizer()
print(lemmatizer_output.lemmatize('working'))
print(lemmatizer_output.lemmatize('working',pos='v')) # pos='v' : 음성 카테고리
print(lemmatizer_output.lemmatize('works'))

working
work
work


In [10]:
# to stem
from nltk.stem import PorterStemmer
stemmer_output = PorterStemmer()
print('to Stemmer :' ,stemmer_output.stem('happiness'))

# to lemmatize (원형복원)  
from nltk.stem import WordNetLemmatizer
lemmatizer_output = WordNetLemmatizer()
print('to Word    :' ,lemmatizer_output.lemmatize('happiness'))

to Stemmer : happi
to Word    : happiness


<br></br>
## 3 비영어 언어의 스테머 개발
Morpho에서 지원하는 언어들 확인

In [11]:
# pyicu 설치로 인한 오류가 잔존
# https://stackoverflow.com/questions/40940188/error-installing-pip-pyicu

In [12]:
#from polyglot.downloader import downloader
#print(downloader.supported_languages_table("morph2"))

<br></br>
## 4 형태소 분석기
형태소기반, 어휘기반, 단어기반

http://pythonhosted.org/pyenchant/tutorial.html

In [13]:
# pip install pyenchant
# 알파벳 단어 확인모듈

import enchant
s = enchant.Dict("en_US")
s.check('ness')   # 해당 단어의 적합성 확인

False

In [14]:
texts = "itismyfavouritebook"

def tokenize(st1):
    result, i = [], 0
    for j in range(len(st1),-1,-1):
        if s.check(st1[0:j]):
            st = st1[0:1]
            print(st)
            #result.append(st1[i:j])
            #i = j
    return result

In [15]:
#tokenize("itismyfavouritebook")

<br></br>
## 5 검색엔진
PyStemmer 1.0.1

### 01 불용어 제거 및 토근화 함수
Stemmer 사용자 함수

In [16]:
# 문맥 중심으로 중요단어를 선별
def eliminatestopwords(self,list):
    return [ word for word in list if word not in self.stopwords ]

# 불용어와 token을 구분
def tokenize(self,string):
    Str=self.clean(str)
    Words=str.split("")
    return [self.stemmer.stem(word,0,len(word)-1) for word in words]

### 02 키워드 벡터 차원의 매핑
mapping keywords into vector dimensions

In [17]:
# 문버 벡터에서 주어진 포지션에 대한 키워드를 생성
def obtainvectorkeywordindex(self, documentList):
    # 텍스트를 문자열로 매핑
    vocabstring = "".join(documentList)
    vocablist = self.parser.tokenise(vocabstring)
    # 중요성이 없는 일반단어(common words)를 삭제
    vocablist = self.parser.eliminatestopwords(vocablist)
    uniqueVocablist = util.removeDuplicates(vocablist)
    vectorIndex = {}
    offset = 0
    # 차원/ 매칭을 수행하는 키워드의 index값 추출
    for word in uniqueVocablist:
        vectorIndex[word] = offset
        offset += 1
    return vectorIndex #(keyword:position)

### 03 심플 턴 카운트 모델
Simple term count model

In [18]:
# simple Term Count Model is used
def constructVector(self, wordString):
    # 벡터 초기화
    Vector_val = [0] * len(self.vectorKeywordIndex)
    tokList = self.parser.tokenize(tokString)
    tokList = self.parser.eliminatestopwords(tokList)
    for word in toklist:
        vector[self.vectorKeywordIndex[word]] += 1;
    return vector

### 04 cos() 각을 찾아서 문사유사도 확인
코사인 값이 1 (각도  0, 벡터 평행) : 문서 관련성 O

코사인 값이 0 (각도 90, 벡터 수직) : 문서 관련성 X

In [19]:
# cosine = ( X * Y ) / ||X|| x ||Y||
def cosine(vec1, vec2):
    return float(dot(vec1,vec2) / (norm(vec1) * norm(vec2)))

### 05 키워드와 벡터 공간의 매핑
cos() 각을 찾아서 문사유사도 확인

In [20]:
# 키워드와 벡터 공간의 매핑을 수행
def searching(self,searchinglist):
    askVector = self.buildQueryVector(searchinglist)
    ratings = [util.cosine(askVector, textVector) 
               for textVector in self.documentVectors]
    ratings.sort(reverse = True)
    return ratings

### 06 소스 텍스트의 언어를 탐지하는 모델을 실행
nltk

In [21]:
import nltk, sys
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)

('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [22]:
# 다른언어여부 확인{'german': 2, 'french': 4, 'english': 1}
# nltk.wordpunct_tokenize() : 모든 문장부호를 token으로 생성
def _calculate_languages_ratios(text):
    languages_ratios = {}
    tok = wordpunct_tokenize(text)
    words = [word.lower() for word in tok]
    # 텍스트에서 고유 불용어의 발생을 계산한다
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    return languages_ratios # language "score

In [23]:
# 다른언어의 포함확률을 계산하고, 텍스트내 고유 불용어를 계산한다
def detect_language(text):
    ratios = _calculate_languages_ratios(text)
    most_rated_language = max(ratios, key=ratios.get)
    return most_rated_language

In [24]:
text = '''
All over this cosmos, most of the people believe that there is
an invisible supreme power that is the creator and the runner of
this world. Human being is supposed to be the most intelligent and
loved creation by that power and that is being searched by human
beings in different ways into different things. As a result people
reveal His assumed form as per their own perceptions and beliefs.'''

In [25]:
language = detect_language(text)
print(language)

english


In [26]:
text = '''Artikel 26
1. Jeder hat das Recht auf Bildung. Die Bildung ist unentgeltlich, zum mindesten 
der Grundschulunterricht und die grundlegende Bildung. Der Grundschulunterricht 
ist obligatorisch. Fach- und Berufsschulunterricht müssen allgemein verfügbar 
gemacht werden, und der Hochschulunterricht muß allen gleichermaßen entsprechend 
ihren Fähigkeiten offenstehen.'''

In [27]:
language = detect_language(text)
print(language)

german


In [28]:
text = '''Suzanne et Joseph étaient nés dans les deux premières années de leur 
arrivée à la colonie. Après la naissance de Suzanne, la mère abandonna l’enseignement 
d’état. Elle ne donna plus que des leçons particulières de français. Son mari avait 
été nommé directeur d’une école indigène et, disaient-elle, ils avaient vécu très 
largement malgré la charge de leurs enfants.'''

In [29]:
language = detect_language(text)
print(language)

french


In [30]:
text = '''Лорем ипсум долор сит амет, ан темпорибус сцрибентур сед, дуо ут омиттам 
форенсибус омиттантур. Но вим яуис дицо елаборарет. Ех сеа дицтас тациматес салутанди, 
яуис цоммуне фастидии ет иус, ид вих яуаестио сенсибус патриояуе. Ад инани цонсеяуат вих.'''

In [31]:
language = detect_language(text)
print(language)

russian
