# Supervised Learning 기반 문헌 분류 (G&I)

## 1. Importing Libraries

In [143]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

## 2. Importing the Dataset
GNI Corpus 1.0에서 vol.10부터 vol.17까지 301개의 논문지를 이용한다.

### 2-1. sentence_tokenized 데이터를 PlaintextCorpus화

corpus_root = "C:/Users/s_eoy/AppData/Roaming/nltk_data/corpora/GNI Corpus 1.0/sentence_tokenized"
gniCorpus = nltk.corpus.PlaintextCorpusReader(corpus_root, ".*\.txt", encoding = "utf_8")

In [144]:
corpus_root = "C:/Users/USER/s_eoy/AppData/Genomics-Informatics-Corpus-master/GNI Corpus 1.0/sentence_tokenized"
gniCorpus = nltk.corpus.PlaintextCorpusReader(corpus_root, ".*\.txt", encoding = "utf_8")

### 2-2. 논문지 정보를 list로 저장

In [160]:
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [190]:
# 논문지의 제목을 titles에 저장하기
titles = [title[:] for title in gniCorpus.fileids()]
# 논문지의 volume number를 vols에 저장하기
vols = [vol[4:6] for vol in titles]
# 논문지의 발행 연도를 years에 저장하기
# 각 논문지의 volume number + 2 값이 발행 연도
years = [int(year) + 2 for year in vols ]

In [191]:
# K-means 알고리즘 적용하기

In [195]:
thesis = {'Title': titles, 'Vol.':vols, 'Year':years}
frame = pd.DataFrame(thesis, index=[years], columns=['Title','Vol.','Year'])
frame.head(167)

Unnamed: 0,Title,Vol.,Year
12,gni-10-1-1.txt,10,12
12,gni-10-1-16.txt,10,12
12,gni-10-1-23.txt,10,12
12,gni-10-1-33.txt,10,12
12,gni-10-1-40.txt,10,12
...,...,...,...
16,gni-14-1-1.txt,14,16
16,gni-14-1-12.txt,14,16
16,gni-14-1-2.txt,14,16
16,gni-14-1-20.txt,14,16


In [196]:
# 각 연도별로 분류된 논문지 개수
frame['Year'].value_counts()

13    49
14    44
12    41
18    40
16    37
19    34
15    28
17    28
Name: Year, dtype: int64

In [197]:
# 각 연도별로 중요한 단어(term)들과 속해있는 논문

In [198]:
def tokenize_and_lemmatize(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('^[a-zA-Z]+$',token):
            filtered_tokens.append(token)
        if re.search('^[a-zA-Z]+[0-9]+$',token):
            filtered_tokens.append(token)
        if re.search('^[0-9]+[a-zA-Z]+$',token):
            filtered_tokens.append(token)
    lemmas = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    return lemmas

# 정규표현식을 사용해 필요한 단어만 추출, Lemmatize를 적용하기 전의 단어
def tokenize_only(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens=[]
    for token in tokens:
        if re.search('^[a-zA-Z]+$',token):
            filtered_tokens.append(token)
        if re.search('^[a-zA-Z]+[0-9]+$',token):
            filtered_tokens.append(token)
        if re.search('^[0-9]+[a-zA-Z]+$',token):
            filtered_tokens.append(token)
    return filtered_tokens

In [199]:
# raw에 함수 적용해 단어 추출하기
totalvocab_lemmatized = []
totalvocab_tokenized = []
for i in raw:
    allwords_lemmatized = tokenize_and_lemmatize(i)
    totalvocab_lemmatized.extend(allwords_lemmatized)
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
print(totalvocab_lemmatized[:20])
print(totalvocab_tokenized[:20])

['title', 'survey', 'of', 'the', 'application', 'of', 'ng', 'to', 'sequencing', 'and', 'expression', 'profiling', 'recently', 'the', 'technology', 'of', 'dna', 'sequence', 'variation', 'and']
['Title', 'Survey', 'of', 'the', 'Applications', 'of', 'NGS', 'to', 'Sequencing', 'and', 'Expression', 'Profiling', 'Recently', 'the', 'technologies', 'of', 'DNA', 'sequence', 'variation', 'and']


In [200]:
# Lemmatize가 적용된 단어의 원형을 찾을 수 있게 하는 데이터 생성

vocab_frame = pd.DataFrame({'words':totalvocab_tokenized}, index=totalvocab_lemmatized)

# 특징(단어사전) - 알파벳 순 정렬

terms = tfidf_vectorizer.get_feature_names()

In [201]:
from __future__ import print_function

print("Top terms per Year:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(8):
    print("Year %d words:" % i, end='')
    for ind in order_centroids[1,:7]:
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print()
    print()
    print("Year %d titles:" % i, end='')
    for title in frame.loc[i]['Title'].values.tolist():
        print(' %s.' % title, end='')
    print()
    print()

Top terms per Year:

Year 0 words: cell, Expression, cancers, protein, gene, level, RNAs,

Year 0 titles:

KeyError: 0

### 2-3. 모든 논문지의 내용을 list로 저장

In [149]:
raw = [ gniCorpus.raw(title + ".txt") for title in titles ]
raw[:5]

['Title: Survey of the Applications of NGS to Whole-Genome Sequencing and Expression Profiling\n\nRecently, the technologies of DNA sequence variation and gene expression profiling have been used widely as approaches in the expertise of genome biology and genetics.\n\nThe application to genome study has been particularly developed with the introduction of the next-generation DNA sequencer (NGS) Roche/454 and Illumina/Solexa systems, along with bioinformation analysis technologies of whole-genome de novo assembly, expression profiling, DNA variation discovery, and genotyping.\n\nBoth massive whole-genome shotgun paired-end sequencing and mate paired-end sequencing data are important steps for constructing de novo assembly of novel genome sequencing data.\n\nIt is necessary to have DNA sequence information from a multiplatform NGS with at least 2× and 30× depth sequence of genome coverage using Roche/454 and Illumina/Solexa, respectively, for effective an way of de novo assembly.\n\nMass

## 3. Text Preprocessing
논문지를 연도별로 분류하여 각 연도마다 가장 자주 나온 주요 단어들을 출력하고자 하므로, 논문지의 내용에서 유의미한 단어만을 추출하는 작업이 필요하다.

In [57]:
pattern = '([a-z])[0-9]+([a-z])'
space = ' '

In [61]:
# 문장 단위 분절

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

for i in range( len(raw) ) :
    
    if raw[i].strip() != "" :
        raw[i] = re.sub( r'([a-z])\.([A-Z])', r'\1. \2', raw[i].strip() )
        sentences = sent_tokenize( raw[i].strip() )

        for s in sentences :
          if s != "" :
              result = sys.stdout.write(s + "\n")

    result

Title: Survey of the Applications of NGS to Whole-Genome Sequencing and Expression Profiling

Recently, the technologies of DNA sequence variation and gene expression profiling have been used widely as approaches in the expertise of genome biology and genetics.
The application to genome study has been particularly developed with the introduction of the next-generation DNA sequencer (NGS) Roche/454 and Illumina/Solexa systems, along with bioinformation analysis technologies of whole-genome de novo assembly, expression profiling, DNA variation discovery, and genotyping.
Both massive whole-genome shotgun paired-end sequencing and mate paired-end sequencing data are important steps for constructing de novo assembly of novel genome sequencing data.
It is necessary to have DNA sequence information from a multiplatform NGS with at least 2× and 30× depth sequence of genome coverage using Roche/454 and Illumina/Solexa, respectively, for effective an way of de novo assembly.
Massive short-length

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



형태소를 분석하여 특정 품사의 단어만을 추출한다.  
형태소 분석은 다음의 세 단계로 이루어진다.
- 어간 추출
- 원형 복원
- 품사 부착

### 3-1. 어간 추출

In [62]:
from nltk.stem import PorterStemmer, LancasterStemmer

In [63]:
st1 = PorterStemmer()
st2 =  LancasterStemmer()

In [65]:
words = []
for i in range( len(raw) ) :
    words += word_tokenize(raw[i].strip())
    
print("Porter Stemmer :", [st1.stem(w) for w in words])
print("Lancaster Stemmer :", [st2.stem(w) for w in words])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### 3-2. 원형 복원
같은 의미를 가지는 여러 단어를 사전형으로 통일하는 작업
품사(part of speech)를 지정하는 경우 좀 더 정확한 원형을 찾기 가능

In [66]:
from nltk.stem import WordNetLemmatizer

WNL = WordNetLemmatizer()
word = [WNL.lemmatize(w, pos='v') for w in words]
word

['Title',
 ':',
 'Survey',
 'of',
 'the',
 'Applications',
 'of',
 'NGS',
 'to',
 'Whole-Genome',
 'Sequencing',
 'and',
 'Expression',
 'Profiling',
 'Recently',
 ',',
 'the',
 'technologies',
 'of',
 'DNA',
 'sequence',
 'variation',
 'and',
 'gene',
 'expression',
 'profile',
 'have',
 'be',
 'use',
 'widely',
 'as',
 'approach',
 'in',
 'the',
 'expertise',
 'of',
 'genome',
 'biology',
 'and',
 'genetics',
 '.',
 'The',
 'application',
 'to',
 'genome',
 'study',
 'have',
 'be',
 'particularly',
 'develop',
 'with',
 'the',
 'introduction',
 'of',
 'the',
 'next-generation',
 'DNA',
 'sequencer',
 '(',
 'NGS',
 ')',
 'Roche/454',
 'and',
 'Illumina/Solexa',
 'systems',
 ',',
 'along',
 'with',
 'bioinformation',
 'analysis',
 'technologies',
 'of',
 'whole-genome',
 'de',
 'novo',
 'assembly',
 ',',
 'expression',
 'profile',
 ',',
 'DNA',
 'variation',
 'discovery',
 ',',
 'and',
 'genotyping',
 '.',
 'Both',
 'massive',
 'whole-genome',
 'shotgun',
 'paired-end',
 'sequence',
 '

### 3-3. 품사 부착
낱말을 문법적인 기능이나 형태, 뜻에 따라 구분한 것
Penn Treebank Tagset in NLTK
- NNP: 단수 고유명사
- VB: 동사
- VBP: 동사 현재형
- TO: to 전치사
- NN: 명사(단수형 혹은 집합형)
- DT: 관형사

In [68]:
# pos_tag : 단어 토큰에 품사 부착하여 튜플로 출력.
from nltk.tag import pos_tag
tagged_li = pos_tag(words)
tagged_li[:10]

[('Title', 'NN'),
 (':', ':'),
 ('Survey', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Applications', 'NNP'),
 ('of', 'IN'),
 ('NGS', 'NNP'),
 ('to', 'TO'),
 ('Whole-Genome', 'NNP')]

In [69]:
# 품사 태그 정보 - 동사인 토큰만 선택.
verbs_li = [t[0] for t in tagged_li if t[1] == "VB"]
verbs_li[:10]

['genome',
 'mate',
 'have',
 'discover',
 'approach',
 'find',
 'create',
 'check',
 'understand',
 'reduce']

In [70]:
# 같은 토큰이라도 품사가 다르면 다른 토큰으로 처리
# 원래의 토큰과 품사를 붙여서 새로운 토큰 이름을 만들어 사용
# 철자는 같고 품사가 다른 단어 구분

In [71]:
def tokenizer(doc):
    return ["/".join(p) for p in tagged_li]

tokenizer(sentences)

['Title/NN',
 ':/:',
 'Survey/NN',
 'of/IN',
 'the/DT',
 'Applications/NNP',
 'of/IN',
 'NGS/NNP',
 'to/TO',
 'Whole-Genome/NNP',
 'Sequencing/NNP',
 'and/CC',
 'Expression/NNP',
 'Profiling/NNP',
 'Recently/NNP',
 ',/,',
 'the/DT',
 'technologies/NNS',
 'of/IN',
 'DNA/NNP',
 'sequence/NN',
 'variation/NN',
 'and/CC',
 'gene/NN',
 'expression/NN',
 'profiling/VBG',
 'have/VBP',
 'been/VBN',
 'used/VBN',
 'widely/RB',
 'as/IN',
 'approaches/NNS',
 'in/IN',
 'the/DT',
 'expertise/NN',
 'of/IN',
 'genome/JJ',
 'biology/NN',
 'and/CC',
 'genetics/NNS',
 './.',
 'The/DT',
 'application/NN',
 'to/TO',
 'genome/VB',
 'study/NN',
 'has/VBZ',
 'been/VBN',
 'particularly/RB',
 'developed/VBN',
 'with/IN',
 'the/DT',
 'introduction/NN',
 'of/IN',
 'the/DT',
 'next-generation/JJ',
 'DNA/NNP',
 'sequencer/NN',
 '(/(',
 'NGS/NNP',
 ')/)',
 'Roche/454/NNP',
 'and/CC',
 'Illumina/Solexa/NNP',
 'systems/NNS',
 ',/,',
 'along/IN',
 'with/IN',
 'bioinformation/NN',
 'analysis/NN',
 'technologies/NNS',
 '

In [78]:
# [], {}, (), <> 괄호와 괄호 안 문자 제거하기
pattern = r'\([^)]*\)'
text = ''
for i in range( len(raw) ) :
    text += re.sub(pattern=pattern, repl='', string= raw[i])
text.split()

['Title:',
 'Survey',
 'of',
 'the',
 'Applications',
 'of',
 'NGS',
 'to',
 'Whole-Genome',
 'Sequencing',
 'and',
 'Expression',
 'Profiling',
 'Recently,',
 'the',
 'technologies',
 'of',
 'DNA',
 'sequence',
 'variation',
 'and',
 'gene',
 'expression',
 'profiling',
 'have',
 'been',
 'used',
 'widely',
 'as',
 'approaches',
 'in',
 'the',
 'expertise',
 'of',
 'genome',
 'biology',
 'and',
 'genetics.',
 'The',
 'application',
 'to',
 'genome',
 'study',
 'has',
 'been',
 'particularly',
 'developed',
 'with',
 'the',
 'introduction',
 'of',
 'the',
 'next-generation',
 'DNA',
 'sequencer',
 'Roche/454',
 'and',
 'Illumina/Solexa',
 'systems,',
 'along',
 'with',
 'bioinformation',
 'analysis',
 'technologies',
 'of',
 'whole-genome',
 'de',
 'novo',
 'assembly,',
 'expression',
 'profiling,',
 'DNA',
 'variation',
 'discovery,',
 'and',
 'genotyping.',
 'Both',
 'massive',
 'whole-genome',
 'shotgun',
 'paired-end',
 'sequencing',
 'and',
 'mate',
 'paired-end',
 'sequencing',
 

In [83]:
# 특수문자, 숫자 제거
new_raw = ''
for i in range( len(raw) ) :
    new_raw = re.sub(r"[^a-zA-Z\s]", "", raw[i])
new_raw = new_raw.split()
new_raw

['Title',
 'Directtoconsumer',
 'genetic',
 'testing',
 'Directtoconsumer',
 'DTC',
 'genetic',
 'testing',
 'is',
 'a',
 'controversial',
 'issue',
 'although',
 'Korean',
 'Government',
 'is',
 'considering',
 'to',
 'expand',
 'DTC',
 'genetic',
 'testing',
 'Preventing',
 'the',
 'exaggeration',
 'and',
 'abusing',
 'of',
 'DTC',
 'genetic',
 'testing',
 'is',
 'an',
 'important',
 'task',
 'considering',
 'the',
 'early',
 'history',
 'of',
 'DTC',
 'genetic',
 'testing',
 'in',
 'Korea',
 'And',
 'the',
 'DTC',
 'genetic',
 'testing',
 'performance',
 'or',
 'method',
 'has',
 'been',
 'rarely',
 'reported',
 'to',
 'the',
 'scientific',
 'andor',
 'medical',
 'community',
 'and',
 'reliability',
 'of',
 'DTC',
 'genetic',
 'testing',
 'needs',
 'to',
 'be',
 'assessed',
 'Law',
 'enforcement',
 'needs',
 'to',
 'improve',
 'these',
 'issues',
 'Also',
 'principle',
 'of',
 'transparency',
 'needs',
 'to',
 'be',
 'applied',
 'Directtoconsumer',
 'DTC',
 'genetic',
 'testing',
 '

In [85]:
# 글자 1개만 있으면 제외하기
li_raw = []
for word in new_raw:
    if len(word) != 1:
        li_raw.append(word)
li_raw

['Title',
 'Directtoconsumer',
 'genetic',
 'testing',
 'Directtoconsumer',
 'DTC',
 'genetic',
 'testing',
 'is',
 'controversial',
 'issue',
 'although',
 'Korean',
 'Government',
 'is',
 'considering',
 'to',
 'expand',
 'DTC',
 'genetic',
 'testing',
 'Preventing',
 'the',
 'exaggeration',
 'and',
 'abusing',
 'of',
 'DTC',
 'genetic',
 'testing',
 'is',
 'an',
 'important',
 'task',
 'considering',
 'the',
 'early',
 'history',
 'of',
 'DTC',
 'genetic',
 'testing',
 'in',
 'Korea',
 'And',
 'the',
 'DTC',
 'genetic',
 'testing',
 'performance',
 'or',
 'method',
 'has',
 'been',
 'rarely',
 'reported',
 'to',
 'the',
 'scientific',
 'andor',
 'medical',
 'community',
 'and',
 'reliability',
 'of',
 'DTC',
 'genetic',
 'testing',
 'needs',
 'to',
 'be',
 'assessed',
 'Law',
 'enforcement',
 'needs',
 'to',
 'improve',
 'these',
 'issues',
 'Also',
 'principle',
 'of',
 'transparency',
 'needs',
 'to',
 'be',
 'applied',
 'Directtoconsumer',
 'DTC',
 'genetic',
 'testing',
 'is',
 