In [7]:
# 공통 LIB
from urllib.request import urlopen

In [8]:
# PDF LIB_1
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

In [18]:
# PDF LIB_2
from PyPDF2 import PdfFileReader, PdfFileWriter

In [1]:
# n-grams (word grouping)
from bs4 import BeautifulSoup as BS
import re
import string
from collections import Counter

In [5]:
# 마르코프 모델(요약문만들기)
from random import randint

In [13]:
# PDF 파일 열기_1 : return -> String
def readPDF(pdfFile):
    """Extract the text of a PDF using pdfminer and return it as one string.

    Args:
        pdfFile: The PDF source handed straight to ``process_pdf``.
            Presumably an open binary file-like object (``readPDF_URL``
            passes a ``urlopen`` response); this function does not close it.

    Returns:
        Everything the ``TextConverter`` wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The with/finally pair releases the buffer and the converter even when
    # parsing fails part-way through.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        # Read the buffer after the converter is closed but before the
        # buffer itself is closed by the enclosing ``with``.
        return retstr.getvalue()

# url형식으로 파일열기
def readPDF_URL(url):
    """Download a PDF from a URL and return its extracted text.

    Args:
        url: Value passed to ``urlopen`` to obtain the PDF stream.

    Returns:
        The text extracted by ``readPDF`` from the downloaded stream.
    """
    # The context manager closes the response even if extraction raises;
    # the extraction itself is shared with readPDF instead of duplicated.
    with urlopen(url) as pdfFile:
        return readPDF(pdfFile)

In [21]:
# Open a PDF file (PyPDF2): return -> text of one page, or of every page
def readPDF_getPage(pdfFile, pageNum=None):
    """Return the text of one page, or of the whole document, of a PDF file.

    This file previously defined the function twice under the same name, so
    the later whole-document version silently replaced the single-page one.
    Both behaviours are now reachable through one signature.

    Args:
        pdfFile: Path of the PDF file; it is opened in binary mode.
        pageNum: Page index handed to ``getPage`` (zero-based in PyPDF2).
            When ``None`` (the default) the text of every page is
            concatenated in page order.

    Returns:
        The extracted text as a single string.
    """
    # The file must stay open while pages are read, and is closed afterwards.
    with open(pdfFile, 'rb') as stream:
        pdfReader = PdfFileReader(stream)
        if pageNum is not None:
            return pdfReader.getPage(pageNum).extract_text()
        return ''.join(
            pdfReader.getPage(i).extract_text()
            for i in range(pdfReader.getNumPages())
        )

# PDF 파일 한 페이지만 따로 저장
def savePDF_onePage(pdfFile, saveName, pageNum):
    """Copy a single page of a PDF into a new PDF file.

    Args:
        pdfFile: Path of the source PDF; opened in binary read mode.
        saveName: Path of the PDF to create; opened in binary write mode.
        pageNum: Page index handed to ``getPage`` (zero-based in PyPDF2).
    """
    # Keep the source open until the write has finished: the writer
    # presumably still reads page data from it at write time (TODO confirm).
    # The ``with`` then closes the handle, which was previously leaked.
    with open(pdfFile, 'rb') as source:
        pdfReader = PdfFileReader(source)
        pdf_writer = PdfFileWriter()
        pdf_writer.add_page(pdfReader.getPage(pageNum))
        with open(saveName, 'wb') as f:
            pdf_writer.write(f)

In [32]:
# n-grams word grouping, version 1: return => list
def getNgrams(content, n):
    """Split text into a list of consecutive n-word windows.

    Newlines and bracketed numeric citation markers such as ``[12]`` are
    replaced with spaces, characters outside ASCII are dropped, and the
    text is split on single spaces with empty tokens discarded.

    Args:
        content: Text to tokenize.
        n: Number of words in each n-gram.

    Returns:
        The n-grams in order of appearance, each a list of ``n`` words.
        The list is empty when the text holds fewer than ``n`` words.
    """
    # The previous pattern '\n|[[\d+\]]' was a character class, so it erased
    # every digit, '+', '[' and ']' instead of only citation markers.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    # Round-trip through bytes to discard anything that is not ASCII.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    words = [word for word in content.split(' ') if word != '']
    return [words[i:i + n] for i in range(len(words) - n + 1)]

# Usage example
"""
html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BS(html,'html.parser')
content = bs.find('div',{'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content,2)
for ngram in ngrams:
    print(ngram)
"""

"\nhtml = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')\nbs = BS(html,'html.parser')\ncontent = bs.find('div',{'id':'mw-content-text'}).get_text()\nngrams = getNgrams(content,2)\nfor ngram in ngrams:\n    print(ngram)\n"

In [4]:
# n-grams word grouping (per-sentence, counted)
def cleanSentence(sentence):
    """Turn one sentence into a list of cleaned words.

    The sentence is split on single spaces and each piece has leading and
    trailing punctuation and whitespace stripped. A piece is kept when it
    is longer than one character, or when it is the single letter
    'a' or 'i' in either case.

    Args:
        sentence: The sentence text.

    Returns:
        The surviving words, in their original order.
    """
    strip_chars = string.punctuation + string.whitespace
    kept = []
    for token in sentence.split(' '):
        token = token.strip(strip_chars)
        if len(token) > 1 or token.lower() in ('a', 'i'):
            kept.append(token)
    return kept
    
def cleanInput(content):
    """Normalize raw text into a list of cleaned sentences.

    The text is upper-cased, newlines and bracketed numeric citation
    markers such as ``[12]`` are replaced with spaces, characters outside
    ASCII are dropped, and the result is split into sentences on '. '.

    Args:
        content: Raw text to clean.

    Returns:
        One list of words per sentence, as produced by ``cleanSentence``.
    """
    content = content.upper()
    # The previous pattern '\n|[[\d+\]]' was a character class, so it erased
    # every digit, '+', '[' and ']' instead of only citation markers.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    # Round-trip through bytes to discard anything that is not ASCII.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    return [cleanSentence(sentence) for sentence in content.split('. ')]

def getNgramsFromSentence(content, n):
    """Return every run of ``n`` consecutive items taken from ``content``.

    Args:
        content: A sliceable sequence, e.g. the word list of one sentence.
        n: Length of each window.

    Returns:
        A list of slices ``content[i:i+n]``, one per start position from 0
        through ``len(content) - n``.
    """
    window_count = len(content) - n + 1
    return [content[start:start + n] for start in range(window_count)]
    
def getNgrams(content, n):
    """Count how often each n-gram occurs in the text.

    The text is cleaned and split into sentences by ``cleanInput``.
    N-grams are built within each sentence by ``getNgramsFromSentence``,
    so no n-gram spans two sentences.

    Args:
        content: Raw text to analyse.
        n: Number of words in each n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (its words joined by a single
        space) to the number of times it occurs.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Counted directly; the previous side list of all n-grams was
        # built and never used.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)
        )
    return ngrams

# 사용 예시
"""
html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BS(html,'html.parser')
content = bs.find('div',{'id':'mw-content-text'}).get_text()
#ngrames = getNgrames(content,2)
ngrams = getNgrams(content,2)
for key,value in ngrams.items():
    print(key,value)
"""

"\nhtml = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')\nbs = BS(html,'html.parser')\ncontent = bs.find('div',{'id':'mw-content-text'}).get_text()\n#ngrames = getNgrames(content,2)\nngrams = getNgrams(content,2)\nfor key,value in ngrams.items():\n    print(key,value)\n"

In [7]:
# 마르코프 모델(요약문만들기)
def wordListSum(wordList):
    """Return the total of all values in a word -> count mapping.

    Args:
        wordList: Mapping whose values are numeric counts.

    Returns:
        The sum of the values; 0 for an empty mapping.
    """
    # Uses the builtin instead of a local accumulator that shadowed ``sum``.
    return sum(wordList.values())

def retrieveRandomWord(wordList):
    """Pick one key at random, weighted by its count.

    A random integer between 1 and the total of all counts is drawn, then
    the counts are subtracted from it in iteration order. The key whose
    count brings the remainder to zero or below is returned, so a key with
    a larger count covers a larger share of the range.

    Args:
        wordList: Mapping of word -> occurrence count.

    Returns:
        One of the keys of ``wordList``.

    Raises:
        ValueError: If the counts total less than 1 (e.g. an empty
            mapping), because there is nothing to choose from.
    """
    # Same total wordListSum computes, inlined so the guard below can use it.
    total = sum(wordList.values())
    if total < 1:
        # randint(1, total) would fail here with an opaque range error.
        raise ValueError('wordList must contain at least one positive count')
    randIndex = randint(1, total)
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word
        
def buildWordDict(text):
    """Build a word-transition table (Markov chain) from text.

    Newlines become spaces, double quotes and carriage returns are
    removed, and the marks ',', '.', ';' and ':' are padded with spaces so
    that each becomes a token of its own. The text is then split on single
    spaces with empty tokens discarded.

    Args:
        text: Source text.

    Returns:
        A dict of dicts: ``result[word][follower]`` is the number of times
        ``follower`` came directly after ``word``.
    """
    for old, new in (('\n', ' '), ('"', ''), ('\r', '')):
        text = text.replace(old, new)

    # Surround each mark with spaces so it splits off as a separate token.
    for mark in (',', '.', ';', ':'):
        text = text.replace(mark, ' {} '.format(mark))

    tokens = [token for token in text.split(' ') if token != '']

    transitions = {}
    # Walk over adjacent token pairs: (previous word, word that follows it).
    for previous, following in zip(tokens, tokens[1:]):
        followers = transitions.setdefault(previous, {})
        followers[following] = followers.get(following, 0) + 1
    return transitions

"""
사용방법
speech = 'http://pythonscraping.com/files/inaugurationSpeech.txt'
text = str(urlopen(speech).read(),'utf-8')
wordDict = buildWordDict(text)

#for key, value in wordDict.items():
#    print(key , value)

length = 100
chain = ['I']
for i in range(0, length):
    newWord = retrieveRandomWord(wordDict[chain[-1]])
    chain.append(newWord)

print(' '.join(chain))
"""

"\n사용방법\nspeech = 'http://pythonscraping.com/files/inaugurationSpeech.txt'\ntext = str(urlopen(speech).read(),'utf-8')\nwordDict = buildWordDict(text)\n\n#for key, value in wordDict.items():\n#    print(key , value)\n\nlength = 100\nchain = ['I']\nfor i in range(0, length):\n    newWord = retrieveRandomWord(wordDict[chain[-1]])\n    chain.append(newWord)\n\nprint(' '.join(chain))\n"