In [None]:
import os
import sys
import csv
import re
import string
import collections
from konlpy.tag import Komoran

class Preprocess:
    def __init__(self):
        self.komoran = Komoran()
        with open('datasets/stopwords.txt', 'r', encoding='utf-8') as f:
            self.stopwords = set(f.read().split(','))

    def preprocess(self, text):
        text = text.strip()
        text = re.compile('<.*?>').sub('', text)
        text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
        text = re.sub('\s+', ' ', text)
        text = re.sub(r'[^\w\s]', ' ', str(text).strip())
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def is_korean(self, text):
        korean_pattern = re.compile('[^ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+')
        if korean_pattern.search(text):
            return False
        return True

    def final(self, text):
        n = []
        word = self.komoran.nouns(text)
        p = self.komoran.pos(text)
        for pos in p:
            if pos[1] in ['SL'] and self.is_korean(pos[0]):
                word.append(pos[0])
        for w in word:
            if len(w) > 1 and w not in self.stopwords:
                n.append(w)
        return " ".join(n)

    def finalpreprocess(self, documents):
        for doc in documents:
            doc['article'] = self.final(self.preprocess(doc['article']))
        return documents

def load_documents(file_path):
    documents = []
    with open(file_path, mode='r', encoding='cp949') as file:
        reader = csv.DictReader(file)
        for row in reader:
            title = row['title']
            article = row['article']
            documents.append({'title': title, 'article': article})
    return documents

def filter_documents(documents, exclude_words):
    filtered_documents = []
    for doc in documents:
        article_words = doc['article'].split()
        filtered_words = [word for word in article_words if word not in exclude_words]
        filtered_article = ' '.join(filtered_words)
        if filtered_article.endswith('서울'):
            filtered_article = filtered_article.rsplit(' ', 1)[0]
        doc['article'] = filtered_article
        filtered_documents.append(doc)
    return filtered_documents
    
## tests ########################################################################################

def textrank(document, window_size=2, rsp=0.15):
    '''
    This function accepts a string representation
    of a document and three hyperperameters as input.
    It returns Pandas matrix (that can be treated
    as a dictionary) that maps words in the
    document to their associated TextRank significance
    scores. Note that only words that are classified
    as having relevant POS tags are present in the
    map.
    '''
        
    # Build a weighted graph where nodes are words and
    # edge weights are the number of times words cooccur
    # within a window of predetermined size. In doing so
    # we double count each coocurrence, but that will not
    # alter relative weights which ultimately determine
    # TextRank scores.
    words = document
    edge_weights = collections.defaultdict(lambda: collections.Counter())
    for index, word in enumerate(words):
        for other_index in range(index - window_size, index + window_size + 1):
            if other_index >= 0 and other_index < len(words) and other_index != index:
                other_word = words[other_index]
                edge_weights[word][other_word] += 1.0

    # Apply PageRank to the weighted graph:
    word_probabilities = pagerank.power_iteration(edge_weights, rsp=rsp)
    word_probabilities.sort_values(ascending=False)

    return word_probabilities
    
def apply_text_tank(documents, title="a document"):
    print()
    print("Applying TextRank to \"%s\" ..." % title)
    
    for doc in documents:
        document = doc['article']
        keyword_scores = textrank(document)
        
        print()
        header = "Keyword Significance Scores for \"%s\":" % doc['title']
        print(header)
        print("-" * len(header))
        print(keyword_scores)
        print()
        
def main():
    file_path = 'datasets/Korea_DB_0413.csv'
    documents = load_documents(file_path)
    preprocessor = Preprocess()
    processed_documents = preprocessor.finalpreprocess(documents)
    exclude_words = ['뉴시스', '기자', '뉴스', '동아']
    filtered_documents = filter_documents(processed_documents, exclude_words)
    apply_text_tank(2/datasets/Korea_DB_0413.csv, "4월 10일 투표")
    
if __name__ == "__main__":
    main()

In [None]:
filtered_documents

In [4]:
import pagerank
import csv
import collections

def textrank(document, window_size=2, rsp=0.15):
    '''
    This function accepts a string representation
    of a document and three hyperperameters as input.
    It returns Pandas matrix (that can be treated
    as a dictionary) that maps words in the
    document to their associated TextRank significance
    scores. Note that only words that are classified
    as having relevant POS tags are present in the
    map.
    '''
        
    # Build a weighted graph where nodes are words and
    # edge weights are the number of times words cooccur
    # within a window of predetermined size. In doing so
    # we double count each coocurrence, but that will not
    # alter relative weights which ultimately determine
    # TextRank scores.
    words = document
    edge_weights = collections.defaultdict(lambda: collections.Counter())
    for index, word in enumerate(words):
        for other_index in range(index - window_size, index + window_size + 1):
            if other_index >= 0 and other_index < len(words) and other_index != index:
                other_word = words[other_index]
                edge_weights[word][other_word] += 1.0

    # Apply PageRank to the weighted graph:
    word_probabilities = pagerank.power_iteration(edge_weights, rsp=rsp)
    word_probabilities.sort_values(ascending=False)

    return word_probabilities
    
def apply_text_tank(documents, title="a document"):
    print()
    print("Applying TextRank to \"%s\" ..." % title)
    
    for doc in documents:
        document = doc['article']
        keyword_scores = textrank(document)
        
        print()
        header = "Keyword Significance Scores for \"%s\":" % doc['title']
        print(header)
        print("-" * len(header))
        print(keyword_scores)
        print()

def load_documents(file_path):
    documents = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            title = row['title']
            article = row['article']
            documents.append({'title': title, 'article': article})
    return documents

def main():
    file_path = 'datasets/preprocessed.csv'
    documents = load_documents(file_path)
    apply_text_tank(documents, "4월 10일 투표")
    
if __name__ == "__main__":
    main()


Applying TextRank to "4월 10일 투표" ...

Keyword Significance Scores for "조태열-블링컨 첫 통화…“한미 동맹 더욱 심화·발전”":
----------------------------------------------------------------
취    0.005755
임    0.010792
     0.281809
인    0.008279
사    0.016701
       ...   
응    0.003055
보    0.004990
결    0.003145
행    0.003420
견    0.002889
Length: 124, dtype: float64


Keyword Significance Scores for "조국 “민주, 총선서 포용력 발휘해 반윤연대 꾸려야”":
----------------------------------------------------------------
정    0.026972
책    0.008983
     0.274561
싱    0.003169
크    0.005612
       ...   
여    0.002074
달    0.002061
식    0.002080
범    0.003255
예    0.001972
Length: 172, dtype: float64


Keyword Significance Scores for "‘재산신고 누락’ 이균용 전 대법원장 후보자 경고 처분":
-----------------------------------------------------------------
상    0.007658
장    0.028676
     0.268016
주    0.010837
식    0.011314
       ...   
통    0.004234
일    0.004541
간    0.004393
백    0.004523
기    0.003674
Length: 88, dtype: float64


Keyword Significan

KeyboardInterrupt: 

In [None]:
file_path = 'datasets/preprocessed.csv'
documents = load_documents(file_path)
apply_text_tank(documents, "총선")


Applying TextRank to "총선" ...

Keyword Significance Scores for "조태열-블링컨 첫 통화…“한미 동맹 더욱 심화·발전”":
----------------------------------------------------------------
취    0.005755
임    0.010792
     0.281809
인    0.008279
사    0.016701
       ...   
응    0.003055
보    0.004990
결    0.003145
행    0.003420
견    0.002889
Length: 124, dtype: float64


Keyword Significance Scores for "조국 “민주, 총선서 포용력 발휘해 반윤연대 꾸려야”":
----------------------------------------------------------------
정    0.026972
책    0.008983
     0.274561
싱    0.003169
크    0.005612
       ...   
여    0.002074
달    0.002061
식    0.002080
범    0.003255
예    0.001972
Length: 172, dtype: float64


Keyword Significance Scores for "‘재산신고 누락’ 이균용 전 대법원장 후보자 경고 처분":
-----------------------------------------------------------------
상    0.007658
장    0.028676
     0.268016
주    0.010837
식    0.011314
       ...   
통    0.004234
일    0.004541
간    0.004393
백    0.004523
기    0.003674
Length: 88, dtype: float64


Keyword Significance Scor