# In [19]:
import numpy as np
import pandas as pd
from pymongo import MongoClient 
import os

from sklearn.cluster import KMeans

#text
from nltk.tokenize import sent_tokenize
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

def conn_db():
    """Create a MongoDB client from environment-provided settings.

    Reads ``DBUSER``, ``DBPWD`` and ``DBADDR`` from the environment, using
    the built-in fallback values when a variable is unset, and connects to
    port 27017 on that address.

    Returns:
        The ``MongoClient`` built from the assembled connection URI.
    """
    # NOTE(review): the fallback values embed what look like real credentials
    # and a public address in source; consider removing them and rotating the
    # password. They are kept so existing setups keep working.
    # NOTE(review): the values are interpolated into the URI verbatim, so
    # reserved characters in DBUSER/DBPWD presumably need to be
    # percent-encoded by whoever sets them - confirm against pymongo's docs.
    ip_addr = os.getenv("DBADDR", '54.180.213.105')
    pwd = os.getenv("DBPWD", 'shanekang')
    user = os.getenv("DBUSER", 'signature')

    uri = f'mongodb://{user}:{pwd}@{ip_addr}:27017'
    return MongoClient(uri)

# Open the connection and select the raw_news collection of the NewsData
# database.
conn = conn_db()
NewsData = conn['NewsData']['raw_news']

# Pull the 'content' field of every stored document into memory.
article_data = [document['content'] for document in NewsData.find({})]
# Alias of article_data (same list object, not a copy).
total_data = article_data

class SentenceTokenizer(object):
    """Split text into sentences and reduce each sentence to its nouns."""

    def __init__(self):
        # NOTE(review): KoNLPy appears to have renamed the Twitter tagger to
        # Okt; Twitter is kept because it is what this file imports.
        self.twitter = Twitter()
        # Nouns that carry no topical information and are dropped.
        self.stopwords = frozenset([
            '뉴스', '연합', '자료사진', '서울연합', '중인', '만큼', '마찬가지', '꼬집었',
            "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자",
            "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희",
            "따라", "의해", "을", "를", "에", "의", "가",
        ])

    def text2sentences(self, text):
        """Split *text* into sentences with ``sent_tokenize``.

        A sentence of at most 10 characters is appended to the sentence
        before it. A short sentence with no predecessor is kept on its own
        rather than being attached to the last sentence of the text.

        Args:
            text: The text to split.

        Returns:
            A list of non-blank sentence strings. Because no empty
            placeholders are left behind, positions in this list match the
            positions in the list returned by ``get_nouns``.
        """
        sentences = []
        for sentence in sent_tokenize(text):
            if not sentence.strip():
                continue
            if len(sentence) <= 10 and sentences:
                sentences[-1] += ' ' + sentence
            else:
                sentences.append(sentence)
        return sentences

    def get_nouns(self, sentences):
        """Return one space-joined noun string per input sentence.

        Nouns in the stop-word set and single-character nouns are dropped.
        An empty sentence yields an empty string (the tagger is not called
        for it), so the result always has ``len(sentences)`` entries.

        Args:
            sentences: Iterable of sentence strings.

        Returns:
            A list of strings, index-aligned with *sentences*.
        """
        nouns = []
        for sentence in sentences:
            if sentence == '':
                nouns.append('')
                continue
            kept = [noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1]
            nouns.append(' '.join(kept))
        return nouns


class GraphMatrix(object):
    """Build sentence-level and word-level similarity graphs."""

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []

    def build_sent_graph(self, sentence):
        """Return the sentence-by-sentence matrix of TF-IDF dot products.

        Args:
            sentence: List of documents (one noun string per sentence).

        Returns:
            A square array with one row and column per document; the same
            array is stored on ``self.graph_sentence``.
        """
        tfidf_mat = self.tfidf.fit_transform(sentence).toarray()
        self.graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
        return self.graph_sentence

    def build_words_graph(self, sentence):
        """Return the word-by-word graph and an index-to-word mapping.

        Args:
            sentence: List of documents (one noun string per sentence).

        Returns:
            A tuple ``(graph, idx2word)`` where *graph* is the product of the
            column-normalized count matrix with itself and *idx2word* maps a
            column index to its vocabulary term.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        cnt_vec_mat = normalize(counts, axis=0)
        vocab = self.cnt_vec.vocabulary_
        idx2word = {index: word for word, index in vocab.items()}
        return np.dot(cnt_vec_mat.T, cnt_vec_mat), idx2word


class Rank(object):
    """Rank the nodes of a weighted graph by solving a linear system."""

    def get_ranks(self, graph, d=0.85):
        """Compute a rank score for every node of *graph*.

        The computation works on a float copy, so the caller's matrix is left
        untouched and integer matrices are accepted.

        Args:
            graph: Square 2-D array of edge weights.
            d: Damping factor.

        Returns:
            A dict mapping node index to its rank; empty for an empty graph.
        """
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]
        if matrix_size == 0:
            return {}

        # Drop self-links, then scale every column with a non-zero sum so it
        # sums to one.
        np.fill_diagonal(A, 0)
        col_sums = A.sum(axis=0)
        nonzero = col_sums != 0
        A[:, nonzero] /= col_sums[nonzero]

        # Assemble the coefficient matrix (I - d * M) of the system A x = B.
        A *= -d
        np.fill_diagonal(A, 1)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}


class TextRank(object):
    """Rank the sentences and words of a text and extract a summary."""

    def __init__(self, text, tokenizer=None):
        """Tokenize *text*, build both graphs and rank their nodes.

        Args:
            text: The text to summarize.
            tokenizer: Optional ``SentenceTokenizer`` to reuse; a new one is
                created when omitted.
        """
        if tokenizer is None:
            tokenizer = SentenceTokenizer()
        self.sent_tokenize = tokenizer

        self.sentences = self.sent_tokenize.text2sentences(text)
        self.nouns = self.sent_tokenize.get_nouns(self.sentences)

        self.graph_matrix = GraphMatrix()
        self.rank = Rank()

        if any(self.nouns):
            self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
            self.words_graph, self.idx2word = (
                self.graph_matrix.build_words_graph(self.nouns))
            self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
            self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        else:
            # No sentence produced any noun, so there is nothing to fit the
            # vectorizers on. Give every sentence the same rank, which keeps
            # the sentences in their original order.
            size = len(self.sentences)
            self.sent_graph = np.zeros((size, size))
            self.words_graph = np.zeros((0, 0))
            self.idx2word = {}
            self.sent_rank_idx = {idx: 0.0 for idx in range(size)}
            self.word_rank_idx = {}

        self.sorted_sent_rank_idx = sorted(
            self.sent_rank_idx,
            key=lambda k: self.sent_rank_idx[k], reverse=True)
        self.sorted_word_rank_idx = sorted(
            self.word_rank_idx,
            key=lambda k: self.word_rank_idx[k], reverse=True)

    def summarize(self, sent_num=3):
        """Return the *sent_num* best-ranked sentences in document order.

        Args:
            sent_num: Maximum number of sentences to return.

        Returns:
            A list of sentence strings (shorter when the text has fewer
            sentences).
        """
        chosen = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in chosen]


def _strip_boilerplate_sentences(article, stop_words):
    """Return *article* without the sentences containing any stop word."""
    kept = [sent for sent in sent_tokenize(article)
            if not any(word in sent for word in stop_words)]
    return ' '.join(kept)


def news_cluster(total_data, n_clusters, n_summaries=3, sent_num=3):
    """Cluster news articles and print a short summary of the first clusters.

    Each article is split into sentences, sentences containing a publisher
    name or boilerplate marker are removed, and the remaining text is
    clustered with KMeans on normalized word counts. The articles of each
    cluster are joined with newlines and summarized with TextRank; every
    summary sentence is printed on its own line.

    Args:
        total_data: Iterable of article texts.
        n_clusters: Number of KMeans clusters.
        n_summaries: Number of clusters to summarize, taken in order of first
            appearance of their label. Fewer are printed when fewer clusters
            exist.
        sent_num: Number of sentences printed per cluster.

    Returns:
        None. The summaries are printed.
    """
    articles = list(total_data)
    if not articles:
        # Nothing to vectorize or cluster.
        return

    stop_words = ['머니S', '머니투데이', '구독', '파이낸셜뉴스', '헬스조선', '중앙일보', '프레시안', '조선일보', '.com',
                  '매일경제', '한국', '엠빅', '뉴시스', '무단전재재', '오마이뉴스', '연합뉴스', '연합뉴스TV', 'JTBC', 'SBS',
                  'MBC', 'KBS', '한겨레', 'YTN', 'MBN', 'TV조선', '채널A', '동아일보', '뉴스1', '강원일보', '경향신문', '세계일보',
                  '한국일보', '서울신문', '노컷뉴스', '이데일리', 'CBS', 'co.kr', '자료사진', '기자', '언론사', '섹션', '무단',
                  '무단전재재', '재배포', '재배포금지', '재판매 및 DB금지', '서울경제']

    # Tokenize each article and drop the sentences that contain a stop word.
    contents = [_strip_boilerplate_sentences(article, stop_words)
                for article in articles]

    # Clustering
    X = normalize(CountVectorizer().fit_transform(contents))
    labels = KMeans(n_clusters=n_clusters).fit(X).labels_

    df = pd.DataFrame({'content': contents, 'labels': labels})
    clustered_sentences = []
    for clust_idx in df['labels'].unique():
        cluster_docs = df.loc[df['labels'] == clust_idx, 'content']
        clustered_sentences.append('\n'.join(cluster_docs))

    # One tokenizer is shared by all clusters so the tagger is built once.
    tokenizer = SentenceTokenizer()
    for cluster_text in clustered_sentences[:n_summaries]:
        textrank = TextRank(cluster_text, tokenizer)
        for row in textrank.summarize(sent_num):
            print(row)


# Cluster the loaded articles into five groups and print the summaries.
news_cluster(total_data=article_data, n_clusters=5)