In [15]:
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.preprocessing import normalize
import os
    
# 1-1) Load the stopwords
def get_stopwords(stopwords_path):
    stop_words_1 = stopwords.words('english')   # stopwords 179개
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
        stop_words = [word.strip() for word in stop_words]   # stopwords 854개
    return stop_words
        
# 1-2) Load the script  
def get_script(script_path):
    with open(script_path) as f:
        text = f.read()
    sentences = sent_tokenize(text)
    sentences = [sent.replace('\n', ' ') for sent in sentences] 
    return sentences

# 3-1) Build the sentence graph
def build_sent_graph(sents, tfidf):  # 문장 리스트 -> tf-idf matrix -> sentence graph
    graph_sentence = []
    tfidf_mat = tfidf.fit_transform(sents).toarray()
    graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
    return graph_sentence

# 3-2) Build the word graph
def build_word_graph(sent, cnt_vec):
    cnt_vec_mat = normalize(cnt_vec.fit_transform(sent).toarray().astype(float), axis=0)
    vocab = cnt_vec.vocabulary_
    graph_word = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word] : word for word in vocab}
    return graph_word, idx2word

# 4) Calculate the ranks of each sentence or word
def get_ranks(graph, d=0.85):
    A = graph
    matrix_size = A.shape[0]
    for id in range(matrix_size):
        A[id, id] = 0   # diagonal 부분 -> 0으로 바꿔줌(diagonal matrix)
        link_sum = np.sum(A[:, id])
        if link_sum != 0:
            A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1
        
    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)
    
    return {idx: r[0] for idx, r in enumerate(ranks)}

# 5-1) Get the list of keywords
def get_keywords(sorted_word_idx, idx2word, word_num=5):
    keywords = []
    index = []
    for idx in sorted_word_idx[:word_num]:
        index.append(idx)      
    for idx in index:
        keywords.append(idx2word[idx])
        
    return keywords

# 5-2) Get the list of keysentences
def get_keysents(sorted_sent_idx, sentences, sent_num=2):
    keysents=[]
    index=[]
    for idx in sorted_sent_idx[:sent_num]:
        index.append(idx)
    for idx in index:
        keysents.append(sentences[idx])

    return keysents

# 6) Final: Get the sentence with blank, answer sentence, answer word
def keysents_blank(keywords:list, keysents:list):
    keysent=''   # blank 만들 keysent
    keysent_blank=''   # blank 만든 keysent
    keyword_keysent=''   # keysent의 blank에 들어갈 keyword
    lowest_weight=23   # 가장 작은 weight(초기값: 최대 weight+1)
    
    for sent in keysents:
        sent_weight = keysents.index(sent) + 1 
        
        keyword=''
        for word in keywords:
            if word in sent:
                keyword = word
                break   # keywords 리스트는 앞의 index일수록 순위가 높은 키워드 -> 문장에 존재하면 break    
        if keyword!='':
            word_weight = keywords.index(keyword) + 1
        else:
            word_weight = 23
            
        weight = sent_weight + word_weight
        if weight<lowest_weight:
            lowest_weight = weight
            keysent = sent
            keyword_keysent = keyword
    
    keysent_blank = keysent.replace(keyword_keysent, '__________')
    
    return {'sentence_blank':keysent_blank, 'sentence':keysent, 'answer':keyword_keysent}

def preprocess_sents(sentences, stop_words):
    # 2) Preprocess the sentences
    sents_after=[]   # stop_words 제거, lower()한 list of sentences
    for sent in sentences:
        words = word_tokenize(sent)
        sents_after.append(' '.join([word.lower() for word in words if word.lower() not in stop_words and len(word)>1]))
        sents_after = [s for s in sents_after if s!=''] 
    return sents_after

def run(script_path='scripts_for_stopwords/075-Clustering algorithms.flac.txt'):
    # 1-3) Set the algorithm
    sent_ngram = 1
    word_ngram = 1
    tfidf = TfidfVectorizer(ngram_range=(1, sent_ngram))
    cnt_vec = CountVectorizer(ngram_range=(1, word_ngram))
    
    stopwords_path = 'stop_words_english.txt'
    script_path = script_path
    
    stop_words = get_stopwords(stopwords_path)
    sentences = get_script(script_path)
    sents_after = preprocess_sents(sentences, stop_words)
    
    sent_graph = build_sent_graph(sents_after, tfidf)
    word_graph, idx2word = build_word_graph(sents_after, cnt_vec)
    
    sent_rank_idx = get_ranks(sent_graph)  # 문장 가중치 그래프
    sorted_sent_idx = sorted(sent_rank_idx,   # 문장 가중치 그래프-가중치 작은 차순 정렬
                             key=lambda k: sent_rank_idx[k], reverse=True)
    word_rank_idx = get_ranks(word_graph)  # 단어 가중치 그래프
    sorted_word_idx = sorted(word_rank_idx, 
                             key=lambda k: word_rank_idx[k], reverse=True)

    keywords = get_keywords(sorted_word_idx, idx2word, word_num=10)
    keysents = get_keysents(sorted_sent_idx, sentences, sent_num=10)
#     print(keywords)
#     print(keysents_blank(keywords, keysents))
    
    return keywords, keysents_blank(keywords, keysents)
    

if __name__ == '__main__':
    run(script_path='scripts_for_stopwords/075-Clustering algorithms.flac.txt')

In [29]:
def make_wordfile(script_path):
    keywords_list = run(script_path)[0]
    with open('stop_words_english.txt', 'r', encoding='utf-8') as f:
        stopwords_list = f.readlines()
        stopwords_list = [word.strip() for word in stopwords_list]
    
    sciwords_list = [word for word in keywords_list if word not in stopwords_list]
    
    file = open('sciwords.txt', 'a', encoding='utf-8')
    for word in keywords_list:
        if word not in stopwords_list:
            file.write(word+'\n')
    file.close()

----
## stopwords 작업하면서 데이터사이언스 관련 단어집 만들기  
키워드 50개 추출 -> stopwords.txt에 포함되는 것 제거 -> sciwords.txt에 저장

In [36]:
print(run('scripts_for_stopwords/[English] But what is a neural network_ _ Chapter 1, Deep learning [DownSub.com].txt'))

(['network', 'layer', 'learning', 'neurons', 'biases', 'function', 'sigmoid', 're', 'networks', 'digits', 'input', 'structure', 'image', 'feed', 'neuron', 'neural', 'videos', 'component', 'visual', 'outputs', 'code', 'machine', 'pixels', 'activations', 'weight', 'pixel', 'series', 'values', 'biological', 'inactive', 'weights', 'active', 'final', 'pattern', 'connections', 'lisha', 'bias', 'edge', 'linear', 'patterns', 'background', 'distinct', 'weighted', 'handwritten', 'sum', 'partners', 'sort', 'question', 'experiment', 'break'], {'sentence_blank': "There's also a couple __________s in between called the hidden __________s  Which for the time being?", 'sentence': "There's also a couple layers in between called the hidden layers  Which for the time being?", 'answer': 'layer'})


stopwords 추가

In [30]:
make_wordfile('scripts_for_stopwords/075-Clustering algorithms.flac.txt')

['clusters', 'process', 'clustering', 'algorithms', 'cluster', 'group', 'image', 'centers', 'characteristics', 'center', 'learning', 'clustering algorithm', 'algorithm', 'based', 'density', 'science', 'outliers', 'distance', 'cluster centers', 'diagram', 'objects', 'noise', 'approach', 'resent clusters', 'clusters process', 'iterations center', 'movements', 'movements blue', 'process iterations', 'blue threshold', 'blue', 'iterations', 'center movements', 'resent', 'threshold', 'shades readings', 'area clusters', 'readings dance', 'cluster shades', 'clusters core', 'core area', 'area', 'core samples', 'dance', 'off', 'readings', 'shades', 'core', 'dance off', 'samples']


## KeyBERT

In [31]:
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import normalize

# 2) Preprocess the sentences
def preprocess_sents(sentences, stop_words):
    sents_after=[]   # stop_words 제거, lower()한 list of sentences
    for sent in sentences:
        words = word_tokenize(sent)
        sents_after.append(' '.join([word.lower() for word in words if word.lower() not in stop_words and len(word)>1]))
        sents_after = [s for s in sents_after if s!=''] 
    return sents_after

# 3-1) Build the sentence graph
def build_sent_graph(sents, tfidf):  # 문장 리스트 -> tf-idf matrix -> sentence graph
    graph_sentence = []
    tfidf_mat = tfidf.fit_transform(sents).toarray()
    graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
    return graph_sentence

# 4) Calculate the ranks of each sentence or word
def get_ranks(graph, d=0.85):
    A = graph
    matrix_size = A.shape[0]
    for id in range(matrix_size):
        A[id, id] = 0   # diagonal 부분 -> 0으로 바꿔줌(diagonal matrix)
        link_sum = np.sum(A[:, id])
        if link_sum != 0:
            A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1
        
    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)
    
    return {idx: r[0] for idx, r in enumerate(ranks)}

# 5-1) Get the list of keywords
def get_keywords(text, kw_model=KeyBERT('all-MiniLM-L12-v2'), word_num=10, stopwords_list=None):
    stopwords_list = stopwords_list
    keywords = kw_model.extract_keywords(text, top_n=word_num, 
                                         keyphrase_ngram_range=(1,1), 
                                         stop_words=stopwords_list)
    return keywords

# 5-2) Get the list of keysentences
def get_keysents(sorted_sent_idx, sentences, sent_num=2):
    keysents=[]
    index=[]
    for idx in sorted_sent_idx[:sent_num]:
        index.append(idx)
    for idx in index:
        keysents.append(sentences[idx])

    return keysents

# 6) Final: Get the sentence with blank, answer sentence, answer word
def keysents_blank(keywords:list, keysents:list):
    keysent=''   # blank 만들 keysent
    keysent_blank=''   # blank 만든 keysent
    keyword_keysent=''   # keysent의 blank에 들어갈 keyword
    lowest_weight=23   # 가장 작은 weight(초기값: 최대 weight+1)
    
    for sent in keysents:
        sent_weight = keysents.index(sent) + 1 
        
        keyword=''
        for word in keywords:
            if word in sent:
                keyword = word
                break   # keywords 리스트는 앞의 index일수록 순위가 높은 키워드 -> 문장에 존재하면 break    
        if keyword!='':
            word_weight = keywords.index(keyword) + 1
        else:
            word_weight = 23
            
        weight = sent_weight + word_weight
        if weight<lowest_weight:
            lowest_weight = weight
            keysent = sent
            keyword_keysent = keyword
    
    keysent_blank = keysent.replace(keyword_keysent, '__________')
    
    return {'keywords':keywords, 'sentence_blank':keysent_blank, 'sentence':keysent, 'answer':keyword_keysent}


def key_question(script_path='scripts_for_stopwords/075-Clustering algorithms.flac.txt'):
    sent_ngram = 2
    stopwords_path = 'stop_words_english.txt'
    script_path = script_path    

    with open(script_path) as f:
        text = f.read()
        sentences = sent_tokenize(text)
        sentences = [sent.replace('\n', ' ') for sent in sentences] 
    
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
        stop_words = [word.strip() for word in stop_words]
        
        
    tfidf = TfidfVectorizer(ngram_range=(1, sent_ngram))
    stop_words = get_stopwords(stopwords_path)
    sentences = get_script(script_path)
    sents_after = preprocess_sents(sentences, stop_words)
    sent_graph = build_sent_graph(sents_after, tfidf)
    sent_rank_idx = get_ranks(sent_graph)  # 문장 가중치 그래프
    sorted_sent_idx = sorted(sent_rank_idx,   # 문장 가중치 그래프-가중치 작은 차순 정렬
                             key=lambda k: sent_rank_idx[k], reverse=True)
    keysents = get_keysents(sorted_sent_idx, sentences, sent_num=10)
    
    
        
    kw_model = KeyBERT('all-MiniLM-L12-v2')
    keywords_weight = get_keywords(text, kw_model, 10, stop_words)
    keywords = [word_tup[0] for word_tup in keywords_weight]
    
    
    return keysents_blank(keywords, keysents)

key_question(script_path='scripts_for_stopwords/sample_script.txt')

In [33]:
key_question(script_path = 'scripts_for_stopwords/sample_script.txt')



{'keywords': ['gradient',
  'derivative',
  'derivatives',
  'logistic',
  'computation',
  'descent',
  'computational',
  'computed',
  'derive',
  'propagation'],
 'sentence_blank': 'I have to admit, using the computation graph is a little bit of an overkill for deriving __________ descent for logistic regression, but I want to start explaining things this way to get you familiar with these ideas so that, hopefully, it will make a bit more sense when we talk about full-fledged neural networks.',
 'sentence': 'I have to admit, using the computation graph is a little bit of an overkill for deriving gradient descent for logistic regression, but I want to start explaining things this way to get you familiar with these ideas so that, hopefully, it will make a bit more sense when we talk about full-fledged neural networks.',
 'answer': 'gradient'}

In [1]:
from keybert import KeyBERT

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
with open('scripts_for_stopwords/075-Clustering algorithms.flac.txt') as f:
    text = f.read()

In [13]:
with open('stop_words_english.txt', 'r', encoding='utf-8') as f:
    stopwords_list = f.readlines()
    stopwords_list = [word.strip() for word in stopwords_list]

In [20]:
kw_model = KeyBERT('all-MiniLM-L12-v2')

Downloading: 100%|████████████████████████████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 1.18MB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 190kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 10.2k/10.2k [00:00<00:00, 10.2MB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 573/573 [00:00<00:00, 575kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 116kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████| 39.3k/39.3k [00:00<00:00, 192kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 349/349 [00:00<00:00, 350kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 134M/134M [00:13<00:00, 9.77MB/s]
Downloading: 100%|██████████████████████

In [21]:
keywords = kw_model.extract_keywords(text, top_n=10)
keywords

[('clustering', 0.55),
 ('clusters', 0.5067),
 ('cluster', 0.5036),
 ('algorithms', 0.4563),
 ('supervised', 0.3841),
 ('similarity', 0.3694),
 ('algorithm', 0.3669),
 ('learning', 0.348),
 ('unsupervised', 0.3421),
 ('similarities', 0.342)]

In [22]:
keywords = kw_model.extract_keywords(text, top_n=10, keyphrase_ngram_range=(1,2), stop_words=stopwords_list)
keywords



[('algorithms clustering', 0.5935),
 ('clustering algorithms', 0.5926),
 ('clustering drooping', 0.5872),
 ('clustering applications', 0.5549),
 ('clustering algorithm', 0.5521),
 ('clustering', 0.55),
 ('learning algorithms', 0.5161),
 ('similarities algorithms', 0.5076),
 ('clusters', 0.5067),
 ('cluster', 0.5036)]

In [17]:
print(run('scripts_for_stopwords/075-Clustering algorithms.flac.txt'))

(['clusters', 'process', 'clustering', 'algorithms', 'cluster', 'group', 'characteristics', 'algorithm', 'science', 'outliers'], {'sentence_blank': "I'm here shows the number of __________ and now they would group to reduce their numbers.", 'sentence': "I'm here shows the number of clusters and now they would group to reduce their numbers.", 'answer': 'clusters'})


----
-----

In [1]:
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [7]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
with open('scripts_for_stopwords/075-Clustering algorithms.flac.txt') as f:
    text = f.read()
print(text)

Welcome to buy ties at a science in the last four videos recovered algorithms that's required. Supervised learning ensembles and logistic regression. Let's take a break from it. And look at some unsupervised, learning algorithms, algorithms for clustering clustering is about drooping. Similar objects. For example, similar individuals may have similar tastes and needs similar products and possibly be interchangeable. Do they need of a customer? What do we mean by similar? For example, for individuals? We can look at age gender income. And so, it is pretty obvious that what appeals to a 20-year old is quite different from what it feels to 60 year old. We can group, people objects events, music movies location, and just about anything. We can think of.  If you think of buying a house, you can look at the number of bathrooms. And number of bedrooms. If it is a condo apartment duplex and so on, you can do it neighborhoods by characteristics and then find multiple houses in similar neighborh

In [6]:
sentences = sent_tokenize(text)
sentences = [sent.replace('\n', ' ') for sent in sentences]

In [7]:
sentences

["Welcome to buy ties at a science in the last four videos recovered algorithms that's required.",
 'Supervised learning ensembles and logistic regression.',
 "Let's take a break from it.",
 'And look at some unsupervised, learning algorithms, algorithms for clustering clustering is about drooping.',
 'Similar objects.',
 'For example, similar individuals may have similar tastes and needs similar products and possibly be interchangeable.',
 'Do they need of a customer?',
 'What do we mean by similar?',
 'For example, for individuals?',
 'We can look at age gender income.',
 'And so, it is pretty obvious that what appeals to a 20-year old is quite different from what it feels to 60 year old.',
 'We can group, people objects events, music movies location, and just about anything.',
 'We can think of.',
 'If you think of buying a house, you can look at the number of bathrooms.',
 'And number of bedrooms.',
 'If it is a condo apartment duplex and so on, you can do it neighborhoods by chara

In [10]:
# stop_words_1 = stopwords.words('english')
# len(stop_words_1)

In [11]:
with open('stop_words_english.txt', 'r', encoding='utf-8') as f:
    stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]
stop_words 

['able',
 'about',
 'above',
 'abroad',
 'according',
 'accordingly',
 'across',
 'actually',
 'adj',
 'after',
 'afterwards',
 'again',
 'against',
 'ago',
 'ahead',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 "a's",
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'back',
 'backward',
 'backwards',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'begin',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'came',
 'can',
 'cannot',
 'cant',
 "can't",
 'caption',
 'cau

In [12]:
len(stop_words)

855

In [13]:
# stop_words 제거, lower()한 문장 리스트
sents_after=[]
for sent in sentences:
    words = word_tokenize(sent)
    sents_after.append(' '.join([word.lower() for word in words if word.lower() not in stop_words and len(word)>1]))
    sents_after = [s for s in sents_after if s!='']
sents_after

["buy ties science videos recovered algorithms 's required",
 'supervised learning ensembles logistic regression',
 "'s break",
 'unsupervised learning algorithms algorithms clustering clustering drooping',
 'objects',
 'individuals tastes products interchangeable',
 'customer',
 'individuals',
 'age gender income',
 'pretty obvious appeals 20-year feels 60 year',
 'group people objects events music movies location',
 'buying house number bathrooms',
 'number bedrooms',
 'condo apartment duplex neighborhoods characteristics multiple houses neighborhoods answer buyer',
 'similarity measure distance objects 12 touring figure express characteristics terms numbers group hair',
 'dye color',
 'obvious',
 'close brown auburn',
 'colors rgb representation numbers similarities',
 'algorithms clustering',
 'algorithms',
 "'s list algorithms killer library",
 'multiple approaches clustering',
 'top-down bottom-up approach',
 'clusters',
 'number data points process jordan performs',
 'cover algo

TF-IDF: 단어의 중요도 나타냄  
TfidfVectorizer: 문장 단위로 TF-IDF 수치 벡터화한 matrix return  
CountVectorizer: 단어 count를 기준으로 벡터화한 matrix return

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.preprocessing import normalize

In [15]:
tfidf = TfidfVectorizer(ngram_range=(1, 1))
cnt_vec = CountVectorizer(ngram_range=(1, 3))
graph_sentence = []

In [16]:
# 문장 리스트 -> tf-idf matrix -> sentence graph
def build_sent_graph(sents):
    tfidf_mat = tfidf.fit_transform(sents).toarray()
    graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
    return graph_sentence

In [17]:
# word graph 생성
def build_word_graph(sent):
    cnt_vec_mat = normalize(cnt_vec.fit_transform(sent).toarray().astype(float), axis=0)
    vocab = cnt_vec.vocabulary_
    graph_word = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word] : word for word in vocab}
    return graph_word, idx2word

In [18]:
sent_graph = build_sent_graph(sents_after)
sent_graph

array([[1.        , 0.        , 0.        , ..., 0.10216038, 0.        ,
        0.3490473 ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.10216038, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.3490473 , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [19]:
word_graph, idx2word = build_word_graph(sents_after)
print(word_graph.shape)
print(idx2word)

(525, 525)
{78: 'buy', 487: 'ties', 435: 'science', 513: 'videos', 413: 'recovered', 19: 'algorithms', 426: 'required', 79: 'buy ties', 488: 'ties science', 438: 'science videos', 514: 'videos recovered', 414: 'recovered algorithms', 26: 'algorithms required', 80: 'buy ties science', 490: 'ties science videos', 439: 'science videos recovered', 515: 'videos recovered algorithms', 415: 'recovered algorithms required', 474: 'supervised', 294: 'learning', 223: 'ensembles', 308: 'logistic', 418: 'regression', 475: 'supervised learning', 297: 'learning ensembles', 224: 'ensembles logistic', 309: 'logistic regression', 476: 'supervised learning ensembles', 298: 'learning ensembles logistic', 225: 'ensembles logistic regression', 73: 'break', 501: 'unsupervised', 121: 'clustering', 201: 'drooping', 502: 'unsupervised learning', 295: 'learning algorithms', 20: 'algorithms algorithms', 22: 'algorithms clustering', 127: 'clustering clustering', 129: 'clustering drooping', 503: 'unsupervised learn

In [20]:
def get_ranks(graph, d=0.85):
    A = graph
    matrix_size = A.shape[0]
    for id in range(matrix_size):
        A[id, id] = 0   # diagonal 부분 -> 0으로 바꿔줌(diagonal matrix)
        link_sum = np.sum(A[:, id])
        if link_sum != 0:
            A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1
        
    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)
    
    return {idx: r[0] for idx, r in enumerate(ranks)}

In [21]:
sent_rank_idx = get_ranks(sent_graph)  # 문장 가중치 그래프
sent_rank_idx

{0: 0.9393160724618903,
 1: 0.22345147584497832,
 2: 0.15000000000000002,
 3: 1.8737618357205856,
 4: 0.6130794269312003,
 5: 0.9999999999999998,
 6: 0.15000000000000002,
 7: 0.9999999999999998,
 8: 0.15000000000000002,
 9: 0.9999999999999998,
 10: 0.6996554015044547,
 11: 0.6459902030898118,
 12: 0.8812406870758399,
 13: 0.2984911013572488,
 14: 0.8847084664283494,
 15: 0.15000000000000002,
 16: 0.9999999999999998,
 17: 0.15000000000000002,
 18: 0.3227035723169527,
 19: 2.0866566757758376,
 20: 1.583984144099648,
 21: 0.7829481002984047,
 22: 0.8819855829194424,
 23: 0.3411087519189534,
 24: 1.657752557379373,
 25: 1.271563724622544,
 26: 1.0728599767780531,
 27: 0.94406293975779,
 28: 1.4242170057875143,
 29: 0.4518320816202148,
 30: 1.5174930071520718,
 31: 1.4395108099139957,
 32: 0.8669801525239387,
 33: 0.3659041998645545,
 34: 1.300939549652392,
 35: 1.305990914010611,
 36: 0.9778164116982234,
 37: 0.5461748576328475,
 38: 1.0168933328119203,
 39: 0.8403099388282287,
 40: 1.6242

In [22]:
sorted_sent_idx = sorted(sent_rank_idx, key=lambda k: sent_rank_idx[k], reverse=True)
sorted_sent_idx

[19,
 41,
 50,
 3,
 46,
 24,
 40,
 20,
 30,
 31,
 28,
 35,
 34,
 25,
 56,
 42,
 26,
 38,
 5,
 7,
 9,
 16,
 49,
 53,
 36,
 27,
 0,
 14,
 22,
 12,
 32,
 47,
 39,
 45,
 48,
 21,
 57,
 51,
 10,
 55,
 11,
 4,
 37,
 29,
 33,
 44,
 59,
 23,
 18,
 13,
 1,
 2,
 6,
 8,
 15,
 17,
 43,
 52,
 54,
 58]

In [23]:
word_rank_idx = get_ranks(word_graph)
sorted_word_idx = sorted(word_rank_idx, key=lambda k: word_rank_idx[k], reverse=True)
sorted_word_idx

[343,
 132,
 397,
 387,
 121,
 346,
 19,
 113,
 248,
 95,
 262,
 91,
 299,
 354,
 210,
 102,
 456,
 14,
 122,
 173,
 294,
 57,
 180,
 190,
 114,
 468,
 369,
 435,
 487,
 488,
 420,
 419,
 184,
 358,
 402,
 187,
 189,
 342,
 50,
 264,
 300,
 341,
 386,
 470,
 51,
 52,
 98,
 265,
 385,
 97,
 188,
 340,
 384,
 469,
 94,
 401,
 301,
 42,
 281,
 321,
 283,
 176,
 352,
 422,
 429,
 322,
 404,
 428,
 486,
 67,
 69,
 282,
 323,
 427,
 68,
 92,
 93,
 142,
 403,
 353,
 143,
 466,
 249,
 241,
 255,
 1,
 193,
 231,
 233,
 356,
 360,
 449,
 497,
 0,
 2,
 105,
 106,
 239,
 315,
 355,
 447,
 448,
 483,
 485,
 496,
 232,
 194,
 240,
 484,
 313,
 495,
 359,
 314,
 464,
 339,
 326,
 80,
 438,
 513,
 79,
 413,
 414,
 490,
 515,
 26,
 415,
 439,
 514,
 78,
 426,
 217,
 218,
 219,
 220,
 221,
 389,
 452,
 99,
 100,
 211,
 302,
 382,
 388,
 450,
 197,
 381,
 508,
 116,
 195,
 506,
 507,
 451,
 196,
 212,
 214,
 290,
 499,
 500,
 15,
 477,
 498,
 64,
 288,
 460,
 478,
 16,
 65,
 213,
 289,
 459,
 66,
 479,
 

In [24]:
def keywords(sorted_word_idx, idx2word, word_num=5):
    keywords = []
    index = []
    for idx in sorted_word_idx[:word_num]:
        index.append(idx)      
    for idx in index:
        keywords.append(idx2word[idx])
        
    return keywords

명사만 추출할 수는 없나?

In [25]:
keywords = keywords(sorted_word_idx, idx2word, 50)
keywords

['number',
 'clusters',
 'process',
 'points',
 'clustering',
 'number clusters',
 'algorithms',
 'cluster',
 'group',
 'centers',
 'image',
 'center',
 'left',
 'numbers',
 'easiest',
 'characteristics',
 'stands',
 'algorithm',
 'clustering algorithm',
 'decided',
 'learning',
 'based',
 'density',
 'distance',
 'cluster centers',
 'step',
 'outliers',
 'science',
 'ties',
 'ties science',
 'repeat process',
 'repeat',
 'diagram',
 'objects',
 'process left center',
 'discuss',
 'discuss centers image',
 'note process left',
 'associate',
 'image note',
 'left center',
 'note process',
 'point discuss centers',
 'step associate point',
 'associate point',
 'associate point discuss',
 'centers image note',
 'image note process',
 'point discuss',
 'centers image']

In [27]:
def keysents(sorted_sent_idx, sent_num=2):
    keysents=[]
    index=[]
    for idx in sorted_sent_idx[:sent_num]:
        index.append(idx)
    for idx in index:
        keysents.append(sentences[idx])

    return keysents

In [34]:
keysents = keysents(sorted_sent_idx, sent_num=10)
keysents

['And so, this will be one step of grade with respect to a single example.',
 'It turns out that if you are familiar with calculus, you could show that this ends up being -Y_over_A+1-Y_over_1-A.',
 'Welcome back.',
 "Here's a cleaned-up version of the diagram.",
 "We'll provide the derivative formulas, what else you need, throughout this course.",
 'The key takeaways will be what you need to implement.',
 'Now, having computed this quantity of DA and the derivative or your final alpha variable with respect to A, you can then go backwards.',
 'Then, similarly, DW2, which is how much you want to change W2, is X2_times_DZ and B, excuse me, DB is equal to DZ.',
 'Then, the final step in that computation is to go back to compute how much you need to change W and B.',
 'I have to admit, using the computation graph is a little bit of an overkill for deriving gradient descent for logistic regression, but I want to start explaining things this way to get you familiar with these ideas so that, h

keysents는 여러 후보 중에서 keywords가 많은 문장을 선택하는 방향으로?  

1. keyword 
    - return keywords, keyword 순위
2. keysents
    - return keysents, keysents with blank(keyword), keyword in blank
    - keysents 1위부터 돌면서 weight=(각 keysents의 순위 + 갖고 있는 keyword 순위(다수라면 더 낮은 keyword 순위))
    - weight가 같은 keysents라면 자체 keysents 순위가 더 높은 keysents 선택

In [44]:
def keysents_blank(keywords:list, keysents:list):
    keysent=''   # blank 만들 keysent
    keysent_blank=''   # blank 만든 keysent
    keyword_keysent=''   # keysent의 blank에 들어갈 keyword
    lowest_weight=23   # 가장 작은 weight(초기값: 최대 weight+1)
    
    for sent in keysents:
        sent_weight = keysents.index(sent) + 1 
        
        keyword=''
        for word in keywords:
            if word in sent:
                keyword = word
                break   # keywords 리스트는 앞의 index일수록 순위가 높은 키워드 -> 문장에 존재하면 break    
        if keyword!='':
            word_weight = keywords.index(keyword) + 1
        else:
            word_weight = 23
            
        weight = sent_weight + word_weight
        if weight<lowest_weight:
            lowest_weight = weight
            keysent = sent
            keyword_keysent = keyword
    
    keysent_blank = keysent.replace(keyword_keysent, '__________')
    
    return {'sentence_blank':keysent_blank, 'sentence':keysent, 'answer':keyword_keysent}

In [45]:
keysents_blank(keywords, keysents)

{'sentence_blank': 'And so, this will be one step of grade with __________ to a single example.',
 'sentence': 'And so, this will be one step of grade with respect to a single example.',
 'answer': 'respect'}