In [None]:
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.preprocessing import normalize
    
# 1-1) Load the stopwords
def get_stopwords(stopwords_path):
    stop_words_1 = stopwords.words('english')   # stopwords 179개
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
        stop_words = [word.strip() for word in stop_words]   # stopwords 854개
    return stop_words
        
# 1-2) Load the script  
def get_script(script_path):
    with open(script_path) as f:
        text = f.read()
    sentences = sent_tokenize(text)
    sentences = [sent.replace('\n', ' ') for sent in sentences] 
    return sentences

# 3-1) Build the sentence graph
def build_sent_graph(sents, tfidf):  # 문장 리스트 -> tf-idf matrix -> sentence graph
    graph_sentence = []
    tfidf_mat = tfidf.fit_transform(sents).toarray()
    graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
    return graph_sentence

# 3-2) Build the word graph
def build_word_graph(sent, cnt_vec):
    cnt_vec_mat = normalize(cnt_vec.fit_transform(sent).toarray().astype(float), axis=0)
    vocab = cnt_vec.vocabulary_
    graph_word = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word] : word for word in vocab}
    return graph_word, idx2word

# 4) Calculate the ranks of each sentence or word
def get_ranks(graph, d=0.85):
    A = graph
    matrix_size = A.shape[0]
    for id in range(matrix_size):
        A[id, id] = 0   # diagonal 부분 -> 0으로 바꿔줌(diagonal matrix)
        link_sum = np.sum(A[:, id])
        if link_sum != 0:
            A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1
        
    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)
    
    return {idx: r[0] for idx, r in enumerate(ranks)}

# 5-1) Get the list of keywords
def get_keywords(sorted_word_idx, idx2word, word_num=5):
    keywords = []
    index = []
    for idx in sorted_word_idx[:word_num]:
        index.append(idx)      
    for idx in index:
        keywords.append(idx2word[idx])
        
    return keywords

# 5-2) Get the list of keysentences
def get_keysents(sorted_sent_idx, sentences, sent_num=2):
    keysents=[]
    index=[]
    for idx in sorted_sent_idx[:sent_num]:
        index.append(idx)
    for idx in index:
        keysents.append(sentences[idx])

    return keysents

# 6) Final: Get the sentence with blank, answer sentence, answer word
def keysents_blank(keywords:list, keysents:list):
    keysent=''   # blank 만들 keysent
    keysent_blank=''   # blank 만든 keysent
    keyword_keysent=''   # keysent의 blank에 들어갈 keyword
    lowest_weight=23   # 가장 작은 weight(초기값: 최대 weight+1)
    
    for sent in keysents:
        sent_weight = keysents.index(sent) + 1 
        
        keyword=''
        for word in keywords:
            if word in sent:
                keyword = word
                break   # keywords 리스트는 앞의 index일수록 순위가 높은 키워드 -> 문장에 존재하면 break    
        if keyword!='':
            word_weight = keywords.index(keyword) + 1
        else:
            word_weight = 23
            
        weight = sent_weight + word_weight
        if weight<lowest_weight:
            lowest_weight = weight
            keysent = sent
            keyword_keysent = keyword
    
    keysent_blank = keysent.replace(keyword_keysent, '__________')
    
    return {'sentence_blank':keysent_blank, 'sentence':keysent, 'answer':keyword_keysent}

def preprocess_sents(sentences, stop_words):
    # 2) Preprocess the sentences
    sents_after=[]   # stop_words 제거, lower()한 list of sentences
    for sent in sentences:
        words = word_tokenize(sent)
        sents_after.append(' '.join([word.lower() for word in words if word.lower() not in stop_words and len(word)>1]))
        sents_after = [s for s in sents_after if s!=''] 
    return sents_after

def run():
    # 1-3) Set the algorithm
    sent_ngram = 1
    word_ngram = 3    
    tfidf = TfidfVectorizer(ngram_range=(1, sent_ngram))
    cnt_vec = CountVectorizer(ngram_range=(1, word_ngram))
    
    stopwords_path = 'stop_words_english.txt'
    script_path = 'sample_script.txt'
    stop_words = get_stopwords(stopwords_path)
    sentences = get_script(script_path)
    sents_after = preprocess_sents(sentences, stop_words)
    
    sent_graph = build_sent_graph(sents_after, tfidf)
    word_graph, idx2word = build_word_graph(sents_after, cnt_vec)
    
    sent_rank_idx = get_ranks(sent_graph)  # 문장 가중치 그래프
    sorted_sent_idx = sorted(sent_rank_idx,   # 문장 가중치 그래프-가중치 작은 차순 정렬
                             key=lambda k: sent_rank_idx[k], reverse=True)
    word_rank_idx = get_ranks(word_graph)  # 단어 가중치 그래프
    sorted_word_idx = sorted(word_rank_idx, 
                             key=lambda k: word_rank_idx[k], reverse=True)

    keywords = get_keywords(sorted_word_idx, idx2word, word_num=10)
    keysents = get_keysents(sorted_sent_idx, sentences, sent_num=10)

    return keysents_blank(keywords, keysents)



if __name__ == '__main__':
    run()

In [None]:
run()

----
-----

In [2]:
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [7]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [8]:
with open('sample_script.txt') as f:
    text = f.read()
print(text)

Welcome back. In this video,
we'll talk about how to compute derivatives for you
to implement gradient descent for logistic regression.
The key takeaways will be what you need to implement.
That is, the key equations you need in order to
implement gradient descent for logistic regression.
In this video, I want to do this computation using the computation graph.
I have to admit, using the computation graph is a little bit of
an overkill for deriving gradient descent for logistic regression,
but I want to start explaining things this
way to get you familiar with these ideas so that,
hopefully, it will make a bit more sense when we talk about full-fledged neural networks.
To that, let's dive into gradient descent for logistic regression.
To recap, we had set up logistic regression as follows,
your predictions, Y_hat, is defined as follows,
where z is that.
If we focus on just one example for now, then the loss,
or respect to that one example,
is defined as follows,
where A is the output o

In [9]:
sentences = sent_tokenize(text)
sentences = [sent.replace('\n', ' ') for sent in sentences]

In [10]:
sentences

['Welcome back.',
 "In this video, we'll talk about how to compute derivatives for you to implement gradient descent for logistic regression.",
 'The key takeaways will be what you need to implement.',
 'That is, the key equations you need in order to implement gradient descent for logistic regression.',
 'In this video, I want to do this computation using the computation graph.',
 'I have to admit, using the computation graph is a little bit of an overkill for deriving gradient descent for logistic regression, but I want to start explaining things this way to get you familiar with these ideas so that, hopefully, it will make a bit more sense when we talk about full-fledged neural networks.',
 "To that, let's dive into gradient descent for logistic regression.",
 'To recap, we had set up logistic regression as follows, your predictions, Y_hat, is defined as follows, where z is that.',
 'If we focus on just one example for now, then the loss, or respect to that one example, is defined a

In [46]:
stop_words_1 = stopwords.words('english')
len(stop_words_1)

179

In [12]:
with open('stop_words_english.txt', 'r', encoding='utf-8') as f:
    stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]
stop_words 

['able',
 'about',
 'above',
 'abroad',
 'according',
 'accordingly',
 'across',
 'actually',
 'adj',
 'after',
 'afterwards',
 'again',
 'against',
 'ago',
 'ahead',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 "a's",
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'back',
 'backward',
 'backwards',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'begin',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'came',
 'can',
 'cannot',
 'cant',
 "can't",
 'caption',
 'cau

In [47]:
len(stop_words)

854

In [14]:
# stop_words 제거, lower()한 문장 리스트
sents_after=[]
for sent in sentences:
    words = word_tokenize(sent)
    sents_after.append(' '.join([word.lower() for word in words if word.lower() not in stop_words and len(word)>1]))
    sents_after = [s for s in sents_after if s!='']
sents_after

['video talk derivatives implement gradient descent logistic regression',
 'key takeaways implement',
 'key equations order implement gradient descent logistic regression',
 'video computation computation graph',
 'admit computation graph bit overkill deriving gradient descent logistic regression start explaining familiar ideas bit sense talk full-fledged neural networks',
 "'s dive gradient descent logistic regression",
 'recap set logistic regression predictions y_hat defined',
 'focus loss respect defined output logistic regression ground truth label',
 "'s write computation graph 's features x1 x2",
 'order input w1 w2 addition feature values x1 x2',
 'computational graph w1 x1 w2 x2 rectangular box',
 "y_hat sigma_of_z 's step computation graph finally ay wo n't copy formula",
 'logistic regression modify parameters order reduce loss',
 "propagation steps loss single training 's talk derivatives",
 "'s cleaned-up version diagram",
 'derivatives respect loss derivative loss respect

TF-IDF: 단어의 중요도 나타냄  
TfidfVectorizer: 문장 단위로 TF-IDF 수치 벡터화한 matrix return  
CountVectorizer: 단어 count를 기준으로 벡터화한 matrix return

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from sklearn.preprocessing import normalize

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1, 1))
cnt_vec = CountVectorizer(ngram_range=(1, 3))
graph_sentence = []

In [17]:
# 문장 리스트 -> tf-idf matrix -> sentence graph
def build_sent_graph(sents):
    tfidf_mat = tfidf.fit_transform(sents).toarray()
    graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
    return graph_sentence

In [18]:
# word graph 생성
def build_word_graph(sent):
    cnt_vec_mat = normalize(cnt_vec.fit_transform(sent).toarray().astype(float), axis=0)
    vocab = cnt_vec.vocabulary_
    graph_word = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word] : word for word in vocab}
    return graph_word, idx2word

In [19]:
sent_graph = build_sent_graph(sents_after)
sent_graph

array([[1.        , 0.18706219, 0.5055462 , ..., 0.65789828, 0.09646828,
        0.14247652],
       [0.18706219, 1.        , 0.41530032, ..., 0.18186413, 0.        ,
        0.        ],
       [0.5055462 , 0.41530032, 1.        , ..., 0.49149814, 0.09101052,
        0.        ],
       ...,
       [0.65789828, 0.18186413, 0.49149814, ..., 1.        , 0.40334796,
        0.12068576],
       [0.09646828, 0.        , 0.09101052, ..., 0.40334796, 1.        ,
        0.28064357],
       [0.14247652, 0.        , 0.        , ..., 0.12068576, 0.28064357,
        1.        ]])

In [20]:
word_graph, idx2word = build_word_graph(sents_after)
print(word_graph.shape)
print(idx2word)

(581, 581)
{524: 'video', 465: 'talk', 113: 'derivatives', 274: 'implement', 253: 'gradient', 124: 'descent', 298: 'logistic', 392: 'regression', 529: 'video talk', 466: 'talk derivatives', 114: 'derivatives implement', 275: 'implement gradient', 254: 'gradient descent', 125: 'descent logistic', 299: 'logistic regression', 530: 'video talk derivatives', 467: 'talk derivatives implement', 115: 'derivatives implement gradient', 276: 'implement gradient descent', 255: 'gradient descent logistic', 126: 'descent logistic regression', 283: 'key', 463: 'takeaways', 286: 'key takeaways', 464: 'takeaways implement', 287: 'key takeaways implement', 182: 'equations', 336: 'order', 284: 'key equations', 183: 'equations order', 337: 'order implement', 285: 'key equations order', 184: 'equations order implement', 338: 'order implement gradient', 62: 'computation', 257: 'graph', 525: 'video computation', 64: 'computation computation', 66: 'computation graph', 526: 'video computation computation', 65:

In [21]:
def get_ranks(graph, d=0.85):
    A = graph
    matrix_size = A.shape[0]
    for id in range(matrix_size):
        A[id, id] = 0   # diagonal 부분 -> 0으로 바꿔줌(diagonal matrix)
        link_sum = np.sum(A[:, id])
        if link_sum != 0:
            A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1
        
    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)
    
    return {idx: r[0] for idx, r in enumerate(ranks)}

In [22]:
sent_rank_idx = get_ranks(sent_graph)  # 문장 가중치 그래프
sent_rank_idx

{0: 1.602526076662568,
 1: 0.45481217716176786,
 2: 1.3921844584315919,
 3: 0.9465557087297984,
 4: 1.0276633567654725,
 5: 1.1863467102567133,
 6: 0.8232136675453178,
 7: 1.0029128590705292,
 8: 0.9152092026275391,
 9: 0.772167339593344,
 10: 0.8699249956073611,
 11: 0.8125927761589898,
 12: 1.0067684950665285,
 13: 0.9665485546415963,
 14: 0.15000000000000002,
 15: 1.53322263380662,
 16: 0.5311469055073599,
 17: 0.7364226931447745,
 18: 1.658016019645117,
 19: 1.0236637326102713,
 20: 0.5095809130856517,
 21: 1.491111657176732,
 22: 1.113467516717841,
 23: 1.3415121419000944,
 24: 0.15000000000000002,
 25: 0.4864607633484701,
 26: 0.8886424016922253,
 27: 0.6377794775880602,
 28: 0.15000000000000002,
 29: 0.15000000000000002,
 30: 0.8050680809337288,
 31: 0.8262730822324702,
 32: 1.2366408838125782,
 33: 0.9446726227991267,
 34: 1.3272242297113557,
 35: 0.8744352440296118,
 36: 0.8435502919822894,
 37: 0.8409078278485468,
 38: 2.0091180810776197,
 39: 0.7983923173879451,
 40: 0.76326

In [23]:
sorted_sent_idx = sorted(sent_rank_idx, key=lambda k: sent_rank_idx[k], reverse=True)
sorted_sent_idx

[38,
 18,
 0,
 15,
 21,
 2,
 23,
 34,
 32,
 5,
 22,
 4,
 19,
 12,
 7,
 13,
 3,
 33,
 8,
 26,
 35,
 10,
 36,
 37,
 31,
 6,
 11,
 30,
 39,
 9,
 40,
 17,
 27,
 16,
 20,
 25,
 1,
 14,
 24,
 28,
 29]

In [24]:
word_rank_idx = get_ranks(word_graph)
sorted_word_idx = sorted(word_rank_idx, key=lambda k: word_rank_idx[k], reverse=True)
sorted_word_idx

[405,
 298,
 299,
 392,
 103,
 306,
 152,
 516,
 257,
 30,
 124,
 253,
 254,
 235,
 542,
 66,
 531,
 204,
 315,
 109,
 93,
 269,
 142,
 255,
 126,
 125,
 465,
 62,
 490,
 292,
 568,
 563,
 81,
 433,
 330,
 453,
 7,
 228,
 305,
 468,
 229,
 259,
 272,
 331,
 332,
 404,
 469,
 121,
 123,
 247,
 249,
 273,
 431,
 432,
 452,
 25,
 122,
 198,
 230,
 403,
 8,
 9,
 21,
 22,
 24,
 197,
 209,
 248,
 347,
 67,
 196,
 258,
 430,
 451,
 23,
 210,
 346,
 348,
 336,
 238,
 113,
 147,
 189,
 190,
 477,
 108,
 107,
 349,
 10,
 574,
 557,
 43,
 98,
 375,
 507,
 474,
 295,
 418,
 558,
 454,
 274,
 380,
 566,
 445,
 205,
 241,
 202,
 163,
 135,
 136,
 156,
 157,
 161,
 472,
 0,
 87,
 91,
 160,
 201,
 471,
 1,
 2,
 88,
 89,
 92,
 141,
 329,
 366,
 470,
 493,
 86,
 90,
 134,
 138,
 139,
 140,
 162,
 473,
 491,
 496,
 137,
 367,
 494,
 495,
 555,
 133,
 492,
 556,
 554,
 203,
 368,
 110,
 167,
 407,
 32,
 168,
 311,
 357,
 31,
 37,
 38,
 239,
 312,
 406,
 448,
 450,
 166,
 191,
 358,
 449,
 359,
 199,
 318,

In [32]:
def keywords(sorted_word_idx, idx2word, word_num=5):
    keywords = []
    index = []
    for idx in sorted_word_idx[:word_num]:
        index.append(idx)      
    for idx in index:
        keywords.append(idx2word[idx])
        
    return keywords

명사만 추출할 수는 없나?

In [33]:
keywords = keywords(sorted_word_idx, idx2word, 10)
keywords

['respect',
 'logistic',
 'logistic regression',
 'regression',
 'derivative',
 'loss',
 'dz',
 'variable',
 'graph',
 'calculus']

In [27]:
def keysents(sorted_sent_idx, sent_num=2):
    keysents=[]
    index=[]
    for idx in sorted_sent_idx[:sent_num]:
        index.append(idx)
    for idx in index:
        keysents.append(sentences[idx])

    return keysents

In [34]:
keysents = keysents(sorted_sent_idx, sent_num=10)
keysents

['And so, this will be one step of grade with respect to a single example.',
 'It turns out that if you are familiar with calculus, you could show that this ends up being -Y_over_A+1-Y_over_1-A.',
 'Welcome back.',
 "Here's a cleaned-up version of the diagram.",
 "We'll provide the derivative formulas, what else you need, throughout this course.",
 'The key takeaways will be what you need to implement.',
 'Now, having computed this quantity of DA and the derivative or your final alpha variable with respect to A, you can then go backwards.',
 'Then, similarly, DW2, which is how much you want to change W2, is X2_times_DZ and B, excuse me, DB is equal to DZ.',
 'Then, the final step in that computation is to go back to compute how much you need to change W and B.',
 'I have to admit, using the computation graph is a little bit of an overkill for deriving gradient descent for logistic regression, but I want to start explaining things this way to get you familiar with these ideas so that, h

keysents는 여러 후보 중에서 keywords가 많은 문장을 선택하는 방향으로?  

1. keyword 
    - return keywords, keyword 순위
2. keysents
    - return keysents, keysents with blank(keyword), keyword in blank
    - keysents 1위부터 돌면서 weight=(각 keysents의 순위 + 갖고 있는 keyword 순위(다수라면 더 낮은 keyword 순위))
    - weight가 같은 keysents라면 자체 keysents 순위가 더 높은 keysents 선택

In [44]:
def keysents_blank(keywords:list, keysents:list):
    keysent=''   # blank 만들 keysent
    keysent_blank=''   # blank 만든 keysent
    keyword_keysent=''   # keysent의 blank에 들어갈 keyword
    lowest_weight=23   # 가장 작은 weight(초기값: 최대 weight+1)
    
    for sent in keysents:
        sent_weight = keysents.index(sent) + 1 
        
        keyword=''
        for word in keywords:
            if word in sent:
                keyword = word
                break   # keywords 리스트는 앞의 index일수록 순위가 높은 키워드 -> 문장에 존재하면 break    
        if keyword!='':
            word_weight = keywords.index(keyword) + 1
        else:
            word_weight = 23
            
        weight = sent_weight + word_weight
        if weight<lowest_weight:
            lowest_weight = weight
            keysent = sent
            keyword_keysent = keyword
    
    keysent_blank = keysent.replace(keyword_keysent, '__________')
    
    return {'sentence_blank':keysent_blank, 'sentence':keysent, 'answer':keyword_keysent}

In [45]:
keysents_blank(keywords, keysents)

{'sentence_blank': 'And so, this will be one step of grade with __________ to a single example.',
 'sentence': 'And so, this will be one step of grade with respect to a single example.',
 'answer': 'respect'}