<a href="https://colab.research.google.com/github/ReynaldiJ/portfolio/blob/main/text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def read_article(file_name):
    """Read a text file and split it into tokenised sentences.

    The whole file is read (not only its first line), line breaks are
    treated as spaces, and the text is split into sentences on ". ". Each
    sentence is echoed to stdout and then split into words on single spaces.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of sentences, each one a list of word strings. An empty or
        whitespace-only file yields an empty list.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as file:
        filedata = file.readlines()

    # Join every non-blank line so multi-line articles are read in full.
    text = " ".join(line.strip() for line in filedata if line.strip())
    # Drop the final full stop so the last sentence looks like the others,
    # which lose theirs to the ". " split below.
    if text.endswith("."):
        text = text[:-1]

    sentences = []
    # split the sentences based on the dot-space (". ") separator
    for sentence in text.split(". "):
        # Skip blank fragments rather than unconditionally discarding the
        # last entry, which would throw away a real sentence.
        if not sentence.strip():
            continue
        print(sentence)
        # Tokens keep their punctuation. Note that
        # str.replace("[^a-zA-Z]", " ") would not strip it: str.replace
        # matches literal text, not a regular expression.
        sentences.append(sentence.split(" "))

    return sentences

In [None]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words found in ``stopwords`` are not counted.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Optional iterable of lower-case words to ignore.

    Returns:
        The cosine of the angle between the two count vectors as a float.
        Returns 0.0 when either sentence has no countable words, instead of
        the NaN that a 0/0 division would produce.
    """
    # Set membership is O(1); the stop-word list is checked for every word.
    ignored = set(stopwords) if stopwords is not None else set()

    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]

    # Map each distinct word to its vector position once, rather than
    # searching a list for every word.
    position = {word: i for i, word in enumerate(set(words1 + words2))}

    # build the count vectors, one dimension per distinct word
    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1

    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1

    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    # An all-zero vector has no direction; treat it as sharing nothing.
    if norm_product == 0:
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)

In [None]:
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise similarity matrix for a list of sentences.

    Args:
        sentences: List of tokenised sentences (lists of word strings).
        stop_words: Stop words forwarded to ``sentence_similarity``.

    Returns:
        A square NumPy array with one row and column per sentence. Entry
        ``[i][j]`` holds the similarity of sentences ``i`` and ``j``; the
        diagonal stays at zero.
    """
    count = len(sentences)
    # Start from all zeros so the untouched diagonal remains 0.
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix

In [None]:
def generate_summary(file_name, top_n):
    """Print an extractive summary of the article stored in ``file_name``.

    Sentences are read with ``read_article``, compared pairwise with
    ``build_similarity_matrix`` using NLTK's English stop words, ranked with
    PageRank over the resulting graph, and the highest-ranked ones printed.

    Args:
        file_name: Path of the text file to summarise.
        top_n: Maximum number of sentences to include. When the article has
            fewer sentences, all of them are used.

    Returns:
        None. The ranking and the summary are written to stdout.
    """
    stop_words = stopwords.words('english')

    # Step 1 - Read text and split it
    sentences = read_article(file_name)

    # Step 2 - Generate similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    # Slicing cannot run past the end of the ranking, so asking for more
    # sentences than exist no longer raises IndexError. max() keeps a
    # negative top_n meaning "no sentences", as range(top_n) did.
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]

    # Step 5 - Of course, output the summarized text
    print("Summarize Text: \n", ". ".join(summarize_text))


In [None]:
generate_summary( "sample1.txt", 5)

Jakarta, CNN Indonesia -- Pusat Pimpinan (PP) Muhammadiyah memutuskan untuk mengalihkan dana mereka dari Bank Syariah Indonesia (BSI) ke sejumlah bank lainnya.

Indexes of top ranked_sentence order are  []


IndexError: list index out of range

In [None]:
sentences =  read_article("sample.txt")


If for some reason you wanted to bring it from a pleasant 20C to boiling point, German firm MAN Energy Solutions has a heat pump that could do it
And it would take less time than Kenneth Branagh's film version of Hamlet."We can do this in less than four hours," explains Raymond Decorvet, who works in business development at MAN Energy
"Or we could freeze the whole thing in about 11 hours."Theirs is among the largest heat pump units in the world
Heat pumps work by compressing gently warmed refrigerants to raise the temperature of these fluids
That heat can then be passed on to homes or industrial machinery.Heat pumps require electricity to work but can produce around three or four kilowatts of heat for every kilowatt of power they consume, making them highly efficient
Plus, some designs can provide cooling as well.Heat pumps are increasingly popular with some home owners but domestic devices are relatively small and tend to have outputs of several kilowatts or so
MAN Energy's biggest co

In [None]:
# stop_words=None: this step-by-step run counts every word, whereas
# generate_summary passes NLTK's English stop-word list.
sentence_similarity_martix = build_similarity_matrix(sentences, stop_words=None)

In [None]:
sentence_similarity_martix

array([[0.        , 0.13710212, 0.09416472, 0.13093073, 0.19518001,
        0.11268723, 0.18574538, 0.12198751],
       [0.13710212, 0.        , 0.12049505, 0.04188539, 0.11707323,
        0.08111071, 0.15597978, 0.18731716],
       [0.09416472, 0.12049505, 0.        , 0.19178532, 0.05360563,
        0.        , 0.10202887, 0.4020422 ],
       [0.13093073, 0.04188539, 0.19178532, 0.        , 0.2981424 ,
        0.12909944, 0.24826442, 0.3354102 ],
       [0.19518001, 0.11707323, 0.05360563, 0.2981424 , 0.        ,
        0.26461887, 0.33704692, 0.20833333],
       [0.11268723, 0.08111071, 0.        , 0.12909944, 0.26461887,
        0.        , 0.18314742, 0.09622504],
       [0.18574538, 0.15597978, 0.10202887, 0.24826442, 0.33704692,
        0.18314742, 0.        , 0.33704692],
       [0.12198751, 0.18731716, 0.4020422 , 0.3354102 , 0.20833333,
        0.09622504, 0.33704692, 0.        ]])

In [None]:
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)

In [None]:
sentence_similarity_graph

<networkx.classes.graph.Graph at 0x130e00650>

In [None]:
pip install pyvis

Collecting pyvis
  Obtaining dependency information for pyvis from https://files.pythonhosted.org/packages/ab/4b/e37e4e5d5ee1179694917b445768bdbfb084f5a59ecd38089d3413d4c70f/pyvis-0.3.2-py3-none-any.whl.metadata
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Using cached pyvis-0.3.2-py3-none-any.whl (756 kB)
Installing collected packages: pyvis
Successfully installed pyvis-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
from pyvis.network import Network

# Hand pyvis a copy of the graph: from_nx rewrites the edge attributes of the
# graph it receives (the "weight" entry is moved to "width"), which would
# leave any later nx.pagerank call running on an unweighted graph.
net = Network(notebook=True)
net.from_nx(sentence_similarity_graph.copy())
net.show('example.html')

example.html


In [None]:
scores = nx.pagerank(sentence_similarity_graph)

In [None]:
scores

{0: 0.12907083196593166,
 1: 0.12907083196593166,
 2: 0.11278750410220478,
 3: 0.12907083196593166,
 4: 0.12907083196593166,
 5: 0.11278750410220478,
 6: 0.12907083196593166,
 7: 0.12907083196593166}

In [None]:
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)


In [None]:
print("Indexes of top ranked_sentence order are ", ranked_sentence)

Indexes of top ranked_sentence order are  [(0.12907083196593166, ['The', 'company', 'recently', 'installed', 'two', 'of', 'these', 'machines', 'in', 'the', 'port', 'city', 'of', 'Esbjerg,', 'in', 'Denmark.In', 'this', 'installation,', 'the', 'heat', "pumps'", 'CO2', 'refrigerant', 'will', 'absorb', 'a', 'small', 'amount', 'of', 'heat', 'from', 'seawater']), (0.12907083196593166, ['That', 'heat', 'can', 'then', 'be', 'passed', 'on', 'to', 'homes', 'or', 'industrial', 'machinery.Heat', 'pumps', 'require', 'electricity', 'to', 'work', 'but', 'can', 'produce', 'around', 'three', 'or', 'four', 'kilowatts', 'of', 'heat', 'for', 'every', 'kilowatt', 'of', 'power', 'they', 'consume,', 'making', 'them', 'highly', 'efficient']), (0.12907083196593166, ['MAN', "Energy's", 'biggest', 'commercial', 'heat', 'pump', 'is', 'thousands', 'of', 'times', 'more', 'powerful', '-', 'with', 'a', 'total', 'heating', 'capacity', 'of', '48', 'megawatts', '(MW).It', 'can', 'produce', 'temperatures', 'of', 'up', 't