<a href="https://colab.research.google.com/github/Siddhartha15/BioMedical-TextSummarization/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# !pip install wikipedia-api




In [11]:
import wikipediaapi


def get_article(topic):
    wiki = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)
    try:
        p_wiki = wiki.page(topic)
    except:
        print("Page " + topic + " Exists: False")
        quit()

    return p_wiki.text, p_wiki.summary

In [12]:
import string

# import nltk
from textblob import TextBlob

import nltk
nltk.download('punkt')
nltk.download('stopwords')

def tokenize(raw):
    sent = nltk.sent_tokenize(raw)
    sent = list(map(lambda x: x.translate(string.punctuation), sent))
    sent = [word for word in sent if len(word) >= 10]
    return sent


def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
  
import pandas as pd

columns = ['name', 'f', 'p', 'r', 'text']


def gen_serie(name, rouge, text):
    return pd.Series([
        name,
        rouge[0]['rouge-l']['f'],
        rouge[0]['rouge-l']['p'],
        rouge[0]['rouge-l']['r'],
        text],
        index=columns)

In [6]:
# !pip install rouge

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0


In [7]:

from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# from formatting import gen_serie
# from tokenizer import textblob_tokenizer


def cosine(texts, ref):
    vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                          stop_words='english',
                          use_idf=True)
    matrix = vec.fit_transform(texts)

    cosine_similarities = cosine_similarity(matrix[0:1], matrix).flatten()

    nb_sentences_in_base_summary = len(ref.split('.'))

    cosine_similarities = list(cosine_similarities)
    cos_results = []
    for i in range(0, nb_sentences_in_base_summary):
        n = cosine_similarities.index(max(cosine_similarities))
        cos_results.append(texts[n])
        del cosine_similarities[n]

    res = ' '.join(cos_results)

    r = Rouge()
    rouge = r.get_scores(res, ref)

    return gen_serie('Cosine Similarity', rouge, res)

In [8]:
import warnings

import numpy as np
import pandas as pd
import scipy.sparse
from rouge import Rouge
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_argmin_min

# from formatting import gen_serie
# from tokenizer import textblob_tokenizer, tokenize

warnings.filterwarnings("ignore")


def k_mean_distance(data, cx, cy, i_centroid, cluster_labels):
    distances = [np.sqrt((x - cx) ** 2 + (y - cy) ** 2) for (x, y) in data[cluster_labels == i_centroid]]
    return distances


def delete_row_lil(mat, i):
    if not isinstance(mat, scipy.sparse.lil_matrix):
        raise ValueError("works only for LIL format -- use .tolil() first")
    mat.rows = np.delete(mat.rows, i)
    mat.data = np.delete(mat.data, i)
    mat._shape = (mat._shape[0] - 1, mat._shape[1])
    return mat


def cluster(texts, ref, clusters_nb):
    vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                          stop_words='english',
                          use_idf=True)
    matrix = vec.fit_transform(texts)

    km = KMeans(n_clusters=clusters_nb, max_iter=10000, init='k-means++').fit(matrix)

    sentences = []

    nb_sentences_in_base_summary = len(tokenize(ref))
    cnt = 0
    for i in range(0, len(texts)):
        closest, dist = pairwise_distances_argmin_min(km.cluster_centers_, matrix)
        for idx in closest:
            sentences.append(texts[idx])
            cnt += 1
            if cnt == nb_sentences_in_base_summary:
                break
        else:
            for idx in closest:
                length, _ = matrix.shape
                if idx < length:
                    matrix = delete_row_lil(matrix.tolil(), idx)
            continue
        break

    final = [x for x in texts if x in sentences]
    return ' '.join(final)


def kmean(text, ref):
    df = pd.DataFrame()

    for i in range(2, 11):
        res = cluster(text, ref, i)
        r = Rouge()
        rouge = r.get_scores(' '.join(res), ref)
        df = df.append(gen_serie('K-mean-' + str(i), rouge, res), ignore_index=True)

    return df

In [9]:
import heapq
import re

# import nltk


def compute(sentences, stopwords, formatted):
    word_frequencies = {}
    sentence_scores = {}

    for word in nltk.word_tokenize(formatted):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequency = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    for sent in sentences:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    return sentence_scores


def text_rank(raw, text, ref):
    nb = len(ref.split('.'))

    formatted = re.sub('[^a-zA-Z]', ' ', raw)
    formatted = re.sub(r'\s+', ' ', formatted)

    scores = compute(text, nltk.corpus.stopwords.words('english'), formatted)
    sentences = heapq.nlargest(nb, scores, key=scores.get)

    return ' '.join(sentences)

In [13]:
import pandas as pd
from gensim.summarization import summarize
from rouge import Rouge

# from cosine_similarity import cosine
# from formatting import gen_serie, columns
# from kmean import kmean
# from scraper import get_article
# from text_rank import text_rank
# from tokenizer import tokenize


def computeal(topic):
    raw, ref = get_article(topic)

    sent = tokenize(raw)

    df = pd.DataFrame()

    ratio = len(ref) / len(raw)

    # TextRank
    result = text_rank(raw, sent, ref)

    r = Rouge()
    rouge = r.get_scores(result, ref)

    df = df.append(gen_serie('TextRank', rouge, result), ignore_index=True)

    # Gensim
    ret = summarize(raw, ratio)
    r = Rouge()
    rouge = r.get_scores(ret, ref)
    df = df.append(gen_serie('Gensim', rouge, ret), ignore_index=True)

    # KMean
    df = df.append(kmean(sent, ret))

    # Cosine
    df = df.append(cosine(sent, ref), ignore_index=True)

    # Rearrange columns
    df = df[columns]

    # df.to_csv('out/' + topic + '.csv')

    return df.to_json(orient='records')

computeal("Apple")

'[{"name":"TextRank","f":0.2644067747,"p":0.24375,"r":0.2888888889,"text":"Five species of aphids commonly attack apples: apple grain aphid, rosy apple aphid, apple aphid, spirea aphid, and the woolly apple aphid. Apples are milled or pressed to produce apple juice, which may be drunk unfiltered (called apple cider in North America), or filtered. Apples are also made into apple butter and apple jelly. An apple is an edible fruit produced by an apple tree (Malus domestica). When cooked, some apple cultivars easily form a puree known as apple sauce. Apple scab: Apple scab causes leaves to develop olive-brown spots with a velvety texture that later turn brown and become cork-like in texture. The larvae of the apple clearwing moth (red-belted clearwing) burrow through the bark and into the phloem of apple trees, potentially causing significant damage. Other pests that affect apple trees include Codling moths and apple maggots. The only apples native to North America are crab apples, which 