### Chapter9 - Scientific Q&A Platforms:

### Import Libraries:

In [3]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import transformers
import logging
transformers.logging.get_verbosity = lambda: logging.NOTSET
import torch
import tensorflow
from transformers import pipeline

### Import Data:

In [10]:
from pymed import PubMed
pubmed = PubMed()

query = "environmental biotechnology"

results = pubmed.query(query, max_results=1000)

In [11]:
articleList = []
for article in tqdm(results):
    articleDict = article.toDict()
    articleList.append(articleDict)

1000it [00:06, 156.60it/s]


In [12]:
df = pd.DataFrame(articleList)
df.head()

Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,authors,methods,conclusions,results,copyrights,doi,xml
0,34425576,"Activating Natural Killer Cell Receptors, Sele...",Expression of the extensively glycosylated Ebo...,"[Ebolavirus glycoprotein, HPV, Natural cytotox...",Journal of innate immunity,2021-08-24,"[{'lastname': 'Jarahian', 'firstname': 'Mostaf...",,,,© 2021 The Author(s) Published by S. Karger AG...,10.1159/000517628,"[[[], [<Element 'Year' at 0x0000025C3978E3B0>,..."
1,34425554,Effect of Akt activation on apoptosis-related ...,The current study investigated whether the exp...,"[Akt signaling pathway, apoptosis, crop milk, ...",Poultry science,2021-08-24,"[{'lastname': 'Xie', 'firstname': 'P', 'initia...",,,,Copyright © 2021 The Authors. Published by Els...,10.1016/j.psj.2021.101392,"[[[], [<Element 'Year' at 0x0000025C35CA6130>,..."
2,34425520,An integrated NAD,NAD,"[Dehydrogenase-based biosensor, Elimination of...",Biosensors & bioelectronics,2021-08-24,"[{'lastname': 'Song', 'firstname': 'Haiyan', '...",,,,Copyright © 2021 Elsevier B.V. All rights rese...,10.1016/j.bios.2021.113573,"[[[], [<Element 'Year' at 0x0000025C35CA9950>,..."
3,34425478,"Design, synthesis, and antitumor activity eval...",Steroidal compounds were proven to be efficien...,"[Antitumour activity, Epoxides, Olefins, Oxime...",Bioorganic & medicinal chemistry,2021-08-24,"[{'lastname': 'Gomes', 'firstname': 'Ana R', '...",,,,Copyright © 2021 Elsevier Ltd. All rights rese...,10.1016/j.bmc.2021.116360,"[[[], [<Element 'Year' at 0x0000025C35CACF90>,..."
4,34425441,Impact of chicken litter pre-application treat...,Treatment of manures prior to land application...,"[Antibiotic resistance, Chicken litter, Compos...",The Science of the total environment,2021-08-24,"[{'lastname': 'Subirats', 'firstname': 'Jessic...",,,,Crown Copyright © 2021. Published by Elsevier ...,10.1016/j.scitotenv.2021.149718,"[[[], [<Element 'Year' at 0x0000025C35CB0F40>,..."


In [39]:
df = df[["pubmed_id", "title", "abstract", "keywords", "journal", "publication_date", "doi"]]
df.head()

Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,doi
0,34425576,"Activating Natural Killer Cell Receptors, Sele...",Expression of the extensively glycosylated Ebo...,"[Ebolavirus glycoprotein, HPV, Natural cytotox...",Journal of innate immunity,2021-08-24,10.1159/000517628
1,34425554,Effect of Akt activation on apoptosis-related ...,The current study investigated whether the exp...,"[Akt signaling pathway, apoptosis, crop milk, ...",Poultry science,2021-08-24,10.1016/j.psj.2021.101392
2,34425520,An integrated NAD,NAD,"[Dehydrogenase-based biosensor, Elimination of...",Biosensors & bioelectronics,2021-08-24,10.1016/j.bios.2021.113573
3,34425478,"Design, synthesis, and antitumor activity eval...",Steroidal compounds were proven to be efficien...,"[Antitumour activity, Epoxides, Olefins, Oxime...",Bioorganic & medicinal chemistry,2021-08-24,10.1016/j.bmc.2021.116360
4,34425441,Impact of chicken litter pre-application treat...,Treatment of manures prior to land application...,"[Antibiotic resistance, Chicken litter, Compos...",The Science of the total environment,2021-08-24,10.1016/j.scitotenv.2021.149718


### Sentence Transformers:

In [167]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-TinyBERT-L6-v2')

In [168]:
docs = ["My first paragraph. That contains information", "Python is a programming language."]

In [172]:
docs = df["abstract"].values

In [177]:
document_embeddings = model.encode(docs)

In [178]:
query = "What are steroidal compounds proven to do?"
query_embedding = model.encode(query)

In [179]:
print("Similarity:", util.pytorch_cos_sim(query_embedding, document_embeddings))

Similarity: tensor([[ 8.9598e-02,  1.2240e-01,  1.9424e-01,  4.8254e-01,  1.1814e-01,
          2.0367e-01,  1.4933e-01,  5.2991e-02,  2.2114e-01,  3.1849e-02,
          6.6480e-02,  9.2596e-02,  6.4210e-02,  1.2116e-01,  1.2205e-01,
          9.9618e-02,  1.4555e-01,  3.0704e-03,  1.2252e-01,  1.2551e-01,
          6.2500e-02,  1.7781e-02,  1.8447e-01,  1.3289e-01, -6.7766e-02,
          1.3289e-01,  1.1897e-01,  4.4311e-02,  4.0585e-02,  9.3084e-02,
          6.7600e-02, -3.9858e-02,  2.3015e-01,  5.2281e-02,  1.9038e-01,
          1.8571e-01,  9.7743e-02,  3.2882e-01, -5.1931e-02,  9.2845e-02,
          2.0760e-01, -3.1844e-02,  5.8609e-02,  2.2579e-01,  3.0205e-01,
          1.0740e-01,  1.2252e-01,  5.4497e-02,  1.0068e-01,  1.2776e-01,
          9.3011e-02,  7.2960e-03,  6.8634e-02,  1.1790e-01,  1.8488e-01,
          5.7039e-02,  7.0202e-02,  1.6987e-01,  1.2852e-01,  3.6330e-01,
         -1.9999e-03,  2.1357e-01,  1.1928e-01,  5.0524e-02,  1.9264e-01,
          2.0326e-01,  1.0

In [176]:
distances = util.pytorch_cos_sim(query_embedding, document_embeddings)

In [183]:
import scipy

In [184]:
queries = ["What are steroidal compounds proven to do?"]
query_embeddings = model.encode(queries)

In [187]:
closest_n = 5

In [195]:
corpus = df["abstract"].values
pubmed_id = df["pubmed_id"].values
title = df["title"].values


In [197]:
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], document_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===",query,"=====")
    print("=========================================================")


    for idx, distance in results[0:closest_n]:
        print("Score:   ", "(Score: %.4f)" % (1-distance) , "\n" )
        print("Paragraph:   ", corpus[idx].strip(), "\n" )
        row_dict = df.loc[df.index== corpus[idx]].to_dict()
        print("paper_id:  " , row_dict["pubmed_id"][corpus[idx]] , "\n")
        print("Title:  " , row_dict["title"][corpus[idx]] , "\n")
        print("-------------------------------------------")


Top 5 most similar sentences in corpus:


=== What are steroidal compounds proven to do? =====
Score:    (Score: 0.4926) 

Paragraph:    Steroidal compounds were proven to be efficient drugs against several types of cancer. Oximes are also chemical structures frequently associated with anticancer activity. The main goal of this work was to combine the two referred structures by synthesizing steroidal oximes and evaluating them in several cancer cell lines. Compounds (17E)-5α-androst-3-en-17-one oxime (3,4 - OLOX), (17E)-3α,4α-epoxy-5α-androstan-17-one oxime (3,4 - EPOX), (17E)-androst-4-en-17-one oxime (4,5 - OLOX) and (17E)-4α,5α-epoxyandrostan-17-one oxime (4,5 - EPOX) were synthesized and their cytotoxicity evaluated in four human cancer cell lines, namely colorectal adenocarcinoma (WiDr), non-small cell lung cancer (H1299), prostate cancer (PC3) and hepatocellular carcinoma (HepG2). A human non-tumour cell line, CCD841 CoN (normal colon cell line) was also used. MTT assay, flow cy

KeyError: 'Steroidal compounds were proven to be efficient drugs against several types of cancer. Oximes are also chemical structures frequently associated with anticancer activity. The main goal of this work was to combine the two referred structures by synthesizing steroidal oximes and evaluating them in several cancer cell lines. Compounds (17E)-5α-androst-3-en-17-one oxime (3,4 - OLOX), (17E)-3α,4α-epoxy-5α-androstan-17-one oxime (3,4 - EPOX), (17E)-androst-4-en-17-one oxime (4,5 - OLOX) and (17E)-4α,5α-epoxyandrostan-17-one oxime (4,5 - EPOX) were synthesized and their cytotoxicity evaluated in four human cancer cell lines, namely colorectal adenocarcinoma (WiDr), non-small cell lung cancer (H1299), prostate cancer (PC3) and hepatocellular carcinoma (HepG2). A human non-tumour cell line, CCD841 CoN (normal colon cell line) was also used. MTT assay, flow cytometry, fluorescence and hemocompatibility techniques were performed to further analyse the cytotoxicity of the compounds. 3,4 - OLOX was the most effective compound in decreasing tumour cell proliferation in all cell lines, especially in WiDr (IC'