In [1]:
#from Bio import Entrez
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import itertools
import string
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 1000)

In [2]:
# !pip install bio
# !pip install seaborn

In [3]:
def search(keyword):
    '''
    returns IDList of research articles related the keyword
    
    Arg:
        keyword (str): keyword of the interest
        
    return:
        IDList (Dict): List of publication IDs related to the keywords
    '''
    
    Entrez.email = 'stawar59@gmail.com'
    pyapim = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='1000',
                            retmode='xml', 
                            term=keyword)
    lst_id = Entrez.read(pyapim)
    return lst_id

In [4]:
def fetch_details(id_list):
    '''
    returns article information from pubmed
    
    Arg: 
        id_list (dict): id list of publications returned from serach function
        
    return:
        results (dict): full information of articles 
    '''
    iDs = ','.join(id_list)
    Entrez.email = 'stawar59@gmail.com'
    pyapim = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=iDs)
    opt = Entrez.read(pyapim)
    return opt

In [5]:
def Author_list(papers):
    print(papers['PubmedArticle'])
    author_lst=[i['MedlineCitation']['Article']['AuthorList']['CoauthorList']\
                      for i in papers['PubmedArticle']]
    dff=[pd.DataFrame(author_lst[i]) for i in range(len(author_lst))]
    names_df=pd.concat(dff, axis=0, sort=True )
    author_df=names_df[['ForeName', 'LastName', 'CoauthorList']]\
                    .groupby(['ForeName', 'LastName','CoauthorList']).size()\
                    .reset_index(name='count').sort_values(by='count', ascending=False)
    top=author_df.head(10)
    google_url='https://scholar.google.co.kr/scholar?hl=ko&as_sdt=0%2C5&q='
    name=top['ForeName']+' '+top['LastName']+' '+top['CoauthorList']
    opt=top.reset_index(drop=True).join(pd.DataFrame({'Google Scholar':[google_url+i for i in name.str.replace(' ', '+')+'+review&oq=']}))
    
    return opt

In [6]:
def key_from_papers(papers):
    fetch_key_word_papers=[i['MedlineCitation']['KeywordList'] for i in papers['PubmedArticle']]
    lst_key_papers=list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(fetch_key_word_papers))))
    paper_ki=pd.DataFrame({'key word from papers':lst_key_papers})
    kiword=paper_ki['key word from papers'].str.lower()
    return kiword

In [7]:
def title_key(papers):
    tlt_mod=[i['MedlineCitation']['Article']['ArticleTitle'].lower()\
            .replace(',','').replace('.','').replace(':', '').replace('?','')\
            .replace('<sub>', '').replace('</sub>','').replace('<sup>','').replace('</sup>','')\
            .replace('<i>','').replace('</i>','')\
            .replace(search_word.lower(),'') for i in papers['PubmedArticle']]
    model=TfidfVectorizer(ngram_range=(2,2),stop_words='english')
    X=model.fit_transform(tlt_mod)
    model_df=pd.DataFrame(X.todense(), columns=sorted(model.vocabulary_))
    key_idx=model_df.sum().sort_values(ascending=False)
    
    return key_idx


In [8]:
def Abstract_key(papers):
    
    abrlst=[]
    for i in papers['PubmedArticle']:
        try:
            abrlst.append(i['MedlineCitation']['Article']['Abstract']['AbstractText'][0].lower()\
            .replace(',','').replace('.','').replace(':', '').replace('?','')\
            .replace('<sub>', '').replace('</sub>','').replace('<sup>','').replace('</sup>','')\
            .replace('<i>','').replace('</i>','').replace(search_word.lower(),''))
        except:
            continue
    model=TfidfVectorizer(ngram_range=(2,2),stop_words='english')
    X=model.fit_transform(abrlst)
    model_df=pd.DataFrame(X.todense(), columns=sorted(model.vocabulary_))
    key_idx=model_df.sum().sort_values(ascending=False)
    
    return key_idx

In [9]:
search_word='bioactive'

In [10]:
results = search(search_word)
id_list = results['IdList']
papers = fetch_details(id_list)

NameError: name 'Entrez' is not defined

In [None]:
df=Author_list(papers)
df

In [None]:
sns.countplot(x ="count", data=df)
 
# Show the plot
plt.show()

In [None]:
df['Google Scholar']

In [None]:
key_paper_lst=key_from_papers(papers)

In [None]:
key_paper_lst=key_from_papers(papers)
key_paper_lst.value_counts().head(20)

In [None]:
a=key_paper_lst.value_counts().head(20)
a=a.tolist()
a=pd.DataFrame({"count":a})

b= key_paper_lst.head(20)
b=b.tolist()
b=pd.DataFrame({"words":b})

dff = pd.concat([b,a],axis=1)
dff

In [None]:
import plotly.express as px
fig = px.bar(dff, x='words', y='count')
fig.show()

In [None]:
from wordcloud import WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

STOPWORDS.add('https')  # remove htps to the world Cloud

def Plot_world(text):
    
    comment_words = ' '
    stopwords = set(STOPWORDS) 
    
    for val in text: 

        # typecaste each val to string 
        val = str(val) 

        # split the value 
        tokens = val.split() 

        # Converts each token into lowercase 
        for i in range(len(tokens)): 
            tokens[i] = tokens[i].lower() 

        for words in tokens: 
            comment_words = comment_words + words + ' '


    wordcloud = WordCloud(width = 5000, height = 4000, 
                    background_color ='black', 
                    stopwords = stopwords, 
                    min_font_size = 10).generate(comment_words) 

    # plot the WordCloud image                        
    plt.figure(figsize = (12, 12), facecolor = 'k', edgecolor = 'k' ) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 

    plt.show()

In [None]:
text = dff.words.values

Plot_world(text)

In [None]:
type(papers)

In [None]:
papers

In [None]:
key_title=title_key(papers)
key_title

In [None]:
(key_title-key_title.mean())/key_title.std()

In [None]:
key_abstract=Abstract_key(papers)
key_abstract

In [None]:
key_abstract[:20]

In [None]:
(key_abstract-key_abstract.mean())/key_abstract.std()

In [None]:
# search_word = "bone"
search_word+' '+key_abstract.index[0]