In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM; the dataset paths defined later live under this mount.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
!pip install -U biopython
!pip install -U pandas



## Import libraries

In [None]:
import itertools
import os
import re

import pandas as pd
from Bio import Entrez

In [None]:
def search(query:str, max_results:int):
    """
    Search PubMed for ``query`` and return the parsed ESearch result.

    Parameters
    ----------
    query : str
        Search expression, passed to ``Entrez.esearch`` as ``term``.
    max_results : int
        Maximum number of ids to return (``retmax``).

    Returns
    -------
    The object produced by ``Entrez.read`` (callers in this notebook read its
    ``'IdList'`` entry), or ``None`` when the request or the parsing fails.
    """
    Entrez.email = 'your.email@example.com'
    try:
        handle = Entrez.esearch(db='pubmed',
                                sort='relevance',
                                retmax=max_results,
                                retmode='xml',
                                term=query
                                )
        try:
            return Entrez.read(handle)
        finally:
            # Release the response even when parsing fails.
            handle.close()
    except Exception:
        # Best effort: any request/parsing error is reported as None.
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate and a long-running loop can be stopped.
        return None

def fetch_details(id_list:list):
    """
    Fetch the PubMed records for a list of paper ids.

    Parameters
    ----------
    id_list : list
        PubMed ids as strings; they are comma-joined into one EFetch request.

    Returns
    -------
    The object produced by ``Entrez.read`` (callers in this notebook read its
    ``'PubmedArticle'`` entry), or ``None`` when there is nothing to fetch or
    the request/parsing fails.
    """
    if not id_list:
        # Nothing to fetch: skip the network round-trip. Callers already treat
        # None as "no papers".
        return None
    try:
        ids = ','.join(id_list)
        Entrez.email = 'your.email@example.com'
        handle = Entrez.efetch(db='pubmed',
                               retmode='xml',
                               id=ids)
        try:
            return Entrez.read(handle)
        finally:
            # Release the response even when parsing fails.
            handle.close()
    except Exception:
        # Best effort: any request/parsing error is reported as None.
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate and a long-running loop can be stopped.
        return None

#### ✎ Only some keywords return relevant results

#### Try the most effective keywords

In [None]:
# Free-text PubMed queries tried in the loop below, one per line.
effective_kw = [
    'nutrition and mental health',
    'nutrition and feel',
    'nutrition and mood',
    'food and psychology',
    'food and mental health',
    'food and mood',
    'nutrient and psychology',
    'nutrient and mental health',
]

from Bio import Entrez

# For each trial query, show the title (and abstract, when present) of the top 3 hits.
for kw in effective_kw:
    print('-----------\n\nQuery:', kw)
    results = search(kw, 3)
    if results is None:
        # search() returns None on failure; indexing it would raise TypeError.
        print('(search failed, skipping)')
        continue
    id_list = results['IdList']
    papers = fetch_details(id_list)
    if papers is None:
        continue
    for i, paper in enumerate(papers['PubmedArticle'], start=1):
        article = paper['MedlineCitation']['Article']
        print("\n{}) {}".format(i, article['ArticleTitle']))
        abstract = article.get('Abstract')
        if abstract is not None:
            print("----\n{}".format(abstract.get('AbstractText')))


-----------

Query: nutrition and mental health

1) Promoting Mental Health and Wellness in Youth Through Physical Activity, Nutrition, and Sleep.
----
['The medical benefits to youth conferred by physical activity, balanced nutrition, and quality sleep have been increasingly encouraged by medical and mental health providers. Emerging evidence continues to reveal benefits for youth mental health and well-being, including for youth with psychiatric disorders. This evidence seems multifactorial through both neurobiological and psychosocial systems, with common mechanisms present between physical activity, nutrition, and sleep. This article reviews the benefits of optimizing physical activity, nutrition, and sleep; how to assess these lifestyle domains with patients and their parents; and appropriate interventions to optimize well-being in youth.']

2) Nutrition and behavioral health disorders: depression and anxiety.
----
['Suboptimal nutrition has been implicated in the underlying patho

#### ✎ The abstracts of these results do not clearly describe the relationship between food and mental health.

-------------------------

# Get papers with Food terms and Mental Health terms

## Create keywords

**Note:** Some keywords in these entity files have been manually modified.

In [None]:
# Root of the dataset tree on the mounted Drive; the data and results folders hang off it.
GENERAL_PATH = "/content/drive/MyDrive/gena-db-master/src/DATASET"
DATA_PATH = f"{GENERAL_PATH}/data"
RESULT_PATH = f"{GENERAL_PATH}/results"

In [None]:
def _read_lines(path):
    """Return every line of a UTF-8 text file, line terminators included."""
    with open(path, 'r', encoding='utf-8') as entity_file:
        return list(entity_file)


# One entity per line; the trailing newline is kept here and stripped when the
# keyword pairs are built.
foods = _read_lines(f'{RESULT_PATH}/entities/nutrition_entities.txt')
chebis = _read_lines(f'{RESULT_PATH}/entities/chebi_entities.txt')
mental_healths = _read_lines(f'{RESULT_PATH}/entities/mental_health_entities.txt')

len(foods), len(chebis), len(mental_healths)

(2910, 198, 676)

In [None]:
# Drop the line terminator from every entity read from the files.
food_terms = [line.replace('\n', '') for line in foods + chebis]
mood_terms = [line.replace('\n', '') for line in mental_healths]

# Cartesian product: each food/chemical term paired with each mental-health term.
combine_keywords = list(itertools.product(food_terms, mood_terms))
print('There are totally', len(combine_keywords), 'keywords.')
print('Some examples:')
print(combine_keywords[:20])

There are totally 2101008 keywords.
Some examples:
[('hawkfish family', 'opiate dependence'), ('hawkfish family', 'death anxiety'), ('hawkfish family', 'vascular dementia'), ('hawkfish family', 'reactive depression'), ('hawkfish family', 'depersonalization disorder'), ('hawkfish family', 'dysthymia'), ('hawkfish family', 'perseveration'), ('hawkfish family', 'ophidiophobia'), ('hawkfish family', 'sommatization'), ('hawkfish family', 'transient tic disorder'), ('hawkfish family', 'orgasmic disorder'), ('hawkfish family', 'thought blocking'), ('hawkfish family', 'anxiety disorders'), ('hawkfish family', 'sleepiness'), ('hawkfish family', 'voyeurism'), ('hawkfish family', 'central sleep apnea syndrome'), ('hawkfish family', 'pseudodementia'), ('hawkfish family', 'gender identity disorder'), ('hawkfish family', 'astereognosia'), ('hawkfish family', 'cocaine abuse')]


## Get Papers

In [None]:
from Bio import Entrez
abstracts = []

# Offset into combine_keywords at which this run starts.
# NOTE(review): the keyword cell reported 2,101,008 pairs, so an offset of
# 2,950,000 selects nothing — confirm the intended resume point.
START_INDEX = 2950000
pending_pairs = combine_keywords[START_INDEX:]
if not pending_pairs:
    print('WARNING: start index', START_INDEX, 'is beyond the',
          len(combine_keywords), 'keyword pairs; nothing will be fetched.')

for (food, mood) in pending_pairs:
    print(food, ' ==== ', mood)
    # Require both terms to occur in the title or abstract.
    results = search(food + '[Title/Abstract] AND ' + mood + '[Title/Abstract]', max_results=3)
    if results is None:
        continue
    papers = fetch_details(results['IdList'])
    if papers is None:
        continue
    for paper in papers['PubmedArticle']:
        citation = paper['MedlineCitation']
        article = citation['Article']
        abstract = article.get('Abstract')
        if abstract is None:
            continue
        sections = abstract.get('AbstractText')
        if not sections:
            # An Abstract element without text would otherwise raise on [0].
            continue
        # Only the first AbstractText entry is kept, as before.
        abstracts.append((citation['PMID'], article['ArticleTitle'], sections[0]))

In [None]:
import pandas as pd

# Build the frame from the records collected by the fetch loop. The previous
# version replaced `abstracts` with two hard-coded example rows at this point,
# so the placeholder rows — not the fetched papers — ended up in the saved CSV.
# Passing the column names to the constructor keeps the schema even when the
# list is empty, where assigning `.columns` afterwards would fail.
abstracts_df = pd.DataFrame(abstracts, columns=['PMID', 'Title', 'Abstract'])

if not abstracts_df.empty:
    print(abstracts_df.tail(5))
else:
    print("DataFrame is empty.")


  PMID          Title          Abstract
0  123  Example Title  Example Abstract
1  456  Another Title  Another Abstract


In [None]:
# Persist this batch of fetched abstracts; the row index is not written.
raw_batch_csv = f"{DATA_PATH}/papers_raw/papers_16.csv"
abstracts_df.to_csv(raw_batch_csv, index=False)

-------------------------------------

## Filter and clean papers

In [None]:
papers = pd.read_csv(f"{DATA_PATH}/papers_raw/final_papers.csv")

# NOTE(review): the previous loop read this same file 15 times. Duplicates are
# dropped below, so one read gives the identical result. If the loop was meant
# to walk numbered batch files, the path needs the loop index — confirm.
add_papers = [papers, pd.read_csv(f"{DATA_PATH}/papers_raw/final_papers_2.csv")]

# Keep only the text columns, drop repeated rows (first occurrence wins),
# discard rows missing a title or an abstract, and renumber from 0.
papers = (
    pd.concat(add_papers)[["Title", "Abstract"]]
    .drop_duplicates(keep='first')
    .dropna(subset=["Title", "Abstract"])
    .reset_index(drop=True)
)
papers.head(5)

Unnamed: 0,Title,Abstract
0,Rapid onset of functional tic-like behaviours ...,Clinicians have reported an increase in functi...
1,Cannabis Improves Obsessive-Compulsive Disorde...,Although several lines of evidence support the...
2,Is Persistent Motor or Vocal Tic Disorder a Mi...,Persistent motor or vocal tic disorder (PMVT) ...
3,Rage attacks in Tourette Syndrome and Chronic ...,Tourette syndrome (TS) and chronic motor/vocal...
4,Pharmacotherapy for tics in adult patients wit...,Tourette syndrome (TS) and persistent motor/vo...


In [None]:
def getTitle(title):
    """
    Strip the square brackets that enclose a whole title.

    A title of the form ``[Some title]`` or ``[Some title].`` is returned as
    ``Some title`` (presumably PubMed's marking for translated titles — confirm).
    The pattern must span the entire string, so a title that merely *starts*
    with a bracketed token, e.g. ``[18F]FDG PET ...``, is returned unchanged
    instead of being truncated to ``18F``.

    Parameters
    ----------
    title : str
        Raw article title. Non-string values (e.g. NaN from pandas) are
        returned as they are.

    Returns
    -------
    The title without its enclosing brackets, or the input unchanged.
    """
    if not isinstance(title, str):
        # re.match would raise TypeError on non-strings; pass them through.
        return title
    whole_title_regex = r"^\[(.*)\]\.?\s*$"
    bracketed = re.match(whole_title_regex, title)
    if bracketed:
        return bracketed.group(1)
    return title

In [None]:
def removeHTML(content):
    """Return ``content`` with every ``<...>`` span (an HTML-style tag) deleted."""
    import re
    tag_pattern = re.compile(r"\<[^>]*\>")
    return tag_pattern.sub('', content)

In [None]:
# Write the filtered papers; the row index is stored in a column named "ID".
# NOTE(review): this is the same path the filtering cell reads as its input,
# so that file is overwritten in place.
final_csv_path = f"{DATA_PATH}/papers_raw/final_papers.csv"
papers.to_csv(final_csv_path, index_label="ID")