In [None]:
!pip install pymed
!pip install mechanize
!pip install html2text
!pip install pandas

In [None]:
import pandas as pd
import requests
import io
import pymed
import mechanize
import re
import time
import sys
import html2text
import http.cookiejar
import numpy as np

In [None]:
# Mount Google Drive into the Colab VM at /content/gdrive.
# NOTE(review): later cells use the relative path 'gdrive/MyDrive/...', which only points at
# this mount when the working directory is /content — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

# 1. Download the full gene dataset (with OMIM IDs) from HGNC

In [None]:
# HGNC custom-download URL: tab-separated export of approved and withdrawn entries,
# including the OMIM ID column.
url = 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&col=md_mim_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit'

# Fail loudly on an HTTP error: silently skipping the parse would leave `df`
# undefined (or stale from an earlier run) and break the following cells.
r = requests.get(url, timeout=60)
r.raise_for_status()

data = r.content.decode('utf8')
df = pd.read_csv(io.StringIO(data), sep='\t')

df.shape

In [None]:
# (rows, columns) of the subset of genes that carry an OMIM ID.
omim_id_present = df['OMIM ID(supplied by OMIM)'].notna()
df.loc[omim_id_present].shape

We have 47461 genes, and 16304 of them have an OMIM ID.

In [None]:
# Persist the HGNC gene table (step 1 output) to Google Drive.
# Local alternative: df.to_csv('Data/1_genenames.csv', index=False)
genenames_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/1_genenames.csv'
df.to_csv(genenames_csv, index=False)

# 2. Parse PubMed ID for genes which are in HGMD and OMIM

We create a parser that looks up each gene by name on http://www.hgmd.org/

If the gene is available there, the parser follows its OMIM link and collects all of the PubMed IDs listed for that gene

## 2.1. Login

In [None]:
# URL of the HGMD login page that login_hgmd() opens.
hgmd_login_url = "http://www.hgmd.cf.ac.uk/docs/login.html"
# HGMD account credentials; left blank here and expected to be filled in before logging in.
# `email_address` is also sent with the PubMed requests later in the notebook.
# NOTE(review): do not commit real credentials — consider reading them from the environment.
email_address = ""
password = ""

def initialize_browser():
    """Create and configure the mechanize browser used to talk to HGMD.

    The browser is given an LWP cookie jar, has equiv/redirect/referer handling
    switched on and robots handling switched off, follows refreshes with a
    maximum time of 1, and sends a Firefox-like ``User-agent`` header plus an
    ``email`` header holding the module-level ``email_address``.

    Returns:
        The configured ``mechanize.Browser`` instance.
    """
    agent = mechanize.Browser()

    # Session cookies live in a cookie jar attached to the browser.
    agent.set_cookiejar(http.cookiejar.LWPCookieJar())

    # Handler switches (gzip handling is intentionally left at its default).
    agent.set_handle_equiv(True)
    agent.set_handle_redirect(True)
    agent.set_handle_referer(True)
    agent.set_handle_robots(False)

    # Follow "refresh 0" redirects, but do not hang on refreshes with a longer delay.
    refresh_processor = mechanize._http.HTTPRefreshProcessor()
    agent.set_handle_refresh(refresh_processor, max_time=1)

    # For debugging, the browser's set_debug_http / set_debug_redirects /
    # set_debug_responses switches can be enabled here.

    # Present a regular browser User-agent and pass the account e-mail as a header.
    agent.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
        ('email', email_address),
    ]

    return agent


def login_hgmd(browser):
    """Log in to HGMD through the first form on the login page.

    Opens the module-level ``hgmd_login_url``, selects the first form, fills its
    ``email`` and ``password`` controls from the module-level ``email_address``
    and ``password``, submits it, and prints debugging output along the way.
    Page contents are rendered with html2text so the debug output is readable
    text instead of a raw bytes literal.

    Args:
        browser: A configured ``mechanize.Browser`` (see ``initialize_browser``).

    Returns:
        The same ``browser`` object, after the form has been submitted.
        NOTE(review): the response is only printed; whether the login actually
        succeeded is not checked here.
    """
    H2T = html2text.HTML2Text()

    def _readable(raw_html):
        # The response body normally arrives as bytes; decode leniently so an
        # unexpected byte cannot break the debug printing.
        if isinstance(raw_html, bytes):
            raw_html = raw_html.decode('utf-8', errors='replace')
        return H2T.handle(raw_html)

    response = browser.open(hgmd_login_url)
    html = response.read()

    # print response to STDOUT for debugging purposes
    # the html2text library is used for formatting the output in a more readable form
    print('Response:')
    print(_readable(html))
    print('---------')

    # print all the forms in the current page
    print('Forms:')
    print(list(browser.forms()))
    print('---------')

    # select login form (the first form on the page)
    browser.select_form(nr=0)
    print('Login form:')
    print(browser.form)
    print('---------')

    # print all controls in the current form, for debugging purposes
    print('Form controls:')
    print([c.name for c in browser.form.controls])
    print('---------')

    # set username and password
    browser.form['email'] = email_address
    browser.form['password'] = password

    # submit form
    response_form = browser.submit()

    # The page contents should now reflect the logged-in state; print them for inspection.
    html_response = response_form.read()
    print('Login form response:')
    print(_readable(html_response))
    print('---------')

    # wait 2 seconds to not overload the server
    time.sleep(2)

    return browser

# Build the browser and log in to HGMD; `br` is the session used by the scraping loop.
br = login_hgmd(initialize_browser())

## 2.2. Parse

We iterate over all genes that have an OMIM ID and collect every PubMed article linked to them

In [None]:
# Keep only the genes for which HGNC supplies an OMIM ID, then preview the first rows.
has_omim_id = df['OMIM ID(supplied by OMIM)'].notna()
df_omim = df[has_omim_id]
df_omim.head()

In [None]:
search_ngmd_url = 'http://www.hgmd.cf.ac.uk/ac/index.php'
gene_url = 'http://www.hgmd.cf.ac.uk/ac/gene.php?gene={}'

total = df_omim.shape[0]

genes_in_hgmd = 0
pubmed_articles_num = 0

# OMIM entry link pattern. Raw strings avoid invalid-escape warnings, and `+`
# (instead of `*`) requires at least one digit so int() below never sees ''.
pattern = re.compile(r'(http://omim\.org/entry/[0-9]+)')

# PubMed id pattern, as it appears in the OMIM page text: "[PubMed: 12345]"
pubmed_p = re.compile(r'\[PubMed: ([0-9]+)\]')

# Column-wise accumulator: one entry per (gene, PubMed article) pair.
result_pubmed_dict = {'gene_symb': [], 'omim_id': [], 'pubmed_id': []}

for cur, symb in enumerate(df_omim['Approved symbol']):
    print('Processed:', cur, '/', total, 'Genes in HGMD:', genes_in_hgmd, 'PubMed Articles Num:', pubmed_articles_num)

    # Both patterns are pure ASCII, so undecodable bytes can be replaced
    # rather than aborting a run that takes hours.
    response = br.open(gene_url.format(symb))
    html = response.read().decode('utf-8', errors='replace')

    omim_links = pattern.findall(html)

    # A gene counts as "in HGMD" when its HGMD page contains an OMIM entry link.
    if omim_links:
        genes_in_hgmd += 1
        omim_link = omim_links[0]
        omim_id = int(omim_link.rsplit('/', 1)[-1])

        response = br.open(omim_link)
        html = response.read().decode('utf-8', errors='replace')

        pubmed_ids = pubmed_p.findall(html)
        pubmed_articles_num += len(pubmed_ids)

        for pid in pubmed_ids:
            result_pubmed_dict['gene_symb'].append(symb)
            result_pubmed_dict['omim_id'].append(omim_id)
            result_pubmed_dict['pubmed_id'].append(int(pid))

    # Pause between genes so the servers are not overloaded.
    time.sleep(1)

In [None]:
# Turn the collected (gene, OMIM id, PubMed id) columns into a table.
result_pubmed_df = pd.DataFrame(data=result_pubmed_dict)

# Persist the step 2 output to Google Drive.
# Local alternative: result_pubmed_df.to_csv('Data/2_true_pubmedids_part1.csv', index=False)
true_pubmedids_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/2_true_pubmedids_part1.csv'
result_pubmed_df.to_csv(true_pubmedids_csv, index=False)

In [None]:
# Report how many distinct genes and distinct PubMed articles were collected.
distinct_genes = pd.unique(result_pubmed_df['gene_symb'])
distinct_articles = pd.unique(result_pubmed_df['pubmed_id'])
print('Number of genes:', len(distinct_genes))
print('Number of PubMed Articles:', len(distinct_articles))

# 3. Download titles and abstract from PubMed

## 3.1. Download random PubMed articles

In [None]:
# PubMed min and max id to download random samples
pubmed_min_id = 1
pubmed_max_id = 33297669

# Number of random PubMed ids drawn for the negative class.
negative_class_size = 200000

# PubMed settings: tool name and E-utilities efetch URL template (id, tool, email).
tool_name = 'MyTool'
pubmed_query = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&tool={}&email={}'

# The patterns below are raw strings so regex escapes such as `\s` are not
# treated as (invalid) Python string escapes.

# title pattern: captures everything after `title { name "` up to the next `}`
title_pattern = re.compile(r'title\s*\{\s*name\s*"([^}]+)')

# abstract pattern: captures the text after `abstract "` up to the next double quote
abstract_pattern = re.compile(r'abstract\s*"([^"]+)')

def get_pubmed_title_and_abstract(id):
    """Fetch one PubMed record and extract its title and abstract.

    The record is requested through the module-level ``pubmed_query`` template
    (filled with ``id``, ``tool_name`` and ``email_address``) and parsed with
    ``title_pattern`` / ``abstract_pattern`` after newlines and tabs are removed.

    Args:
        id: PubMed id of the article to fetch.

    Returns:
        A ``(title, abstract)`` tuple. ``title`` is ``None`` when the request
        failed or no title was found; ``abstract`` is ``None`` when there is no
        title or no abstract match.
    """
    title = None
    abstract = None

    try:
        # Without a timeout a stalled connection would block the download loop forever.
        r = requests.get(pubmed_query.format(id, tool_name, email_address), timeout=30)

        # Only parse successful responses; an HTTP error body is not a record.
        if r.ok:
            clean_text = r.text.replace('\n', '').replace('\t', '')
            title_match = title_pattern.search(clean_text)

            if title_match:
                raw_title = title_match.group(1)
                # The capture runs up to the closing brace; cut it at the last double quote.
                title = raw_title[:raw_title.rfind('"')]

                abstract_match = abstract_pattern.search(clean_text)
                if abstract_match:
                    abstract = abstract_match.group(1)
    except requests.RequestException as exc:
        # Report and skip this id instead of aborting a long-running download loop.
        print('Request failed for PubMed id', id, ':', exc)
    finally:
        # Throttle consecutive requests.
        time.sleep(0.2)

    return title, abstract

In [None]:
# Draw the negative-class ids uniformly, without replacement, from the inclusive range
# [pubmed_min_id, pubmed_max_id]. The generator is seeded so the sample is reproducible.
NEGATIVE_SAMPLING_SEED = 42
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)
negative_ids = (
    rng.choice(pubmed_max_id - pubmed_min_id + 1, negative_class_size, replace=False)
    + pubmed_min_id
).tolist()

# Column-wise accumulator for the articles that could actually be fetched.
results = { 'pubmed_id': [], 'title': [], 'abstract': [] }

real_num = 0

for cur, pubmed_id in enumerate(negative_ids):
    print('Processed:', cur, '/', negative_class_size, 'Not Nones:', real_num)

    title, abstract = get_pubmed_title_and_abstract(pubmed_id)

    # Ids without a retrievable title are skipped.
    if title is not None:
        results['pubmed_id'].append(pubmed_id)
        results['title'].append(title)
        results['abstract'].append(abstract)
        real_num += 1

In [None]:
# Table of the randomly sampled (negative-class) articles.
false_articles_df = pd.DataFrame(results)

# Articles without an abstract get an empty string instead of a missing value.
abstract_missing = false_articles_df['abstract'].isna()
false_articles_df.loc[abstract_missing, 'abstract'] = ''

# Keep the title text before the first '",' and trim surrounding '[' / ']' / '.' characters.
title_head = false_articles_df['title'].str.split('",', n=1, expand=True)[0]
false_articles_df['title'] = title_head
false_articles_df['title'] = false_articles_df['title'].str.lstrip('[').str.rstrip('].')

# Persist to Google Drive.
# Local alternative: false_articles_df.to_csv('Data/3_false_articles.csv', index=False)
false_articles_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_false_articles.csv'
false_articles_df.to_csv(false_articles_csv, index=False)

## 3.2. Download true PubMed articles

In [None]:
# Distinct PubMed ids gathered from OMIM form the positive class.
positive_ids = list(result_pubmed_df['pubmed_id'].unique())

# Column-wise accumulator for the positive articles that could be fetched.
true_results = { 'pubmed_id': [], 'title': [], 'abstract': [] }

real_num = 0

for cur, pmid in enumerate(positive_ids):
    print('Processed:', cur, '/', len(positive_ids), 'Not Nones:', real_num)

    title, abstract = get_pubmed_title_and_abstract(pmid)
    if title is None:
        # Nothing retrievable for this id; move on.
        continue

    true_results['pubmed_id'].append(pmid)
    true_results['title'].append(title)
    true_results['abstract'].append(abstract)
    real_num += 1

In [None]:
# Table of the positive-class articles.
true_articles_df = pd.DataFrame(true_results)

# Missing abstracts become empty strings.
no_abstract = true_articles_df['abstract'].isna()
true_articles_df.loc[no_abstract, 'abstract'] = ''

# Keep the title text before the first '",' and trim surrounding '[' / ']' / '.' characters.
first_part = true_articles_df['title'].str.split('",', n=1, expand=True)[0]
true_articles_df['title'] = first_part
true_articles_df['title'] = true_articles_df['title'].str.lstrip('[').str.rstrip('].')

# Persist to Google Drive.
# Local alternative: true_articles_df.to_csv('Data/3_true_articles.csv', index=False)
true_articles_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_true_articles.csv'
true_articles_df.to_csv(true_articles_csv, index=False)

## 3.3. Remove articles from the false set if they are in the true set

In [None]:
# Reload both article tables from Google Drive, replacing missing values with ''.
# Local alternatives: 'Data/3_true_articles.csv' and 'Data/3_false_articles.csv'.
true_articles_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_true_articles.csv'
false_articles_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_false_articles.csv'
true_articles_df = pd.read_csv(true_articles_csv).fillna('')
false_articles_df = pd.read_csv(false_articles_csv).fillna('')

# A randomly sampled article that is also a known positive must not stay in the negative set.
is_known_positive = false_articles_df['pubmed_id'].isin(positive_ids)
false_articles_df = false_articles_df[~is_known_positive]

In [None]:
# Row counts of the positive and (filtered) negative article tables.
n_true = len(true_articles_df)
n_false = len(false_articles_df)
print('true_articles_df:', n_true)
print('false_articles_df:', n_false)

## 3.4. Concat DataSets

In [None]:
# Label the classes (1 = positive, 0 = negative). `.assign` returns new frames, which
# avoids the SettingWithCopyWarning raised when a column is set on the filtered
# `false_articles_df` slice, and keeps this cell safe to re-run.
true_articles_df = true_articles_df.assign(label=int(True))
false_articles_df = false_articles_df.assign(label=int(False))

# Stack both classes into one table with a fresh 0..n-1 index.
all_articles_df = pd.concat([true_articles_df, false_articles_df]).reset_index(drop=True)

In [None]:
# Save the combined, labelled dataset to Google Drive, next to the other step outputs
# (every other save cell in this notebook writes there).
# Local alternative: all_articles_df.to_csv('Data/3_all_articles.csv', index=False)
all_articles_df.to_csv('gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_all_articles.csv', index=False)

## 3.5. Create validation set

In [None]:
# Fraction of positive (label == 1) articles in the combined dataset.
all_articles_df['label'].sum() / len(all_articles_df)

In [None]:
# Hold out 2000 random articles. `random_state` is fixed so the same split is
# produced on every run.
test = all_articles_df.sample(2000, random_state=42)

# Fraction of positive articles in the held-out sample, to compare with the full dataset.
test.label.sum() / test.shape[0]

In [None]:
# (rows, columns) of the combined dataset, shown before the held-out rows are removed.
all_articles_df.shape

In [None]:
# The training set is every article whose index label was not drawn into `test`.
held_out_index = test.index
TRAIN_articles_df = all_articles_df.drop(index=held_out_index)

In [None]:
# Persist the train / held-out splits to Google Drive.
# Local alternatives: 'Data/3_TRAIN_articles.csv' and 'Data/3_TEST_articles.csv'.
train_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_TRAIN_articles.csv'
test_csv = 'gdrive/MyDrive/Биоинформатика/ИБ/Проект/Data/3_TEST_articles.csv'
TRAIN_articles_df.to_csv(train_csv, index=False)
test.to_csv(test_csv, index=False)