In [30]:
import json
import os
from urllib.parse import quote

import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from IPython.display import HTML, display
from tabulate import tabulate

from bs4 import BeautifulSoup

## PhantomJS for the first request

In [2]:
# Open the IEEE Xplore search page in a headless PhantomJS browser and
# collect the PDF links of the first page of results.
driver = webdriver.PhantomJS()
base_url = 'http://ieeexplore.ieee.org/search/searchresult.jsp'
query = 'Generative Adversarial Networks'
# Percent-encode the search text so that spaces and reserved characters
# such as '&' or '#' cannot corrupt the query string.
driver.get(base_url + '?newsearch=true&rowsPerPage=10&queryText=%s' % quote(query, safe=''))

# wait for the ajax to load: block (up to 60 s) until at least one
# element with the "icon-pdf" class is present in the DOM
WebDriverWait(driver, 60).until(
    EC.presence_of_element_located((By.CLASS_NAME, "icon-pdf")))

# Use the generic locator API; the find_elements_by_* helpers are deprecated.
elems = driver.find_elements(By.CLASS_NAME, 'icon-pdf')
pdfs = [elem.get_attribute('href') for elem in elems]

pdfs

['http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8094357',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8124931',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8063435',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7934380',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8106771',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8099685',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8100224',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8100008',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8099501',
 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8099694']

## Get the cookies and send other queries

In [3]:
# Flatten the browser's cookie records into a plain name -> value mapping.
cookies = dict((c['name'], c['value']) for c in driver.get_cookies())

In [4]:
# Search endpoint that the request below is POSTed to.
rest_url = "http://ieeexplore.ieee.org/rest/search"

# Search parameters; all values are strings.
data = dict(queryText="CNN", newsearch="true", rowsPerPage="100")

# HTTP headers sent with the search request.
headers = dict([
    ("Accept", "application/json, text/plain, */*"),
    ("Content-Type", "application/json;charset=utf-8"),
])

In [5]:
# Reuse one HTTP session for the search request and the later page fetches.
session = requests.Session()
# Store the browser cookies on the session itself: cookies passed as a
# per-request argument are not persisted, so later session.get() calls
# would otherwise be sent without them.
session.cookies.update(cookies)
# POST the search as a JSON body; the timeout keeps the cell from hanging
# forever if the server never answers.
response = session.post(rest_url, json=data, headers=headers, timeout=60)

In [6]:
# Quick sanity check: HTTP status and size of the response body in characters.
status, body_size = response.status_code, len(response.text)
print(status)
print(body_size)

200
268750


In [7]:
# Decode the JSON body and keep the list stored under its 'records' key.
records = response.json()['records']

print(len(records))
# Show only a small sample; dumping every record floods the notebook output.
records[:3]

100


[{'abstract': 'The rapid development of machine learning is enabling a plenty of novel applications such as image and speech recognition for embedded and mobile devices. However, state-of-the-art deep learning models like convolutional neural networks (CNNs) are demanding so much on-chip storage and compute resources that they cannot be smoothly handled by low-power mobile or embedded systems. In order to fit la...',
  'accessType': {'message': 'Included in Your Digital Subscription',
   'type': 'subscribed'},
  'articleContentType': 'Early Access Articles',
  'articleNumber': '8165964',
  'authors': [{'id': None,
    'nativeName': None,
    'normalizedName': 'Y. Wang',
    'preferredName': 'Ying Wang',
    'searchablePreferredName': 'Ying Wang'},
   {'id': None,
    'nativeName': None,
    'normalizedName': 'H. Li',
    'preferredName': 'Huawei Li',
    'searchablePreferredName': 'Huawei Li'},
   {'id': None,
    'nativeName': None,
    'normalizedName': 'X. Li',
    'preferredName': 

In [8]:
# Column captions and, in the same order, the record keys that fill them.
header = ['Title', 'id', 'doi', 'pdflink']
record_keys = ('title', 'articleNumber', 'doi', 'pdfLink')

# One row per record, one cell per selected key.
table = [[record[key] for key in record_keys] for record in records]

# Render the rows as an HTML table with a running index column.
table_html = tabulate(table, headers=header, showindex="always", tablefmt='html')
display(HTML(table_html))

Unnamed: 0,Title,id,doi,pdflink
0,A Case of On-chip Memory Sub-system Design for Low-Power [::CNN::] Accelerators,8165964,10.1109/TCAD.2017.2778060,/stamp/stamp.jsp?tp=&arnumber=8165964
1,Joint Hand Detection and Rotation Estimation Using [::CNN::],8128503,10.1109/TIP.2017.2779600,/stamp/stamp.jsp?tp=&arnumber=8128503
2,Appearance-based gaze block estimation via [::CNN::] classification,8122270,10.1109/MMSP.2017.8122270,/stamp/stamp.jsp?tp=&arnumber=8122270
3,Adaptive feature learning [::CNN::] for behavior recognition in crowd scene,8120636,10.1109/ICSIPA.2017.8120636,/stamp/stamp.jsp?tp=&arnumber=8120636
4,Ship detection in SAR images based on an improved faster R-[::CNN::],8124934,10.1109/BIGSARDATA.2017.8124934,/stamp/stamp.jsp?tp=&arnumber=8124934
5,Novel hybrid [::CNN::]-SVM model for recognition of functional magnetic resonance images,8122741,10.1109/SMC.2017.8122741,/stamp/stamp.jsp?tp=&arnumber=8122741
6,Welding defect classification based on convolution neural network ([::CNN::]) and Gaussian kernel,8124091,10.1109/ISITIA.2017.8124091,/stamp/stamp.jsp?tp=&arnumber=8124091
7,Everything is Image: [::CNN::]-based Short-Term Electrical Load Forecasting for Smart Grid,8121794,10.1109/ISPAN-FCST-ISCC.2017.78,/stamp/stamp.jsp?tp=&arnumber=8121794
8,A late fusion approach for harnessing multi-[::cnn::] model high-level features,8122666,10.1109/SMC.2017.8122666,/stamp/stamp.jsp?tp=&arnumber=8122666
9,Diabetic retinopathy screening based on [::CNN::],8124433,10.23919/ELMAR.2017.8124433,/stamp/stamp.jsp?tp=&arnumber=8124433


## Retrieve the PDF page and find the PDF URL

In [22]:
# Fetch the viewer page for one article through the existing session.
pdf1_html = session.get('http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8094357')

# Print the status of the response just fetched (the original printed the
# undefined name `pdf_html1`, which raises NameError on a fresh kernel).
print(pdf1_html.status_code)

200


In [24]:
# Parse the fetched page with the lxml parser.
pdf1_soup = BeautifulSoup(pdf1_html.text, 'lxml')

# Print the indented markup so the page structure can be inspected.
pdf1_pretty = pdf1_soup.prettify()
print(pdf1_pretty)

<html>
 <head>
  <script charset="utf-8" src="http://staticieeexplore.ieee.org/assets/vendor/jquery/jquery.js?cv=20171107_000000" type="text/javascript">
  </script>
  <!-- Fingerprint Cookie -->
  <script src="http://staticieeexplore.ieee.org/assets/vendor/js-cookie/src/js.cookie.js?cv=20171107_000000" type="text/javascript">
  </script>
  <script src="http://staticieeexplore.ieee.org/assets/vendor/fingerprintjs2/fingerprint2.js?cv=20171107_000000" type="text/javascript">
  </script>
  <script src="http://staticieeexplore.ieee.org/assets/js/lib/core/fingerprint.js?cv=20171107_000000" type="text/javascript">
  </script>
  <script type="text/javascript">
   Xplore.Fingerprint.init();
  </script>
  <!-- BEGIN: Unica Page Tag -->
  <!-- Copyright 2004-2006 Unica Corporation.  All rights reserved. -->
  <script language="JavaScript">
   var PAGE_TAGGING = 'ON';
			var NTPT_IMAGE_LOCATION = 'http://tagxplore.ieee.org/';
			var unValue =  '7863';
			var NTPT_PGEXTRA="un="+unValue+"&ip="+'200

## Download the PDF into a local file (maybe using a multi-threaded download manager later)

In [38]:
# Take the `src` of the last <iframe> on the parsed page as the PDF address.
iframes = pdf1_soup.find_all('iframe')
if not iframes:
    raise RuntimeError('no <iframe> found in the page; cannot locate the PDF URL')
pdf1_url = iframes[-1]['src']

# Keep everything up to and including the first "pdf" in the URL and drop
# whatever follows it. Fail loudly when "pdf" is absent: str.find() would
# return -1 and the slice would silently yield a 2-character string.
pdf_pos = pdf1_url.find('pdf')
if pdf_pos == -1:
    raise ValueError('iframe src does not contain "pdf": %r' % pdf1_url)
pdf1_url_clean = pdf1_url[:pdf_pos + 3]

# Make sure the target directory exists before opening the output file.
os.makedirs('data', exist_ok=True)

# Stream the body to disk; the `with` block releases the connection even
# if the download fails part-way through.
with session.get(pdf1_url_clean, stream=True) as pdf1_request:
    # Do not write an HTTP error page to disk as if it were a PDF.
    pdf1_request.raise_for_status()
    with open('data/pdf1.pdf', 'wb') as fd:
        # 8 KiB chunks instead of 128 bytes: far fewer write calls.
        for chunk in pdf1_request.iter_content(chunk_size=8192):
            fd.write(chunk)