# WosClient

In [1]:
# !pip install suds-jurko

from suds import client
from base64 import b64encode as _b64encode
from collections import OrderedDict as _OrderedDict

class WosClient():
    """Query the Web of Science through its SOAP web services.

    ``user`` and ``password`` are only needed to use the premium service.
    Typical use::

        with WosClient() as wos:
            results = wos.search(...)
    """

    base_url = 'http://search.webofknowledge.com'
    auth_url = base_url + '/esti/wokmws/ws/WOKMWSAuthenticate?wsdl'
    search_url = base_url + '/esti/wokmws/ws/WokSearch?wsdl'
    searchlite_url = base_url + '/esti/wokmws/ws/WokSearchLite?wsdl'

    def __init__(self, user=None, password=None, SID=None, close_on_exit=True,
                 lite=False):
        """Create the SOAP clients.

        Args:
            user, password: credentials for premium access; when both are
                given they are sent as an HTTP Basic ``Authorization`` header
                on the authentication client.
            SID: an existing session ID to reuse instead of authenticating.
            close_on_exit: close the session when leaving a ``with`` block or
                when the object is deleted.
            lite: use the WokSearchLite endpoint instead of WokSearch.
        """
        self._SID = SID
        self._close_on_exit = close_on_exit
        search_wsdl = self.searchlite_url if lite else self.search_url
        self._auth = client.Client(self.auth_url)
        self._search = client.Client(search_wsdl)

        if user and password:
            auth = '%s:%s' % (user, password)
            auth = _b64encode(auth.encode('utf-8')).decode('utf-8')
            headers = {'Authorization': ('Basic %s' % auth).strip()}
            self._auth.set_options(headers=headers)

    def __enter__(self):
        """Automatically connect when used with 'with' statements."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close connection after closing the 'with' statement."""
        if self._close_on_exit:
            self.close()

    def __del__(self):
        """Close connection when deleting the object."""
        # __init__ may have raised before the attributes were assigned, so do
        # not assume they exist here.
        if getattr(self, '_close_on_exit', False):
            self.close()

    def connect(self):
        """Authenticate to WOS (unless a SID is already set) and set the SID cookie.

        Returns:
            The session ID in use.
        """
        if not self._SID:
            self._SID = self._auth.service.authenticate()
            print('Authenticated (SID: %s)' % self._SID)

        self._search.set_options(headers={'Cookie': 'SID="%s"' % self._SID})
        self._auth.options.headers.update({'Cookie': 'SID="%s"' % self._SID})
        return self._SID

    def close(self):
        """Close the session, if one is open.

        The stored SID is cleared even when the closeSession call raises, so a
        failed close is not retried with a stale SID (e.g. from ``__del__``).
        """
        if not getattr(self, '_SID', None):
            return
        auth = getattr(self, '_auth', None)
        try:
            if auth is not None:
                auth.service.closeSession()
        finally:
            self._SID = None

    def search(self, query, count=100, offset=1):
        """Perform a query. Check the WOS documentation for v3 syntax.

        Args:
            query: the user query string.
            count: number of records to request.
            offset: index of the first record to return (sent as firstRecord).

        Returns:
            Whatever the SOAP ``search`` operation returns.

        Raises:
            RuntimeError: if no session is open.
        """
        if not self._SID:
            raise RuntimeError('Session not open. Invoke .connect() before.')

        qparams = _OrderedDict([('databaseId', 'WOS'),
                                ('userQuery', query),
                                ('queryLanguage', 'en')])

        rparams = _OrderedDict([('firstRecord', offset),
                                ('count', count),
                                ('sortField', _OrderedDict([('name', 'RS'),
                                                            ('sort', 'D')]))])

        return self._search.service.search(qparams, rparams)

In [2]:

from xml.etree import ElementTree as _ET
from xml.dom import minidom as _minidom
import re as _re

def single(wosclient, wos_query, xml_query=None, count=10, offset=1):
    """Perform a single Web of Science query and then XML query the results.

    Args:
        wosclient: object exposing ``search(query, count, offset)`` whose
            result has a ``records`` attribute holding an XML string.
        wos_query: the query string passed to ``search``.
        xml_query: optional ElementTree path evaluated against the records.
        count: number of records to request.
        offset: index of the first record to request.

    Returns:
        With ``xml_query``: the list of ``.text`` values of the matching
        elements. Without it: the records as a pretty-printed XML string.
    """
    result = wosclient.search(wos_query, count, offset)
    # Drop the first default-namespace declaration so unqualified paths such
    # as './REC/UID' match.
    xml = _re.sub(' xmlns="[^"]+"', '', result.records, count=1).encode('utf-8')
    if xml_query:
        root = _ET.fromstring(xml)
        return [el.text for el in root.findall(xml_query)]
    return _minidom.parseString(xml).toprettyxml()

def query(wosclient, wos_query, xml_query=None, count=10, offset=1, limit=100):
    """Query Web of Science in pages of at most ``limit`` records and merge them.

    Pages start at ``offset`` and advance by ``limit`` for as long as the start
    index is <= ``count``; each page requests
    ``min(limit, count - start + 1)`` records. ``count`` therefore acts as the
    index of the last record requested.

    Returns:
        With ``xml_query``: one flat list with the matches of every page.
        Without it: a single XML string with all pages' record bodies wrapped
        in one ``<records>`` element.
    """
    pages = []
    for first in range(offset, count + 1, limit):
        page_size = min(limit, count - first + 1)
        pages.append(single(wosclient, wos_query, xml_query, page_size, first))

    if xml_query:
        merged = []
        for page in pages:
            merged.extend(page)
        return merged

    # Strip everything up to and including the opening <records> tag, and
    # everything from the closing tag on, leaving only each page's body.
    wrapper = _re.compile(r'.*?<records>|</records>.*', _re.DOTALL)
    body = '\n'.join(wrapper.sub('', page) for page in pages)
    return '<?xml version="1.0" ?>\n<records>' + body + '</records>'

def doi_to_wos(wosclient, doi):
    """Convert DOI to WOS identifier.

    Returns:
        The UID of the first match with its leading 'WOS:' prefix removed, or
        None when the DOI query yields no result.
    """
    results = query(wosclient, 'DO=%s' % doi, './REC/UID', count=1)
    if not results:
        return None
    uid = results[0]
    prefix = 'WOS:'
    # str.lstrip('WOS:') would strip any leading run of the characters
    # 'W', 'O', 'S', ':' and could eat into the identifier; remove the exact
    # prefix instead.
    return uid[len(prefix):] if uid.startswith(prefix) else uid

In [3]:
# Create a client for the lite search endpoint (no user/password given) and
# open a session; the cell's last expression displays the SID that connect()
# returns.
soap = WosClient(lite=True)
soap.connect()

Authenticated (SID: C53pSTtQ5V2zBW7uKJi)


C53pSTtQ5V2zBW7uKJi

In [160]:
import pprint

# Run one author search on the open session and pretty-print the raw result.
# search() is called with its defaults here (count=100, offset=1).
results = soap.search("AU='YUKSEL SEREF' ")
pprint.pprint(results)

(searchResults){
   queryId = "2"
   recordsFound = 39
   recordsSearched = 69042800
   records[] = 
      (liteRecord){
         uid = "WOS:000262673000006"
         title[] = 
            (labelValuesPair){
               label = "Title"
               value[] = 
                  "Does Helicobacter pylori-induced inflammation of gastric mucosa determine the severity of symptoms in functional dyspepsia?",
            },
         doctype[] = 
            (labelValuesPair){
               label = "Doctype"
               value[] = 
                  "Article",
            },
         source[] = 
            (labelValuesPair){
               label = "Issue"
               value[] = 
                  "1",
            },
            (labelValuesPair){
               label = "Pages"
               value[] = 
                  "66-70",
            },
            (labelValuesPair){
               label = "Published.BiblioDate"
               value[] = 
                  "JAN",
            }

In [161]:
# Number of publications the search reports as found
results.recordsFound

39

In [162]:
def _first_source_value(record, label, default=None):
    """Return the first value of the record's source pair labelled `label`, or `default`."""
    for item in record.source:
        if item["label"] == label:
            return item.value[0]
    return default


# Iterate over the records actually returned rather than range(recordsFound):
# recordsFound is the total number of matches, which can exceed the number of
# records in this response.
# NOTE(review): assumes `records` may be absent when nothing matched - confirm.
records = getattr(results, 'records', [])
idx_max = len(records)
dict_list = []
for record in records:
    dict_tmp = {
        # uid
        'uid': record.uid,
        # title of the paper
        'title': record.title[0].value[0],
        # publication type
        'type': record.doctype[0].value[0],
        # name of journal (None when the record has no SourceTitle pair)
        'journal': _first_source_value(record, "SourceTitle"),
        # publication year (None when missing)
        'year': _first_source_value(record, "Published.BiblioYear"),
        # publication date; [] when missing. The previous check
        # `'date' in record.source` compared a string against label/value
        # objects and never matched, so the date was always dropped.
        'date': _first_source_value(record, "Published.BiblioDate", []),
        # co-authors
        'coauthor': record.authors[0].value if 'authors' in record else [],
        # keywords
        'keyword': record.keywords[0].value if 'keywords' in record else [],
    }
    dict_list.append(dict_tmp)

In [52]:
# UID of the first returned record
results.records[0].uid

WOS:000254477800003

In [107]:
# Title of the first record's paper
results.records[0].title[0].value[0]
# Alternative lookup by label instead of by position:
#next((item for item in results.records[0].title if item['label']=='Title')).value

Assessment of genotoxicity in rats treated with the antidiabetic agent, pioglitazone

In [95]:
# Publication type of the first record
results.records[0].doctype[0].value[0]
# Alternative lookup by label instead of by position:
#next((item for item in results.records[0].doctype if item['label']=='Doctype')).value

Article

In [104]:
# Journal name: first value of the source pair labelled "SourceTitle".
# next() has no default here, so a record without that label raises StopIteration.
next((item for item in results.records[0].source if item["label"] == "SourceTitle")).value[0]

ENVIRONMENTAL AND MOLECULAR MUTAGENESIS

In [108]:
# Publication year: first value of the source pair labelled "Published.BiblioYear"
next((item for item in results.records[0].source if item["label"] == "Published.BiblioYear")).value[0]

2008

In [98]:
# Publication date: first value of the source pair labelled "Published.BiblioDate"
next((item for item in results.records[0].source if item["label"] == "Published.BiblioDate")).value[0]

APR

In [100]:
# Co-authors of the first record (list of names)
results.records[0].authors[0].value
# Equivalent purely positional access:
# results[3][0][4][0][1]

[Bedir, Abdulkerim,
 Aliyazicioglu, Yuksel,
 Bilgici, Birsen,
 Yurclakul, Zafer,
 Uysal, Mehmet,
 Suvaci, Duygu Erol,
 Okuyucu, Ali,
 Kahraman, Hakki,
 Hokelek, Murat,
 Alvur, Muhlise]

In [128]:
# Keywords of the first record (list of strings)
results.records[0].keywords[0].value

[genotoxicity, pioglitazone, comet assay, single cell gel electrophoresis]

**Search and save part**

In [4]:
import pandas as pd
import time
import pickle

In [5]:
# Load the author list and drop the 'Unnamed: 0' column
# (presumably a saved index column - TODO confirm).
namelist = pd.read_csv('Publishments.csv').drop(columns='Unnamed: 0')

In [6]:
# Preview only the first rows instead of displaying the whole frame
namelist.head(10)

Unnamed: 0,SearchNames,Author,NoPapers
0,AU=AYDIN AYDIN,AYDIN AYDIN,4
1,AU=GILGIL ERDAL,GILGIL ERDAL,5
2,AU=HALAC METIN,HALAC METIN,63
3,AU=YUCE ABDULHAKIM,YUCE ABDULHAKIM,0
4,AU=KIRIS ABDULKADIR,KIRIS ABDULKADIR,54
5,AU=KUCUKBAYRAK ABDULKADIR,KUCUKBAYRAK ABDULKADIR,30
6,AU=YILDIRIM ABDULKADIR,YILDIRIM ABDULKADIR,33
7,AU=CEKIN ABDULKADIR,CEKIN ABDULKADIR,0
8,AU=SENGUN ABDULKADIR,SENGUN ABDULKADIR,13
9,AU=BEDIR ABDULKERIM,BEDIR ABDULKERIM,38


In [7]:
# Number of non-missing entries in each column
namelist.count()

SearchNames    3575
Author         3575
NoPapers       3575
dtype: int64

3461 of the 3575 authors have at most 100 papers, and 2708 of them have between 1 and 100 papers.
There are 2663 unique names.

In [8]:
# Keep only authors with between 1 and 100 papers (0 < NoPapers <= 100)
paper_counts = namelist.NoPapers
has_1_to_100_papers = (paper_counts <= 100) & (paper_counts > 0)
one_hundred = namelist.loc[has_1_to_100_papers].copy()

In [9]:
# Keep the first row for each author name
one_hundred = one_hundred.drop_duplicates(subset='Author')

In [44]:
# Renumber the rows 0..n-1 and discard the old index
one_hundred = one_hundred.reset_index(drop=True)

In [None]:
# State for the scraping loop below. Re-run this cell before restarting the
# loop, otherwise cnt and author_dict keep their previous contents.
cnt = 0                       # number of authors searched so far
author_dict = {}              # author name -> list of per-paper dicts
subfolder = './scrab_data'    # directory the raw per-author dumps are written to

In [54]:
def lite_record_to_dict(record):
    """Flatten one lite search record into a plain dict.

    Keys: uid, title, type, journal, year, date, coauthor, keyword.
    journal/year are None and date/coauthor/keyword are [] when the record
    does not carry the corresponding field.
    """
    def first_source_value(label, default=None):
        # The source field is a list of label/value pairs; pick one by label.
        for pair in record.source:
            if pair["label"] == label:
                return pair.value[0]
        return default

    return {
        'uid': record.uid,
        'title': record.title[0].value[0],
        'type': record.doctype[0].value[0],
        'journal': first_source_value("SourceTitle"),
        'year': first_source_value("Published.BiblioYear"),
        # The previous check `'date' in record.source` compared a string with
        # label/value objects and never matched, so the date was always [].
        'date': first_source_value("Published.BiblioDate", []),
        'coauthor': record.authors[0].value if 'authors' in record else [],
        'keyword': record.keywords[0].value if 'keywords' in record else [],
    }


for author in one_hundred.Author:
    # Use the loop variable directly; it used to be overwritten from
    # one_hundred.iloc[j] with a `j` that no cell defines.
    print(author)
    results = soap.search("AU='%s'" % author)

    # Keep a raw text dump of the response next to the parsed data.
    with open('%s/%s.txt' % (subfolder, author), 'w', encoding='utf-8') as f:
        for item in results:
            f.write("%s\n" % str(item))

    # Parse the records actually returned; recordsFound is the total number of
    # matches and can exceed the number of records in this response.
    # NOTE(review): assumes `records` may be absent when nothing matched - confirm.
    author_dict[author] = [lite_record_to_dict(record)
                           for record in getattr(results, 'records', [])]

    cnt = cnt + 1

    if cnt % 2400 == 0:  # WOS constraints that each ID can only search for 2500 times. Just set smaller counts to renew the ID.
        # Rebinding `soap` releases the previous client, whose __del__ closes
        # the old session.
        soap = WosClient(lite=True)
        soap.connect()

    if cnt % 100 == 0:
        print('=================================')
        print('=================================')
        print(cnt/len(one_hundred.Author)*100,'%')
        print('=================================')
        print('=================================')

    # Throttle requests to the service
    time.sleep(0.5)

BALTA SERKAN
BOYAR SERKAN
ELCIN SERKAN
ERDAL SERKAN
KARATAS SERKAN
KIRBAS SERKAN
KURTGOZ SERKAN
PERKMEN SERKAN
CAKIR SERKAN
OZTURK SERKAN
SAHIN SERKAN
SENOL SERKAN
YAGCI SERVER
DEMIR SERVET
KACAR SEVAL
SEZER SEVGI
YUCA SEVIL
GOKSUGUR SEVIL
KAMALI SEVIL
BERTLEK SEYDI
OKUMUS SEYDI
EMIR SEYFI
KOSE SEYIT
APILIOGULLARI SEZA
KALAFAT SEZAI
TEMELLI SEZAI
SASMAZ SEZAI
OCAK SEZEN
KISA SEZER
OZSOY SECKIN
ARSLAN SIDDIK
MALKOC SIDDIK
BAYTAK SITKI
DUMAN SITKI
ERMIS SITKI
CORBACIOGLU SITKI
AKKOC SONER
ASKIN SUAT
KESKIN SUAT
ZEYREK SUAT
BISKIN SULTAN
KABA SULTAN
KURT SULTAN
BASAK SUNA
SACAR SUZAN
YAZICI SUZAN
KARAKUS SUZAN
AYHAN SUZI
ARSLAN SONMEZ
PAMUK SONMEZ
DIKICI SUBER
CARSANCAKLI SULEYMAN
ALTINTAS SULEYMAN
AYVAZ SULEYMAN
BASLAR SULEYMAN
CENGIZ SULEYMAN
CENKCI SULEYMAN
DEMIR SULEYMAN
DEMIRCI SULEYMAN
ERCAN SULEYMAN
GOREN SULEYMAN
SEVILGEN SULEYMAN
SAHIN SULEYMAN
KAHRAMAN SULEYMAN
KARACELIL SULEYMAN
KARSLI SULEYMAN
KAVAK SULEYMAN
KIZIL SULEYMAN
SARIBAS SULEYMAN
SEMIZ SULEYMAN
SEYDI SULEYMAN
TURK SU

KOCA IRFAN
UCGUN IRFAN
YILMAZ IRFAN
SAKA IRFAN
SIAP IRFAN
PARMAKSIZ ISKENDER
ACAR ISMAIL
AKKAS ISMAIL
AKSOY ISMAIL
ALBAYRAK ISMAIL
ARSLAN ISMAIL
ERDEN ISMAIL
BOYRAZ ISMAIL
CAPAR ISMAIL
DEMIR ISMAIL
DOGAN ISMAIL
GUNES ISMAIL
EREN ISMAIL
ERMIS ISMAIL
ERTURK ISMAIL
GUL ISMAIL
UZUN ISMAIL
97.6342470897484 %
UGURTAS ISMAIL
KARA ISMAIL
KATI ISMAIL
KENAR ISMAIL
KURNAZ ISMAIL
KURT ISMAIL
KOKSAL ISMAIL
KULAHLI ISMAIL
MARULCU ISMAIL
METIN ISMAIL
UCUN ISMAIL
ULUCAY ISMAIL
YILDIZ ISMAIL
YUKSEL ISMAIL
CAKIR ISMAIL
UYSAL ISMAIL
OZTURK ISMAIL
SAHIN ISMAIL
SENER ISMAIL
AKCA ISMET
BASARAN ISMET
DURAN ISMET
GOCER ISMET
OZDEMIR ISMET
OZTURK ISMET
BABACAN ISRAFIL
DOS IZZET
CICEKBILEK IZZET
GUCLU SABAN
CETIN SABAN
YILMAZ SAZIYE
AKSOY SEFIKA
EKIM SEFIKA
KARACA SEMSETTIN
SAHIN SEMSETTIN
KARABUGA SEMISTAN
TASDEMIR SENER
SISE SENGUL
OZDEMIR SENNUR
OKAY SENOL
AYKUT SEREF
KURT SEREF
OKUDUCU SEREF
OLGAR SEREF
YUKSEL SEREF
SIMSEK SEREF
KAHRAMAN SERIF
AKALIN SERIFE
ARIKAN SEVKET
ATA SEVKET
CIVELEK SEVKET
TARIM SEYD

In [114]:
# Shallow copy: `result` is a new dict, but the per-author lists and the
# per-paper dicts inside them are still the same objects as in author_dict.
result = author_dict.copy()

Remove the ',' between family name and given name, and convert the co-author names to upper case.

In [116]:
# Normalise co-author names ("Family, Given" -> "FAMILY GIVEN").
# Build fresh lists and dicts for `result` instead of assigning into the
# existing record dicts: those are shared with author_dict (result is only a
# shallow copy), so in-place assignment would also rewrite the raw data.
for author in one_hundred.Author:
    result[author] = [
        dict(record,
             coauthor=[name.replace(",", "").upper() for name in record['coauthor']])
        for record in result[author]
    ]

In [117]:
# Spot-check the normalised co-author names of one paper
result['AYDIN AYDIN'][0]['coauthor']

['KUL SIBEL', 'AYDIN AYDIN', 'DINC HASAN', 'ERDURAN EROL']

Save the dictionary, then load it back to check the round trip.

In [118]:
# the output file is 49.4Mb
with open('author_dict.txt', 'wb') as handle:
    pickle.dump(result, handle)

In [119]:
# Read the pickled dictionary back from disk.
# Only unpickle files you created yourself: pickle can run arbitrary code.
with open('author_dict.txt', 'rb') as pickle_file:
    b = pickle.load(pickle_file)

In [120]:
# Same spot-check on the reloaded dictionary
b['AYDIN AYDIN'][0]['coauthor']

['KUL SIBEL', 'AYDIN AYDIN', 'DINC HASAN', 'ERDURAN EROL']

In [121]:
# Confirm that the reloaded dictionary equals the one that was saved
result == b

True