# WosClient

In [1]:
import pandas as pd
import time
import pickle
import math

In [2]:
# !pip install suds-jurko

from suds import client
from base64 import b64encode as _b64encode
from collections import OrderedDict as _OrderedDict

class WosClient():
    """Query the Web of Science.
       You must provide user and password only to user premium WWS service.
       with WosClient() as wos:
           results = wos.search(...)"""

    base_url = 'http://search.webofknowledge.com'
    auth_url = base_url + '/esti/wokmws/ws/WOKMWSAuthenticate?wsdl'
    search_url = base_url + '/esti/wokmws/ws/WokSearch?wsdl'
    searchlite_url = base_url + '/esti/wokmws/ws/WokSearchLite?wsdl'

    def __init__(self, user=None, password=None, SID=None, close_on_exit=True,
                 lite=False):
        """Create the SOAP clients. user and password for premium access."""

        self._SID = SID
        self._close_on_exit = close_on_exit
        search_wsdl = self.searchlite_url if lite else self.search_url
        self._auth = client.Client(self.auth_url)
        self._search = client.Client(search_wsdl)

        if user and password:
            auth = '%s:%s' % (user, password)
            auth = _b64encode(auth.encode('utf-8')).decode('utf-8')
            headers = {'Authorization': ('Basic %s' % auth).strip()}
            self._auth.set_options(headers=headers)

    def __enter__(self):
        """Automatically connect when used with 'with' statements."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close connection after closing the 'with' statement."""
        if self._close_on_exit:
            self.close()

    def __del__(self):
        """Close connection when deleting the object."""
        if self._close_on_exit:
            self.close()

    def connect(self):
        """Authenticate to WOS and set the SID cookie."""
        if not self._SID:
            self._SID = self._auth.service.authenticate()
            print('Authenticated (SID: %s)' % self._SID)

        self._search.set_options(headers={'Cookie': 'SID="%s"' % self._SID})
        self._auth.options.headers.update({'Cookie': 'SID="%s"' % self._SID})
        return self._SID

    def close(self):
        """Close the session."""
        if self._SID:
            self._auth.service.closeSession()
            self._SID = None

    def search(self, query, count=100, offset=1):
    # def search(self, query, offset=1):
        """Perform a query. Check the WOS documentation for v3 syntax."""
        if not self._SID:
            raise RuntimeError('Session not open. Invoke .connect() before.')

        qparams = _OrderedDict([('databaseId', 'WOS'),
                                ('userQuery', query),
                                ('queryLanguage', 'en')])

        rparams = _OrderedDict([('firstRecord', offset),
                                ('count', count),
                                ('sortField', _OrderedDict([('name', 'RS'),
                                                            ('sort', 'D')]))])

        return self._search.service.search(qparams, rparams)

In [3]:

from xml.etree import ElementTree as _ET
from xml.dom import minidom as _minidom
import re as _re

def single(wosclient, wos_query, xml_query=None, count=10, offset=1):
    """Perform a single Web of Science query and then XML query the results."""
    result = wosclient.search(wos_query, count, offset)
    print (result)
    xml = _re.sub(' xmlns="[^"]+"', '', result.records, count=1).encode('utf-8')
    if xml_query:
        xml = _ET.fromstring(xml)
        return [el.text for el in xml.findall(xml_query)]
    else:
        return _minidom.parseString(xml).toprettyxml()

def query(wosclient, wos_query, xml_query=None, count=10, offset=1, limit=100):
    """Query Web of Science and XML query results with multiple requests."""
    results = [single(wosclient, wos_query, xml_query, min(limit, count-x+1), x) for x in range(offset, count+1, limit)]
    if xml_query:
        return [el for res in results for el in res]
    else:
        pattern = _re.compile(r'.*?<records>|</records>.*', _re.DOTALL)
        return ('<?xml version="1.0" ?>\n<records>' +
                '\n'.join(pattern.sub('', res) for res in results) +
                '</records>')

def doi_to_wos(wosclient, doi):
    """Convert DOI to WOS identifier."""
    results = query(wosclient, 'DO=%s' % doi, './REC/UID', count=1)
    return results[0].lstrip('WOS:') if results else None

In [4]:
def pub_of_author(Querystring):
    results = soap.search(Querystring, offset = 1)
    if results.recordsFound > 100:
        new_records = results.records
        for i in range(math.ceil(results.recordsFound/100)-1):
            new_records = new_records + soap.search(Querystring, offset = (i+1)*100+1).records
        results.records = new_records
    return results


In [5]:
soap = WosClient(lite=True)
soap.connect()

Authenticated (SID: E2gKEWbsjBhxtJxeDdq)


E2gKEWbsjBhxtJxeDdq

**Search and save part**

In [6]:
namelist = pd.read_csv('Publishments.csv').drop(columns='Unnamed: 0')

In [7]:
namelist

Unnamed: 0,SearchNames,Author,NoPapers
0,AU=AYDIN AYDIN,AYDIN AYDIN,4
1,AU=GILGIL ERDAL,GILGIL ERDAL,5
2,AU=HALAC METIN,HALAC METIN,63
3,AU=YUCE ABDULHAKIM,YUCE ABDULHAKIM,0
4,AU=KIRIS ABDULKADIR,KIRIS ABDULKADIR,54
5,AU=KUCUKBAYRAK ABDULKADIR,KUCUKBAYRAK ABDULKADIR,30
6,AU=YILDIRIM ABDULKADIR,YILDIRIM ABDULKADIR,33
7,AU=CEKIN ABDULKADIR,CEKIN ABDULKADIR,0
8,AU=SENGUN ABDULKADIR,SENGUN ABDULKADIR,13
9,AU=BEDIR ABDULKERIM,BEDIR ABDULKERIM,38


In [8]:
namelist.count()

SearchNames    3575
Author         3575
NoPapers       3575
dtype: int64

3461 out of 3575 have paper less equal to 100. And 2708 people have paper between 1 to 100.
And 2663 unique names.

In [9]:
one_hundred = namelist.loc[namelist.NoPapers>0].copy()

In [10]:
one_hundred.drop_duplicates(subset='Author',inplace=True)

In [11]:
one_hundred = one_hundred.reset_index().drop(columns='index')

In [12]:
one_hundred.count()

SearchNames    94
Author         94
NoPapers       94
dtype: int64

In [13]:
cnt = 0
# author_dict = {}
subfolder = './scrab_data'

In [14]:
with open('author_dict.txt', 'rb') as handle:
    author_dict = pickle.loads(handle.read())

In [15]:
len(author_dict)

2663

In [17]:
for author in one_hundred.Author:
    #author = one_hundred.iloc[j].Author
    print(author)
    results = pub_of_author("AU='%s'"%author)
    # results = soap.search("AU='%s'"%author)
    with open('%s/%s.txt'%(subfolder,author), 'w') as f:
        for item in results:
            item = str(item)
            f.write("%s\n" % item)
    
    idx_max = results.recordsFound
    dict_list = []
    for i in range(idx_max):
        # uid
        uid_tmp = results.records[i].uid

        # title of the paper
        title_tmp = results.records[i].title[0].value[0]

        # publishment type
        type_tmp = results.records[i].doctype[0].value[0]

        # name of journal
        journal_tmp = next((item for item in results.records[i].source if item["label"] == "SourceTitle")).value[0]

        # Publish year
        year_tmp = next((item for item in results.records[i].source if item["label"] == "Published.BiblioYear")).value[0]

        # Publish date
        if 'date' in results.records[i].source:
            date_tmp = next((item for item in results.records[i].source if item["label"] == "Published.BiblioDate")).value[0]
        else:
            date_tmp = []

        # co-authors
        coauthor_tmp = results.records[i].authors[0].value

        # keywords
        if 'keywords' in results.records[i]:
            keyword_tmp = results.records[i].keywords[0].value
        else:
            keyword_tmp = []

        dict_tmp = {'uid':uid_tmp,'title':title_tmp,'type':type_tmp,'journal':journal_tmp,
                    'year':year_tmp,'date':date_tmp,'coauthor':coauthor_tmp,'keyword':keyword_tmp}
        dict_list.append(dict_tmp)
    author_dict[author]=dict_list
    
    cnt = cnt+1
    
    if cnt % 2400==0: # WOS constraints that each ID can only search for 2500 times. Just set smaller counts to renew the ID.
        soap = WosClient(lite=True)
        soap.connect()
    
    if cnt%100 == 0:
        print('=================================')
        print('=================================')
        print(cnt/len(one_hundred.Author)*100,'%')
        print('=================================')
        print('=================================')
        
    time.sleep(0.5)

DOGAN ABDULLAH
SANCAKTUTAR AHMET
BEKIR AHMET
DEMIR AHMET
KILIC AHMET
KAHRAMAN AHMET
KAYA AHMET
ONAT AHMET
CELIK AHMET
UYSAL AHMET
YILDIRIM AHMET
SAHIN AHMET
BAYRAKTAR ALEMDAR
SEVINC ALPER
AYDIN ALI
DOGAN ALI
GURBUZ ALI
YILDIRIM ALI
YILDIZ ALI
CELIK ALI
COSKUN ALI
SAHIN ALI
KISACIK BUNYAMIN
AYDIN HASAN
KAYA HASAN
AKTAS MEHMET
ARSLAN MEHMET
BULUT MEHMET
YUKSEL MEHMET
KILIC MEHMET
UNLU MEHMET
ERDEM MEHMET
ERDOGAN MEHMET
KAYA MEHMET
SONMEZ MEHMET
GUL MEHMET
GUNDUZ MEHMET
KANTER MEHMET
KARACA MEHMET
KAYRAK MEHMET
AYDIN MEHMET
TUZCU MEHMET
SEZER MEHMET
YAMAN MEHMET
YAZICI MEHMET
YILDIRIM MEHMET
YILMAZ MEHMET
YUCEL MEHMET
OZ MEHMET
OZCAN MEHMET
SAHIN MEHMET
KURT MEVLUT
AYDIN MURAT
DOGAN MURAT
KARA MURAT
KAYA MURAT
UYSAL MURAT
YUKSEL MURAT
AYDIN MUSTAFA
BENEKLI MUSTAFA
CENGIZ MUSTAFA
DEMIR MUSTAFA
YILDIRIM MUSTAFA
DOGAN MUSTAFA
GUL MUSTAFA
KURT MUSTAFA
YAVUZ MUSTAFA
TEKIN MUSTAFA
YILDIZ MUSTAFA
YILMAZ MUSTAFA
CELIK MUSTAFA
OZEN MUSTAFA
UNAL MUSTAFA
AKSOY NURTEN
AKKOC NURULLAH
YUKSEL OSMAN
YILD

In [18]:
result = author_dict.copy()

Deleting ',' between family and given name. Convert the co-author name into upper case.

In [19]:
for author in one_hundred.Author:
    for i in range(len(result[author])):
        tmp = result[author][i]['coauthor'].copy()
        for j in range(len(tmp)):
            tmp[j]=tmp[j].replace(",","").upper()
        result[author][i]['coauthor'] = tmp

In [20]:
result['TEKIN MUSTAFA'][0]['coauthor']

['KIM HYUNG-GOO',
 'KURTH INGO',
 'LAN FEI',
 'MELICIANI IRENE',
 'WENZEL WOLFGANG',
 'EOM SOO HYUN',
 'KANG GIL BU',
 'ROSENBERGER GEORG',
 'TEKIN MUSTAFA',
 'OZATA METIN',
 'BICK DAVID P.',
 'SHERINS RICHARD J.',
 'WALKER STEVEN L.',
 'SHI YANG',
 'GUSELLA JAMES F.',
 'LAYMAN LAWRENCE C.']

Save the dictionary and re-load it again.

In [21]:
# the output file is 49.4Mb
with open('author_dict_all.txt', 'wb') as handle:
    pickle.dump(result, handle)

In [119]:
with open('author_dict.txt', 'rb') as handle:
    b = pickle.loads(handle.read())

In [120]:
b['AYDIN AYDIN'][0]['coauthor']

['KUL SIBEL', 'AYDIN AYDIN', 'DINC HASAN', 'ERDURAN EROL']

In [121]:
# showing that it's the same dictionary list we want.
result == b

True