# Scrapping
This notebook is written for scrapping paper data of **not dismissed author** from [Web of Science](http://apps.webofknowledge.com/WOS_GeneralSearch_input.do?product=WOS&search_mode=GeneralSearch&SID=F41mtBBV1mNZKmygFN7&preferencesSaved=). (Paper data of **dismissed author** have been scraped by anthor group who collabrates with us in this project.)

- The search strategy is searching the full name in **not dismissed author list**.
- Since the data scraped from [Web of Science](http://apps.webofknowledge.com/WOS_GeneralSearch_input.do?product=WOS&search_mode=GeneralSearch&SID=F41mtBBV1mNZKmygFN7&preferencesSaved=) is in `.xlsx` format, we use **BeautifulSoup** package to parse it. (The parsing code is stored in file `parsing.py`)
- We store those aimed features in `pandas.DataFrame` format: title, uid, publish_date, vol, pubtype, issue, language, doctype, source, keywords, abstract, headings, subheadings, traditional_subjects, extended_subjects, category_info, addresses_info.

In [1]:
# Scraping and parsing package/module 
from suds.client import Client
from scraper_WoS import *
from parsing import *

# Math packages
import numpy as np
import pandas as pd

# Other packages
import re
import csv
import time
import pickle
import warnings; warnings.simplefilter('ignore')

In [11]:
# For the sake of copyright and privacy, we can not share it. 
USER_NAME = 'SWISS10_reproj'
PASSWORD = 'Welcome#10 '

In [12]:
def pub_of_author(Querystring):
    results = soap.search(Querystring, offset = 1)
    if results.recordsFound > 100:
        new_records = results.records
        for i in range(int(results.recordsFound/100)):
            time.sleep(0.5)
            research = soap.search(Querystring, offset = (i+1)*100+1)
            time.sleep(0.5)
            new_records = new_records + research.records
        #results = new_records
        results.records = new_records
    return results

In [13]:
soap = WosClient(user= USER_NAME, password= PASSWORD, lite=False)
soap.connect()

Authenticated (SID: F27iZVmy7KbYkszKRlh)


F27iZVmy7KbYkszKRlh

In [5]:
# Get the list of not dismissed author (get from another group)
not_dismissed_df = pd.read_excel('./Data/nodismissed_complete.xlsx')
# Drop abbreviation
not_dismissed_df['Author'] = not_dismissed_df['Author'].map(lambda x:x.lstrip('AU='))
not_dismissed_df['Author'] = not_dismissed_df['Author'].map(lambda x: x\
                                                            if (re.sub('\s+', '', x).isalpha())\
                                                            else np.nan)
not_dismissed_df.dropna(inplace=True)
not_dismissed_list = not_dismissed_df['Author'].unique()
# Drop names which less than 2 words
not_dismissed_list = [i for i in not_dismissed_list if len(i.split(' '))>=2]
not_dismissed_list[0:10]

['Eroglu Ilhan',
 'Koksal Deniz',
 'Yesil Nesibe Karahan',
 'Sarpel Tunay',
 'k Emel',
 'Bariskin Elif',
 'Yusufoglu Edagani',
 'slani Mahmoud Ali Asghar',
 'Veenhof Rob',
 'Okutucu Sercan']

In [6]:
# According to the full names in not dismissed author list, scrape their paper data
searchnames_dismiss_auths = []
for line in not_dismissed_list: 
    QueryString = line.strip()
    QueryString_author = 'AU=' + QueryString
    searchnames_dismiss_auths.append(QueryString_author)
searchnames_dismiss_auths[0:10]  

['AU=Eroglu Ilhan',
 'AU=Koksal Deniz',
 'AU=Yesil Nesibe Karahan',
 'AU=Sarpel Tunay',
 'AU=k Emel',
 'AU=Bariskin Elif',
 'AU=Yusufoglu Edagani',
 'AU=slani Mahmoud Ali Asghar',
 'AU=Veenhof Rob',
 'AU=Okutucu Sercan']

In [None]:
dismissed_dataset = pd.DataFrame()
for i in range(len(searchnames_dismiss_auths)):
        print(i,searchnames_dismiss_auths[i])
        if i == 1000: # WOS constraints that each ID can only search for 2500 times. Just set smaller counts to renew the ID.
            soap = WosClient(user= 'SWISS10_reproj', password= 'Welcome#10 ', lite=False)
            soap.connect()
        QueryString = searchnames_dismiss_auths[i]
        try:
            results = soap.search(QueryString)
        except Exception as e:
            print('Error on loop',QueryString)
            print(e)
        if results.recordsFound > 1000:
            time.sleep(0.5)
            continue
        try:
            results = pub_of_author(QueryString)
            #time.sleep(0.5)
            Soup = BeautifulSoup(results.records,'lxml')
            dataset = construct_dataset(Soup)
            dataset['FullName'] = QueryString.lstrip('AU=')
            dismissed_dataset=dismissed_dataset.append(dataset,ignore_index=True)
        except Exception as e:
            print('Error on loop',QueryString)
            print(e)

In [None]:
# with open('./Data/not_dismissed_dataset.pickle', 'wb') as handle:
#     pickle.dump(dismissed_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./Data/not_dismissed_dataset.pickle', 'rb') as handle:
    dismissed_dataset = pickle.load(handle)