In [230]:
# Getting Journal Name and Authors From DOI number

import requests
def get_journal_and_authors_from_doi(doi):
    """Look up a DOI on the Crossref REST API and extract journal and author names.

    Args:
        doi: DOI string, interpolated as-is into ``https://api.crossref.org/works/{doi}``.

    Returns:
        ``(journal_name, author_names)`` on success, where ``journal_name`` is the
        first ``container-title`` entry and ``author_names`` is a list of
        "given family" strings (empty when the record lists no authors).
        ``None`` when the request fails, the status code is not 200, or the
        response carries no container title. Callers must check for ``None``
        before unpacking.
    """
    url = f"https://api.crossref.org/works/{doi}"

    try:
        # A timeout keeps one unresponsive lookup from hanging the whole batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print("Error 1.")
        return None

    if response.status_code != 200:
        print(f"Error 2. Status code: {response.status_code}")
        return None

    try:
        message = response.json()['message']
        journal_name = message['container-title'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, or the record has no usable container title.
        print("Error 1.")
        return None

    author_names = []
    for author in message.get('author', []):
        # Not every contributor has both name parts; some only carry a single
        # 'name' field, so build the display name from whatever is present.
        name_parts = [author.get('given'), author.get('family')]
        full_name = " ".join(part for part in name_parts if part)
        author_names.append(full_name or author.get('name', ''))
    return journal_name, author_names

In [231]:
# Getting the Link of Journal Data page from main ScimagoJR page

from bs4 import BeautifulSoup

def search_page(journal_name):
    """Search ScimagoJR for a journal title and return the first result's link.

    Args:
        journal_name: Journal title to search for. Surrounding whitespace is
            stripped before the query is sent.

    Returns:
        The ``href`` of the first ``<a>`` inside the ``search_results`` div, or
        ``None`` when the title is empty, the request fails, the status code is
        not 200, or the page has no result link. A short message is printed in
        each ``None`` case.
    """
    query = journal_name.strip() if journal_name else ""
    if not query:
        print(journal_name, "No results were found")
        return None

    try:
        # Passing the query via `params` lets requests URL-encode every reserved
        # character (e.g. '&'), not only spaces and colons.
        r = requests.get(
            "https://www.scimagojr.com/journalsearch.php",
            params={"q": query},
            timeout=30,
        )
    except requests.RequestException:
        print(journal_name, "Request failed")
        return None

    if r.status_code != 200:
        print(journal_name, 'Page not found 0')
        return None

    soup = BeautifulSoup(r.content, 'html5lib')
    # Either lookup may come back as None; check each step explicitly instead of
    # relying on an exception from chained calls.
    results_div = soup.find('div', class_="search_results")
    first_link = results_div.find("a") if results_div is not None else None
    if first_link is None or not first_link.get('href'):
        print(journal_name, "No results were found")
        return None
    return first_link['href']

In [232]:
#Getting Quartile from Journal Data

import requests
from bs4 import BeautifulSoup
import pandas as pd

def quartile_list(search_result):
    """Fetch a ScimagoJR journal page and return a quartile from its table.

    The table containing a ``<th>Quartile</th>`` header is parsed with pandas;
    for every ``Category`` the row with the highest ``Year`` is selected, and
    the first distinct ``Quartile`` value among those rows is returned.

    Args:
        search_result: Relative link as returned by ``search_page`` (appended to
            ``https://www.scimagojr.com/``), or ``None``.

    Returns:
        The quartile value (e.g. ``"Q1"``), or ``None`` when ``search_result``
        is ``None``, the page cannot be fetched, or no quartile table can be
        found or parsed.
    """
    # Local import: pandas expects literal HTML wrapped in a file-like object.
    from io import StringIO

    if search_result is None:
        return None

    result_url = f'https://www.scimagojr.com/{search_result}'
    try:
        response = requests.get(result_url, timeout=30)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTPError handler below can never run.
        response.raise_for_status()
    except requests.HTTPError:
        print("Page Not Found")
        return None
    except requests.RequestException:
        print(result_url, "Request failed")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    quartile_th = soup.find('th', string='Quartile')
    if not quartile_th:
        print(result_url, "No quartile found on page")
        return None

    quartile_table = quartile_th.find_parent('table')
    if not quartile_table:
        print(result_url, "No quartile category found")
        return None

    try:
        df = pd.read_html(StringIO(str(quartile_table)), match='Quartile')[0]
        latest_rating_indices = df.groupby('Category').Year.agg('idxmax')
        latest_ratings = df.loc[latest_rating_indices]
        return latest_ratings['Quartile'].unique()[0]
    except (ValueError, KeyError, IndexError, AttributeError):
        # Table could not be parsed, lacks the expected columns, or is empty.
        return None


In [233]:
# Getting DOI data from Google Spreadsheet

import gspread
from oauth2client.service_account import ServiceAccountCredentials
import numpy as np
def get_doi_list_from_spreadsheet(
    sheetName,
    credentials_file="facultyrecruitmentupgradation-5e06eb0edef3.json",
):
    """Read the DOIs that still need processing from the first worksheet.

    Rows whose ``CATEGORY`` cell is non-empty are counted as already processed,
    and the first ``doi_done`` rows are skipped. This is only correct when the
    filled-in rows are contiguous at the top of the sheet.

    Args:
        sheetName: Title of the Google spreadsheet to open.
        credentials_file: Path to the service-account JSON key file.

    Returns:
        ``(doi_list, doi_done)``: the remaining ``DOI NUMBER`` values with any
        ``http(s)://doi.org/``, ``http(s)://dx.doi.org/`` or ``doi:`` prefix and
        surrounding whitespace removed, and the number of rows already
        processed. Blank DOI cells are kept (as ``''``) so list positions stay
        aligned with sheet rows. Returns ``([], 0)`` for a sheet with no data rows.
    """
    import re  # local import keeps this cell runnable on its own

    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(credentials_file, scope)

    gc = gspread.authorize(credentials)
    worksheet = gc.open(sheetName).sheet1

    records = worksheet.get_all_records()
    if not records:
        return [], 0

    doi_done = sum(1 for row in records if row['CATEGORY'] != '')
    pending = [row['DOI NUMBER'] for row in records[doi_done:]]

    # Accept the common ways a DOI is pasted, not just the exact
    # "https://doi.org/" form.
    resolver_prefix = re.compile(r'^\s*(?:https?://(?:dx\.)?doi\.org/|doi:\s*)', re.IGNORECASE)
    doi_list = [resolver_prefix.sub('', str(value)).strip() for value in pending]
    return doi_list, doi_done
sheetName = "ApplicationsData"

In [234]:
# Post Processing Incorrect Entries

def post_process_invalid_inputs(Journal_name,doi):
    """Trim a journal title at its first colon; the DOI is passed through unchanged.

    Returns a ``(title, doi)`` pair where ``title`` is everything in
    ``Journal_name`` before the first ':' (the whole string when it has none).
    """
    title_head, _, _ = Journal_name.partition(':')
    return title_head, doi

In [235]:
# Writing into Google Spreadsheet 
  
def enter_into_sheet(
    df,
    sheetName,
    start,
    credentials_file='facultyrecruitmentupgradation-5e06eb0edef3.json',
):
    """Replace ``len(df)`` rows of worksheet 'Sheet1' with the contents of ``df``.

    Rows ``start`` through ``start + len(df) - 1`` are deleted and the frame's
    rows, with every cell converted via ``str``, are inserted at ``start``.

    Args:
        df: DataFrame whose rows are written, in positional order.
        sheetName: Title of the Google spreadsheet to open.
        start: 1-based sheet row at which the replacement begins.
        credentials_file: Path to the service-account JSON key file.

    Returns:
        None. An empty ``df`` is a no-op and the sheet is not contacted.
    """
    if len(df) == 0:
        # Nothing to write; skipping avoids delete_rows(start, start - 1).
        return

    # Build the payload before touching the sheet so a conversion error cannot
    # leave rows deleted without replacements. Iterating positionally also
    # works when the frame does not carry a default 0..n-1 index.
    data_sheet = [
        [str(value) for value in row]
        for row in df.itertuples(index=False, name=None)
    ]

    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(credentials_file, scope)
    gc = gspread.authorize(credentials)
    worksheet = gc.open(sheetName).worksheet('Sheet1')

    worksheet.delete_rows(start, start + len(df) - 1)
    worksheet.insert_rows(data_sheet, start)


In [236]:
# Main

sheetName = "ApplicationsData"
doi_list,end = get_doi_list_from_spreadsheet(sheetName)

data=[]
for doi in doi_list:
    lookup = get_journal_and_authors_from_doi(doi)
    if lookup is None:
        # The lookup failed. Still emit a row so that the rows written back stay
        # aligned one-to-one with doi_list; 0 keeps the author column integral.
        data.append([doi, None, None, 0])
        continue
    journal_name, author_names = lookup

    quartile = quartile_list(search_page(journal_name))
    if quartile is None:
        # Retry once with the part of the title before the first ':' -- and only
        # when that actually changes the query, so each DOI is not scraped twice.
        shortened_name = post_process_invalid_inputs(journal_name, doi)[0]
        if shortened_name != journal_name:
            quartile = quartile_list(search_page(shortened_name))
        journal_name = shortened_name
    data.append([doi, journal_name, quartile, len(author_names)])

df=pd.DataFrame(data,columns=['DOI','Journal name','Category','No of authors'])

# NOTE(review): `end + 2` presumably skips a header row plus the `end` rows that
# are already processed -- confirm against the sheet layout.
enter_into_sheet(df,sheetName,end+2)
df

Unnamed: 0,DOI,Journal name,Category,No of authors
0,10.1109/ACCESS.2021.3057500,IEEE Access,Q1,3
