# JIF Score Extractor

This notebook helps to retrieve JIF scores by ISSN codes.

It is a semi-automatic process with a few manual steps.

Preparations:
- Create a Python virtual environment as described in the Setup instructions of the repository README.
- Select the Python kernel from the virtual environment (venv/bin/python)
- If needed, create sub-folder "data/data2022/pub_extracts"
- If needed, create sub-folder "data/data2022/temp"

To use:
- Copy excel files containing ISSN codes into sub-folder "data/data2022/pub_extracts"
- Edit the settings as needed
- Run each cell
- When you reach the cell with instructions about fetching JCR codes, perform the manual steps.
- Finish by running the remaining cells

Generated artifacts during the process include:
- a set of CSV files containing ISSN codes in sub-folder "JIF_ISSN_lists"
- a final Excel file containing JIF scores in sub-folder "JIF_scores", named "JCR_JournalResults_[year]_byISSN_run[date].xlsx"


In [None]:
# Before executing, select the Python kernel from the local virtual environment.
# Requires the ipykernel package to be installed in that environment.

# Smoke test: if this message is printed, the selected kernel can execute cells.
print("Verify can run jupyter cell")

### Edit settings

In [None]:
# Settings
# These constants are read by the cells below; edit them before running the notebook.

MAX_ISSN_CODES_PER_CSV_FILE = 600     # The maximum number of ISSN codes to write to each CSV file.
# DATA_PATH is the base data folder (relative path). The cells below resolve the
# "pub_extracts" (input Excel files), "JIF_ISSN_lists", "temp" and "JIF_scores"
# sub-folders against it.
DATA_PATH = "../data/data2022"        # Base data folder, relative path.
INPUT_COL_NAMES = ["ISSN", "ISSN-L"]  # Column names to read from each input Excel file.


In [None]:
# Imports
import os
import math
import pandas as pd
import numpy as np
import openpyxl
from datetime import datetime
print("Finished imports")

### Compile list of ISSN codes

In [None]:
# Collect the raw code values from every Excel workbook found in the
# "pub_extracts" sub-folder of DATA_PATH. The result, vals_list, is a flat list
# of strings (one entry per cell of the INPUT_COL_NAMES columns, blanks included).
input_path = os.path.join(DATA_PATH, "pub_extracts")

print(f"Searching for input files in path: {input_path}")

if not os.path.isdir(input_path):
    raise Exception(f"Path does not exists: {input_path}")


vals_list = []

# os.listdir() returns entries in arbitrary order; sort so that the processing
# order and the printed log are the same on every run.
for filename in sorted(os.listdir(input_path)):
    f = os.path.join(input_path, filename)
    if not os.path.isfile(f):
        continue

    # Excel lock files ("~$name.xlsx", present while a workbook is open) and
    # hidden files (".DS_Store", ...) are not readable workbooks and would make
    # read_excel raise, so they are skipped.
    if filename.startswith(("~$", ".")):
        print(f"Skipping non-workbook file: {f}")
        continue

    print(f)

    df = pd.read_excel(
        f,
        sheet_name=0, # first sheet
        header=0,
        engine="openpyxl",
        keep_default_na=False,  # keep blank cells as "" instead of NaN
        usecols=INPUT_COL_NAMES,
        dtype=str
    )

    # extend() appends in place; rebuilding the list with "+" for every column
    # copies all previously collected values each time.
    for col in df:
        vals_list.extend(df[col].tolist())


print(f"Total nr of ISSN codes before cleaning: {len(vals_list)}")


In [None]:
# Clean the list of ISSN codes
#
# Steps: upper-case every value, trim whitespace, drop blanks, sort,
# de-duplicate, then separate the entries that start with "ISBN" from the rest.
# The shape is printed after each reduction step so the effect is visible.

issn_x_df = pd.DataFrame(vals_list, columns=['ISSN'])
issn_x_df["ISSN"] = issn_x_df["ISSN"].astype(str).str.upper()

print(issn_x_df.shape)


# Remove blank rows: trim first so whitespace-only cells count as blank

issn_x_df["ISSN"] = issn_x_df["ISSN"].str.strip()
issn_x_df = issn_x_df.replace('', np.nan).dropna()

print(issn_x_df.shape)


# Sort by ISSN, then remove duplicate values

issn_x_df = issn_x_df.sort_values(by=["ISSN"]).drop_duplicates()

print(issn_x_df.shape)


# Split into ISSN and ISBN codes using a boolean mask on the "ISBN" prefix

isbn_idx = issn_x_df['ISSN'].str.startswith('ISBN')
isbn_df = issn_x_df.loc[isbn_idx]
issn_df = issn_x_df.loc[~isbn_idx]

print(f"Nr of ISSN's: {len(issn_df)}")
print(f"Nr of ISBN's: {len(isbn_df)}")


### Save to csv files

These files are used as filter input when extracting journal data from Clarivate.

The ISSN files will have names in the format "issn_list_[year]_file[file number].csv"

In [None]:
# Save to csv files
#
# Writes one ISBN file plus the ISSN codes split into consecutive files of at
# most MAX_ISSN_CODES_PER_CSV_FILE rows each. Files have no header and no index.

year = datetime.today().year


# Create output folder if not exists

output_path = os.path.join(DATA_PATH, "JIF_ISSN_lists")

if not os.path.exists(output_path):
    os.makedirs(output_path)


# Save the ISBN file
isbn_file_path = os.path.join(output_path, f"isbn_list_{year}.csv")
isbn_df.to_csv(isbn_file_path, index=False, header=False)

# Save the ISSN files
n_files = math.ceil(len(issn_df) / MAX_ISSN_CODES_PER_CSV_FILE)

for file_nr in range(1, n_files+1):
    # Row window for this file; iloc clips endidx to the frame length, so the
    # last file simply holds the remaining rows.
    endidx = file_nr * MAX_ISSN_CODES_PER_CSV_FILE
    startidx = endidx - MAX_ISSN_CODES_PER_CSV_FILE
    filename = f"issn_list_{year}_file{file_nr}.csv"
    print(f"Now saving file {filename} with rows for index between {startidx} and {endidx}")
    issn_chunk = issn_df.iloc[startidx:endidx]
    issn_chunk.to_csv(os.path.join(output_path, filename), index=False, header=False)

print("Done")

### MANUAL STEPS: Retrieve JCR codes

- In Clarivate, select the desired columns.
- For each CSV file in the "JIF_ISSN_lists" sub-folder:
    - Use the list of ISSN codes to filter journals by ISSN / eISSN code in Clarivate.
    - Click Apply to activate the filter selection.
    - Click export. Choose XLS (excel)
    - Wait for the download.
- Move the downloaded files to the /temp folder in this project.

When all files are extracted and moved to /temp, continue with the below steps.

### Clean the downloaded Excel files

In [None]:
# Clean the partial extracts
#
# For every downloaded workbook in the "temp" sub-folder of DATA_PATH: delete
# the rows whose first cell is blank or starts with one of the export notice
# texts, rename the active sheet to "JCR" and save the result next to the
# original as "cleaned_<original name>". The downloaded file is left untouched.

temp_path = os.path.join(DATA_PATH, "temp")
print(f"Searching for partial, temporary files in path: {temp_path}")

if not os.path.isdir(temp_path):
    raise Exception(f"Path does not exists: {temp_path}")


cleaned_prefix = "cleaned_"

# First-cell texts that mark the non-data rows of an export.
notice_prefixes = ("Journal Data Filtered", "Copyright (c)", "By exporting the selected data")

# Sorted so the processing order and log are stable between runs.
for filename in sorted(os.listdir(temp_path)):
    f = os.path.join(temp_path, filename)
    if not os.path.isfile(f):
        continue

    # Output of an earlier run of this cell: cleaning it again would create
    # "cleaned_cleaned_..." files whose rows duplicate the real extracts.
    if filename.startswith(cleaned_prefix):
        print(f"Skipping already cleaned file {f}")
        continue

    # Excel lock files ("~$..."), hidden files and anything that is not an
    # .xlsx workbook cannot be opened by openpyxl.
    if filename.startswith(("~$", ".")) or not filename.lower().endswith(".xlsx"):
        print(f"Skipping non-workbook file {f}")
        continue

    print(f"Now working on file {f}")
    rows_to_delete = []
    wb = openpyxl.load_workbook(f)
    sheet = wb.active

    for row in sheet.iter_rows():
        first_cell = row[0]
        value = first_cell.value

        if value is None or value == "":
            # Empty row
            rows_to_delete.append(first_cell.row)

        # Only strings can match a notice text; the isinstance guard keeps
        # numeric or date values in the first column from raising on .startswith.
        elif isinstance(value, str) and value.startswith(notice_prefixes):
            rows_to_delete.append(first_cell.row)

    # Delete bottom-up so the remaining row numbers stay valid while deleting.
    for i in reversed(rows_to_delete):
        sheet.delete_rows(i, 1)

    # Saving to a new file with prefix "cleaned_"

    new_file_path = os.path.join(temp_path, cleaned_prefix + filename)
    sheet.title = "JCR"
    wb.save(new_file_path)
    print(f"Done editing excel file {f}. Saved to {new_file_path}")



### Combine into one JCR scores output file

In [None]:
# Combine the partial extracts into one complete JIF file
# Save the resulting cleaned output file to /JIF_scores/JCR_JournalResults_[year]_byISSN_run[date].xlsx

import glob


# Create output folder if not exists

output_dir = os.path.join(DATA_PATH, "JIF_scores")
os.makedirs(output_dir, exist_ok=True)


# Output filename

today = datetime.today().date()
year = today.year
output_filename = f"JCR_JournalResults_{year}_byISSN_run{today.strftime('%Y%m%d')}.xlsx"
output_path = os.path.join(output_dir, output_filename)


# Safest to delete existing file if it exists

if os.path.isfile(output_path):
    os.remove(output_path)


temp_path = os.path.join(DATA_PATH, "temp")
print(f"Searching for cleaned, temporary files in path: {temp_path}")

# glob returns matches in arbitrary order; sort so the row order of the
# combined file is the same on every run.
filenames = sorted(glob.glob(os.path.join(temp_path, "cleaned_*.xlsx")))

# Read all partial extracts first and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all previously collected rows on every iteration.
frames = []

for file in filenames:
    # A double prefix means the cleaning step was run on its own output; those
    # files repeat the rows of the matching "cleaned_" file.
    if os.path.basename(file).startswith("cleaned_cleaned_"):
        print(f"Skipping re-cleaned duplicate {file}")
        continue

    # sheet_name=None returns a dict with one DataFrame per sheet; stack them.
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True, sort=False)
    frames.append(df)

if frames:
    output_df = pd.concat(frames, ignore_index=True, sort=False)
else:
    # pd.concat raises on an empty list; keep writing an empty workbook as
    # before, but say so explicitly.
    print(f"Warning: no cleaned files found in {temp_path}, the output file will be empty")
    output_df = pd.DataFrame()

output_df.to_excel(output_path, index=False, sheet_name="JCR")
print(f"Saved final excel file to {output_path}")
