# Convert relevant data from JSON to CSV file

The following code is for the Author-Keywords and Subject-Areas of BioInformatics Papers.

NOTE: I used the following code for all 6 JSON files, changing only the input file name in the JSON-loading cell and the output file name in the final CSV-export cell.

In [2]:
# Load json file

import json
import pandas

# Read the raw Scopus export and parse it into Python objects.
# The file is opened in a `with` block so the handle is closed as soon as the
# text has been read, instead of being left open for the life of the session.
with open(r'/Users/loganhornbuckle/Documents/Python/DataScience/raw_json_full/good/scopus2016-2017.json') as json_file:
    json_data = json_file.read()

# `data` is whatever top-level JSON value the file holds; the cells below
# iterate over it and index each element by string key.
data = json.loads(json_data)

In [3]:
# Sanity check: show how many top-level entries were parsed from the JSON file

record_count = len(data)
print(record_count)

7342


In [4]:
# Collect the author keywords of every record into the list `keyword`.
#
# Records without an 'abstracts-retrieval-response' entry, or whose response
# has no 'authkeywords' key, contribute nothing, so `keyword` can be shorter
# than `data`.

keyword = []

for record in data:
    if 'abstracts-retrieval-response' not in record:
        continue

    response = record['abstracts-retrieval-response']
    if 'authkeywords' not in response:
        continue

    auth_keywords = response['authkeywords']
    if auth_keywords is None:
        # The key is present but empty: keep a textual placeholder
        keyword.append("NaN")
    else:
        keyword.append(auth_keywords['author-keyword'])

In [5]:
# Collect the subject-area fields of every record into three parallel lists

subAbbrev = []
subCode = []
subText = []

# Each subject area appends one entry to each list: its @abbrev, @code and
# #text values.  A record with several subject areas therefore adds several
# entries, so these lists can be longer than `data`.
for record in data:
    if 'abstracts-retrieval-response' not in record:
        continue

    subject_areas = record['abstracts-retrieval-response']['subject-areas']
    if subject_areas is None:
        # No subject areas for this record: keep one placeholder entry
        subAbbrev.append("NaN")
        subCode.append("NaN")
        subText.append("NaN")
        continue

    # 'subject-area' is either a list of entries or one bare entry;
    # wrap the bare form so both cases share the same loop.
    areas = subject_areas['subject-area']
    if not isinstance(areas, list):
        areas = [areas]

    for area in areas:
        subAbbrev.append(area['@abbrev'])
        subCode.append(area['@code'])
        subText.append(area['#text'])

In [6]:
# Report how many entries each list holds, as a quick consistency check

for label, values in (
    ("keyword length: ", keyword),
    ("text length: ", subText),
    ("abbrev length: ", subAbbrev),
    ("code length: ", subCode),
):
    print(label, len(values))

keyword length:  7339
text length:  20085
abbrev length:  20085
code length:  20085


In [7]:
# Start from an empty data frame
df = pandas.DataFrame()

# First column: one cell per collected author-keyword entry
df["Author Keyword"] = keyword

# Show the first five rows
df.iloc[:5]

Unnamed: 0,Author Keyword
0,
1,"[Bioinformatics tool, Gene variant, Single Nuc..."
2,"[Bioinformatics, CRYAB, HSPB5, Molecular dynam..."
3,
4,"[Benders decomposition, Bioinformatics, Combin..."


In [8]:
# Create column for Subject Area: Abbreviation
#
# The whole list is assigned in one aligned operation.  Writing cell by cell
# through df["Abbreviation"][n] is chained indexing, which pandas does not
# guarantee will modify df (and which never does under Copy-on-Write).
#
# NOTE(review): rows are matched by position only, so the keyword and the
# subject area in row n need not come from the same paper -- confirm that
# this is what downstream consumers of the CSV expect.

abbrev_column = pandas.Series(subAbbrev, dtype=object)

# subAbbrev may hold more entries than df has rows.  Extend the index first
# so the assignment below keeps every entry instead of aligning the extras
# away; the newly added rows are NaN in the existing columns.
# Assumes df still has its default 0..n-1 index.
if len(abbrev_column) > len(df):
    df = df.reindex(range(len(abbrev_column)))

df["Abbreviation"] = abbrev_column

df[:5]

Unnamed: 0,Author Keyword,Abbreviation
0,,COMP
1,"[Bioinformatics tool, Gene variant, Single Nuc...",BIOC
2,"[Bioinformatics, CRYAB, HSPB5, Molecular dynam...",MEDI
3,,MEDI
4,"[Benders decomposition, Bioinformatics, Combin...",BIOC


In [9]:
# Create column for Subject Area: Code
#
# The whole list is assigned in one aligned operation.  Writing cell by cell
# through df["Code"][n] is chained indexing, which pandas does not guarantee
# will modify df (and which never does under Copy-on-Write).

code_column = pandas.Series(subCode, dtype=object)

# subCode may hold more entries than df has rows.  Extend the index first so
# the assignment below keeps every entry instead of aligning the extras away;
# the newly added rows are NaN in the existing columns.
# Assumes df still has its default 0..n-1 index.
if len(code_column) > len(df):
    df = df.reindex(range(len(code_column)))

df["Code"] = code_column

df[:5]

Unnamed: 0,Author Keyword,Abbreviation,Code
0,,COMP,1702
1,"[Bioinformatics tool, Gene variant, Single Nuc...",BIOC,1311
2,"[Bioinformatics, CRYAB, HSPB5, Molecular dynam...",MEDI,2716
3,,MEDI,2734
4,"[Benders decomposition, Bioinformatics, Combin...",BIOC,1313


In [10]:
# Create column for Subject Area: Text
#
# The whole list is assigned in one aligned operation.  Writing cell by cell
# through df["Text"][n] is chained indexing, which pandas does not guarantee
# will modify df (and which never does under Copy-on-Write).

text_column = pandas.Series(subText, dtype=object)

# subText may hold more entries than df has rows.  Extend the index first so
# the assignment below keeps every entry instead of aligning the extras away;
# the newly added rows are NaN in the existing columns.
# Assumes df still has its default 0..n-1 index.
if len(text_column) > len(df):
    df = df.reindex(range(len(text_column)))

df["Text"] = text_column

df[:5]

Unnamed: 0,Author Keyword,Abbreviation,Code,Text
0,,COMP,1702,Artificial Intelligence
1,"[Bioinformatics tool, Gene variant, Single Nuc...",BIOC,1311,Genetics
2,"[Bioinformatics, CRYAB, HSPB5, Molecular dynam...",MEDI,2716,Genetics (clinical)
3,,MEDI,2734,Pathology and Forensic Medicine
4,"[Benders decomposition, Bioinformatics, Combin...",BIOC,1313,Molecular Medicine


In [11]:
# Look over the first hundred rows of the assembled data frame

df.iloc[:100]

Unnamed: 0,Author Keyword,Abbreviation,Code,Text
0,,COMP,1702,Artificial Intelligence
1,"[Bioinformatics tool, Gene variant, Single Nuc...",BIOC,1311,Genetics
2,"[Bioinformatics, CRYAB, HSPB5, Molecular dynam...",MEDI,2716,Genetics (clinical)
3,,MEDI,2734,Pathology and Forensic Medicine
4,"[Benders decomposition, Bioinformatics, Combin...",BIOC,1313,Molecular Medicine
5,"[Diptericin, Drosophila melanogaster, Imd path...",MEDI,2737,Physiology (medical)
6,"[Bacillus, Cocoa fermentation, DegU regulator,...",ENGI,2207,Control and Systems Engineering
7,"[Bioinformatics, Computational proteomics, Mas...",ENGI,2210,Mechanical Engineering
8,"[Collusion-resistant fingerprinting, P2P conte...",MATH,2604,Applied Mathematics
9,"[DNA-binding protein prediction, Feature repre...",MATH,2611,Modeling and Simulation


In [12]:
# Write the assembled data frame out as a CSV file (row index included)

csv_path = 'gr3_16_17.csv'
df.to_csv(csv_path)