In [218]:
import pandas as pd
import numpy as np
import re
import statistics as stats
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 1000)

In [219]:
# One row per journal: title, article count, then mean / median / standard
# deviation of the open-access cost, already formatted as £ strings.
journal_data = [
    ['PLoS One', 207, '£936.41', '£897.19', '£194.64'],
    ['Journal of Biological Chemistry', 71, '£1384.22', '£1314.53', '£392.05'],
    ['NeuroImage', 36, '£2057.32', '£2289.25', '£466.87'],
    ['National Academy of Sciences', 32, '£860.94', '£762.33', '£497.21'],
    ['Nucleic Acids Research', 29, '£1162.34', '£852.00', '£442.15'],
]

In [220]:
# Column headers for the summary table, in the same order as each journal_data row.
summary_columns = [
    'Journal',
    '# of Articles',
    'Mean open-access cost per article',
    'Median open-access cost per article',
    'Standard Deviation of open-access cost per article',
]
journal_analysis = pd.DataFrame(journal_data, columns=summary_columns)

In [221]:
# This dataframe holds the solution for "Challenge: Data cleaning and validation": #
# the article count and the mean, median and standard deviation of the open-access #
# cost for each of the five journals listed in journal_data #

journal_analysis

Unnamed: 0,Journal,# of Articles,Mean open-access cost per article,Median open-access cost per article,Standard Deviation of open-access cost per article
0,PLoS One,207,£936.41,£897.19,£194.64
1,Journal of Biological Chemistry,71,£1384.22,£1314.53,£392.05
2,NeuroImage,36,£2057.32,£2289.25,£466.87
3,National Academy of Sciences,32,£860.94,£762.33,£497.21
4,Nucleic Acids Research,29,£1162.34,£852.00,£442.15


In [222]:
# Importing the Wellcome data #
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the CSV exists at exactly this location.

wellcome_csv_path = "C:/Users/Mouse/Desktop/Thinkful/WELLCOME/Wellcome.csv"
wellcome = pd.read_csv(wellcome_csv_path, encoding='latin_1')

In [223]:
# Lower-case every journal title so that titles differing only in letter case
# end up identical #

def _lowercase_title(title):
    """Return ``title`` converted to text with ``str()`` and lower-cased."""
    return str(title).lower()

wellcome['Journal title'] = wellcome['Journal title'].apply(_lowercase_title)

In [225]:
# Confirming the titles are all lowercase by displaying the first 10 rows #

wellcome.head(10)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,journal of medicinal chemistry,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,journal of proteome research,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,mol pharm,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,acs chemical biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,acs chemical biology,Chemical proteomic analysis reveals the drugab...,£1294.78


In [226]:
# Creating a mapper to combine journal titles that are spelled differently or written in different formats #

# Ordered (substring, canonical title) pairs. Order matters: the first
# substring found in a title decides the result.
_JOURNAL_ALIASES = (
    ('plos one', 'plos one'),
    ('plos 1', 'plos one'),
    ('plosone', 'plos one'),
    ('public library of science', 'plos one'),
    ('acta', 'acta'),
    ('biological chemistry', 'journal of biological chemistry'),
    ('j biol chem', 'journal of biological chemistry'),
    ('neuroimage', 'neuroimage'),
    ('nucleic acid', 'nucleic acids research'),
    ('neurolmage', 'neuroimage'),
    ('national academy of sciences', 'national academy of sciences'),
    # Never decides a result: any title containing this phrase already
    # matched the entry above.
    ('proceedings of national academy of sciences', 'national academy of sciences'),
    ('molecular genetics', 'molecular genetics'),
)


def journal_mapper(j):
    """Map a journal title onto its canonical spelling.

    The title is checked against each known substring in turn; the canonical
    title paired with the first substring that occurs in ``j`` is returned.
    A title that contains none of the substrings is returned unchanged.

    Parameters
    ----------
    j : str
        Journal title to normalise. The substrings are all lower-case, so the
        title is expected to be lower-cased already.

    Returns
    -------
    str
        The canonical title, or ``j`` itself when nothing matches.
    """
    for needle, canonical in _JOURNAL_ALIASES:
        if needle in j:
            return canonical
    return j

In [227]:
# Add 'Journal title clean': each journal title passed through journal_mapper #

wellcome['Journal title clean'] = wellcome['Journal title'].apply(journal_mapper)

In [228]:
# Count the distinct values in every column for each cleaned journal title; #
# the 'Article title' column then holds the number of unique articles per journal #

unique_list = wellcome.groupby(['Journal title clean']).nunique()

# Rank the journals from most to fewest unique articles and show the top ten #

sorted_unique = unique_list.sort_values('Article title', ascending=False)
sorted_unique.head(10)

Unnamed: 0_level_0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean
Journal title clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
plos one,199,5,5,207,157,1
journal of biological chemistry,69,12,7,71,68,1
neuroimage,30,2,4,36,34,1
national academy of sciences,32,10,9,32,32,1
nucleic acids research,28,4,4,29,5,1
plos genetics,23,5,1,24,21,1
plos pathogens,24,2,1,24,22,1
molecular genetics,19,7,4,22,7,1
plos neglected tropical diseases,20,1,1,20,20,1
acta,18,8,17,20,20,1


In [229]:
# Add a 'Cost' column: the charged amount with the £ sign stripped, so the
# values are easier to turn into numbers later #

raw_cost_column = 'COST (£) charged to Wellcome (inc VAT when charged)'
wellcome['Cost'] = wellcome[raw_cost_column].str.replace('£', '')

In [230]:
# Each top journal gets its own cost analysis; PLoS One comes first #

plosone = wellcome.loc[wellcome['Journal title clean'].eq('plos one')]
plosone

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean,Cost
1282,3517619,PLoS,plos one,HCN1 and HCN2 in Rat DRG Neurons: Levels in No...,£1001.03,plos one,1001.03
1283,3498109,PLoS,plos one,Fetal alcohol exposure and IQ at age 8: Eviden...,£1004.15,plos one,1004.15
1284,3515553,PLoS,plos one,Vitamin B-12 status during pregnancy and child...,£1011.45,plos one,1011.45
1285,3522679,PLoS,plos one,Validation of Dual Energy X-ray Absorptiometry...,£1011.45,plos one,1011.45
1286,3485223,PLoS,plos one,Associations of different phenotypes of wheezi...,£1015.73,plos one,1015.73
1287,PMC3547059,PLoS,plos one,"""Involvement of EphB1 receptors signalling in ...",£1023.41,plos one,1023.41
1288,3573029,PLoS,plos one,Reactive oxygen species modulate the barrier f...,£1039.87,plos one,1039.87
1289,3769269,PLoS,plos one,Chronic pravastatin but not atorvastatin treat...,£1061.24,plos one,1061.24
1290,3782430,PLoS,plos one,Expression of HIV-1 Vpu Leads to Loss of the V...,£1061.24,plos one,1061.24
1291,3797097,PLoS,plos one,Molecular phylogeny of a RING E3 ubiquitin lig...,£1061.24,plos one,1061.24


In [231]:
# Empty list that will collect the PLoS One costs used for the statistics #

plos_cost = list()

In [232]:
# The data in the plosone['Cost'] column are strings, but we need numbers to do our cost analysis #
# Also, some of the costs are so high, we should remove them to eliminate the possibility of skewing because of inaccurate data #

# Start from an empty list in the same cell that fills it, so re-running this
# cell cannot append the same costs a second time and distort the statistics.
plos_cost = []


def plos_cost_mapper(x, max_cost=10000):
    """Append the usable costs from ``x`` to the module-level ``plos_cost`` list.

    Each value is converted with ``float()``; values greater than ``max_cost``
    are skipped.

    Parameters
    ----------
    x : iterable
        Cost values that ``float()`` can convert (for example numeric strings).
    max_cost : float, optional
        Largest cost that is kept. Defaults to 10000, the original cutoff.

    Returns
    -------
    None
        The kept costs are appended to ``plos_cost``.

    Raises
    ------
    ValueError, TypeError
        If a value cannot be converted by ``float()``.
    """
    for i in x:
        cost = float(i)  # convert once and reuse for both the test and the append
        # NaN never satisfies <=, so missing costs are skipped as well.
        if cost <= max_cost:
            plos_cost.append(cost)

# Fill plos_cost with the PLoS One costs that pass the cutoff in plos_cost_mapper
plos_cost_mapper(plosone['Cost'])

In [233]:
# Retrieve the statistics of the 'Cost' column for PLoS One journal articles: #
# mean, median, then sample standard deviation, one value per line #

for summarize in (stats.mean, stats.median, stats.stdev):
    print(summarize(plos_cost))

936.4101005025126
897.19
194.6431021851858


In [234]:
# Let's repeat the process for each subsequent journal, #
# starting with the Journal of Biological Chemistry rows #

jobc = wellcome.loc[wellcome['Journal title clean'].eq('journal of biological chemistry')]
jobc

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean,Cost
16,22610094,AMBSB,journal of biological chemistry,Annexin-1 interaction with FPR2/ALX,£265.67,journal of biological chemistry,265.67
60,PMC3576085,American Soc for Biochemistry and Molecular Bi...,journal of biological chemistry,Understanding how noncatalytic carbohydrate bi...,£1100.00,journal of biological chemistry,1100.0
61,23239883 PMC3561570,American Society for Biochemistry and Molecula...,journal of biological chemistry,Molecular architecture and functional analysis...,£2259.64,journal of biological chemistry,2259.64
62,PMC3481300,American Society for Biochemistry and Molecula...,j biol chem.,Structure of the BMP receptor ALK2 and implica...,£1487.46,journal of biological chemistry,1487.46
63,PMC3234852,American Society for Biochemistry and Molecula...,j biol chem.,Inhibitory member of the apoptosis-stimulating...,£1531.08,journal of biological chemistry,1531.08
64,PMCID: PMC3642348,American Society for Biochemistry and Molecula...,journal of biological chemistry,Human and viral golgi anti-apoptotic protein (...,£1019.71,journal of biological chemistry,1019.71
65,PMID: 22992744 PMC3493908,American Society for Biochemistry and Molecula...,journal of biological chemistry,Dynamic exchange of myosin VI on endocytic str...,£1119.61,journal of biological chemistry,1119.61
66,PMCID: PMC3531748,American Society for Biochemistry and Molecula...,journal of biological chemistry,Uncoupling proteostasis and development in vit...,£1131.01,journal of biological chemistry,1131.01
67,PMCID: PMC3436574\n,American Society for Biochemistry and Molecula...,journal of biological chemistry,Structural Requirements for Recognition of Maj...,£1137.51,journal of biological chemistry,1137.51
68,PMID: 23223336 PMC3543027,American Society for Biochemistry and Molecula...,journal of biological chemistry,Visualization of structural changes accompanyi...,£1152.72,journal of biological chemistry,1152.72


In [235]:
# Empty list that will collect the Journal of Biological Chemistry costs
jobc_cost = list()

In [236]:
# Start from an empty list in the same cell that fills it, so re-running this
# cell cannot append the same costs a second time and distort the statistics.
jobc_cost = []


def jobc_cost_mapper(x, max_cost=10000):
    """Append the usable costs from ``x`` to the module-level ``jobc_cost`` list.

    Each value is converted with ``float()``; values greater than ``max_cost``
    are skipped.

    Parameters
    ----------
    x : iterable
        Cost values that ``float()`` can convert (for example numeric strings).
    max_cost : float, optional
        Largest cost that is kept. Defaults to 10000, the original cutoff.

    Returns
    -------
    None
        The kept costs are appended to ``jobc_cost``.

    Raises
    ------
    ValueError, TypeError
        If a value cannot be converted by ``float()``.
    """
    for i in x:
        cost = float(i)  # convert once and reuse for both the test and the append
        # NaN never satisfies <=, so missing costs are skipped as well.
        if cost <= max_cost:
            jobc_cost.append(cost)

# Fill jobc_cost with the Journal of Biological Chemistry costs that pass the cutoff in jobc_cost_mapper
jobc_cost_mapper(jobc['Cost'])

In [237]:
# Mean, median, then sample standard deviation of the kept costs, one per line
for summarize in (stats.mean, stats.median, stats.stdev):
    print(summarize(jobc_cost))

1384.221884057971
1314.53
392.0523215205254


In [238]:
# Rows whose cleaned journal title is NeuroImage
neuroimage = wellcome.loc[wellcome['Journal title clean'].eq('neuroimage')]
neuroimage

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean,Cost
437,23672768,Elseveier Science,neuroimage,Gearing up for action: attentive tracking dyna...,£1758.89,neuroimage,1758.89
438,23046981,Elseveier Science,neuroimage,Good practice for conducting and reporting MEG...,£2345.00,neuroimage,2345.0
689,PMC3734349\n,Elsevier,neuroimage,Connectivity-based neurofeedback: dynamic caus...,£1747.16,neuroimage,1747.16
690,PMC3734350\n,Elsevier,neuroimage,The impact of distractor congruency on stimulu...,£1760.94,neuroimage,1760.94
691,PMC3734351\n,Elsevier,neuroimage,Distinct encoding of risk and value in economi...,£1762.69,neuroimage,1762.69
692,PMC3734352\n,Elsevier,neuroimage,Parcellation of the human substantia nigra bas...,£1762.69,neuroimage,1762.69
693,,Elsevier,neuroimage,Characterising reward outcome signals in senso...,£1779.76,neuroimage,1779.76
694,PMC3677092,Elsevier,neuroimage,Neural correlates of working memory in Tempora...,£1961.25,neuroimage,1961.25
695,PMCID: PMC3021391,Elsevier,neuroimage,Dynamic causal modelling of effective connecti...,£2100.54,neuroimage,2100.54
696,PMCID: PMC2877799,Elsevier,neuroimage,Action selection: a race model for selected an...,£2118.57,neuroimage,2118.57


In [239]:
# Empty list that will collect the NeuroImage costs
neuroimage_cost = list()

In [240]:
# Start from an empty list in the same cell that fills it, so re-running this
# cell cannot append the same costs a second time and distort the statistics.
neuroimage_cost = []


def neuroimage_cost_mapper(x, max_cost=10000):
    """Append the usable costs from ``x`` to the module-level ``neuroimage_cost`` list.

    Each value is converted with ``float()``; values greater than ``max_cost``
    are skipped.

    Parameters
    ----------
    x : iterable
        Cost values that ``float()`` can convert (for example numeric strings).
    max_cost : float, optional
        Largest cost that is kept. Defaults to 10000, the original cutoff.

    Returns
    -------
    None
        The kept costs are appended to ``neuroimage_cost``.

    Raises
    ------
    ValueError, TypeError
        If a value cannot be converted by ``float()``.
    """
    for i in x:
        cost = float(i)  # convert once and reuse for both the test and the append
        # NaN never satisfies <=, so missing costs are skipped as well.
        if cost <= max_cost:
            neuroimage_cost.append(cost)

# Fill neuroimage_cost with the NeuroImage costs that pass the cutoff in neuroimage_cost_mapper
neuroimage_cost_mapper(neuroimage['Cost'])

In [241]:
# Mean, median, then sample standard deviation of the kept costs, one per line
for summarize in (stats.mean, stats.median, stats.stdev):
    print(summarize(neuroimage_cost))

2057.3180555555555
2289.245
466.8716104138233


In [242]:
# Rows whose cleaned journal title is National Academy of Sciences
naoc = wellcome.loc[wellcome['Journal title clean'].eq('national academy of sciences')]
naoc

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean,Cost
429,2766312,Dartmouth Journal Services,proceedings of the national academy of sciences,Analysis of Synthetic Lethality Reveals Geneti...,£1241.10,national academy of sciences,1241.1
430,PMC3511132,Dartmouth Journal Services,proceedings of the national academy of sciences,SGTA antagonizes BAG6-mediated protein triage,£603.42,national academy of sciences,603.42
431,3704016,Dartmouth Journal Services,proceedings of the national academy of sciences,Inositol kinase and its product accelerate wou...,£660.59,national academy of sciences,660.59
960,PMID23213218 PMC3529057,Journal of the American Physiological Proceedi...,national academy of sciences,Morphing between expressions dissociates conti...,£1052.99,national academy of sciences,1052.99
1003,PMCID: PMC3780889,National Academy of Sciences,pnas (proceedings of the national academy of s...,Activation of the canonical IKK complex by K63...,£853.64,national academy of sciences,853.64
1004,PMCID: PMC3465389,National Academy of Sciences,proceddings of the national academy of science...,Evidence that Viral RNAs have Evolved for Effi...,£619.83,national academy of sciences,619.83
1005,PMCID:\n PMC3670340\n,National Academy of Sciences,proceedings of the national academy of sciences,Systematic identification of conserved bacteri...,£395.60,national academy of sciences,395.6
1006,PMCID:\n PMC3479458\n,National Academy of Sciences,proceedings of the national academy of sciences,Structural basis for the recognition and cleav...,£605.17,national academy of sciences,605.17
1007,PMCID:\n PMC3529010,National Academy of Sciences,proceedings of the national academy of sciences,Interactions between the nucleosome histone co...,£614.95,national academy of sciences,614.95
1008,PMC3479523,National Academy of Sciences,proceedings of the national academy of sciences,Selectively altering belief formation in the h...,£617.79,national academy of sciences,617.79


In [243]:
# Empty list that will collect the National Academy of Sciences costs
naoc_cost = list()

In [244]:
# Start from an empty list in the same cell that fills it, so re-running this
# cell cannot append the same costs a second time and distort the statistics.
naoc_cost = []


def naoc_cost_mapper(x, max_cost=10000):
    """Append the usable costs from ``x`` to the module-level ``naoc_cost`` list.

    Each value is converted with ``float()``; values greater than ``max_cost``
    are skipped.

    Parameters
    ----------
    x : iterable
        Cost values that ``float()`` can convert (for example numeric strings).
    max_cost : float, optional
        Largest cost that is kept. Defaults to 10000, the original cutoff.

    Returns
    -------
    None
        The kept costs are appended to ``naoc_cost``.

    Raises
    ------
    ValueError, TypeError
        If a value cannot be converted by ``float()``.
    """
    for i in x:
        cost = float(i)  # convert once and reuse for both the test and the append
        # NaN never satisfies <=, so missing costs are skipped as well.
        if cost <= max_cost:
            naoc_cost.append(cost)

# Fill naoc_cost with the National Academy of Sciences costs that pass the cutoff in naoc_cost_mapper
naoc_cost_mapper(naoc['Cost'])

In [245]:
# Mean, median, then sample standard deviation of the kept costs, one per line
for summarize in (stats.mean, stats.median, stats.stdev):
    print(summarize(naoc_cost))

860.9425
762.3299999999999
497.20793077525065


In [246]:
# Rows whose cleaned journal title is Nucleic Acids Research
nucleic = wellcome.loc[wellcome['Journal title clean'].eq('nucleic acids research')]
nucleic

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean,Cost
1148,23595147 PMCID: PMC3675483,OUP,nucleic acids research,Nucleocapsid protein structures from orthobuny...,£1704.00,nucleic acids research,1704.0
1149,PMID: 23771140 /PMCID: PMC3753647,OUP,nucleic acids research,Impact of Target Site Distribution for Type I ...,£2184.00,nucleic acids research,2184.0
1150,3467080,OUP,nucleic acids research,Protein kinase CK2 inactivates PRH/Hhex using ...,£852.00,nucleic acids research,852.0
1151,3553950,OUP,nucleic acids research,The Type ISP Restriction-Modification enzymes ...,£852.00,nucleic acids research,852.0
1152,3553963,OUP,nucleic acids research,DNA cleavage by Type ISP Restriction-Modificat...,£852.00,nucleic acids research,852.0
1153,3592466,OUP,nucleic acids research,Organization of the BcgI restriction-modificat...,£852.00,nucleic acids research,852.0
1154,3592470,OUP,nucleic acids research,Organization of the BcgI restriction-modificat...,£852.00,nucleic acids research,852.0
1155,PMC3575838,OUP,nucleic acids research,Unwinding of primer-templates by archaeal fami...,£852.00,nucleic acids research,852.0
1156,PMC3627570\n\n\n\n\n\n,OUP,nucleic acids research,Resolving the polymorphism-in-probe problem is...,£852.00,nucleic acids research,852.0
1157,PMC3627603,OUP,nucleic acids research,Human SIRT1 regulates DNA-binding and stabilit...,£852.00,nucleic acids research,852.0


In [247]:
# Empty list that will collect the Nucleic Acids Research costs
nucleic_cost = list()

In [248]:
# Start from an empty list in the same cell that fills it, so re-running this
# cell cannot append the same costs a second time and distort the statistics.
nucleic_cost = []


def nucleic_cost_mapper(x, max_cost=10000):
    """Append the usable costs from ``x`` to the module-level ``nucleic_cost`` list.

    Each value is converted with ``float()``; values greater than ``max_cost``
    are skipped.

    Parameters
    ----------
    x : iterable
        Cost values that ``float()`` can convert (for example numeric strings).
    max_cost : float, optional
        Largest cost that is kept. Defaults to 10000, the original cutoff.

    Returns
    -------
    None
        The kept costs are appended to ``nucleic_cost``.

    Raises
    ------
    ValueError, TypeError
        If a value cannot be converted by ``float()``.
    """
    for i in x:
        cost = float(i)  # convert once and reuse for both the test and the append
        # NaN never satisfies <=, so missing costs are skipped as well.
        if cost <= max_cost:
            nucleic_cost.append(cost)

# Fill nucleic_cost with the Nucleic Acids Research costs that pass the cutoff in nucleic_cost_mapper
nucleic_cost_mapper(nucleic['Cost'])

In [249]:
# Mean, median, then sample standard deviation of the kept costs, one per line
for summarize in (stats.mean, stats.median, stats.stdev):
    print(summarize(nucleic_cost))

1162.344827586207
852.0
442.15093381769765
