In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

Terminology:

Source publication (or source) is the EPA funded research that was chosen to scrape citation from.
Target is citation that are used in source publication ( that is extracted from source)

Direct graph. From each source node should be the same amount of edges to target nodes as items in the reference list in each source publication

**Source -> Target**

##Part 1: Web-scraping of publications info from epa website

BeautifulSoup was used to gather links to pdf and publication titles. Other metadata also were gathered but not used.  

In [None]:
pub_title=[]
pub_link =[]
year = []
ISBN =[]
pages=[]
filesize  = []
format = []
research_num = []


I wanted to have more control over what publication from epa website will be scraped so I  chose manually publications and added its link to this part.
In future it can be rewritten to automatic scrape of everythong with 'Research' in the beginning.

In [None]:
epa_urls = [ 'https://www.epa.ie/publications/research/environment--health/research-259.php',
             'https://www.epa.ie/publications/research/climate-change/research-346-national-risk-assessment-of-impacts-of-climate-change.php',
             'https://www.epa.ie/publications/research/climate-change/research-429-building-coastal-and-marine-resilience-in-ireland.php',
             'https://www.epa.ie/publications/research/climate-change/research-369.php',
             'https://www.epa.ie/publications/research/climate-change/research-362-evaluating-irelands-climate-policy-performance.php',
             'https://www.epa.ie/publications/research/climate-change/research-360-methodologies-for-financing-and-costing-of-climate-impacts-and-future-adaptation-actions-transport-networks-in-ireland.php',
             'https://www.epa.ie/publications/research/climate-change/research-357-irelands-atmospheric-composition-and-climate-change-network.php',
             'https://www.epa.ie/publications/research/climate-change/research-352-synthesis-of-literature-and-preliminary-modelling-relevant-to-society-wide-scenarios-for-effective-climate-change-mitigation-in-ireland.php'
]

In [None]:
'''
epa_urls is a list of publication link on epa website
'''
#just to check if it works
for i in range(len(epa_urls)):
  print(epa_urls[i])


https://www.epa.ie/publications/research/environment--health/research-259.php
https://www.epa.ie/publications/research/climate-change/research-346-national-risk-assessment-of-impacts-of-climate-change.php
https://www.epa.ie/publications/research/climate-change/research-429-building-coastal-and-marine-resilience-in-ireland.php
https://www.epa.ie/publications/research/climate-change/research-369.php
https://www.epa.ie/publications/research/climate-change/research-362-evaluating-irelands-climate-policy-performance.php
https://www.epa.ie/publications/research/climate-change/research-360-methodologies-for-financing-and-costing-of-climate-impacts-and-future-adaptation-actions-transport-networks-in-ireland.php
https://www.epa.ie/publications/research/climate-change/research-357-irelands-atmospheric-composition-and-climate-change-network.php
https://www.epa.ie/publications/research/climate-change/research-352-synthesis-of-literature-and-preliminary-modelling-relevant-to-society-wide-scenarios-

In [None]:
'''
Web-scraping from epa website
Code extracts year, ISBN, pages, filesizes, format, title and authors of each publications and saves to previously defined lists.

'''

for i in range(len(epa_urls)):
  epa_url = epa_urls[i]

  # Use requests to retrieve data from a given URL
  epa_response = requests.get(epa_url)
  # Parse the whole HTML page using BeautifulSoup
  epa_soup = BeautifulSoup(epa_response.text, 'html.parser')
  publication_title = [h.text for h in epa_soup.find_all("h2", attrs={"class": "publications__title"})]
  for i in publication_title:
    research =  i.split(': ',1)
    pub_title.append(research[1])
    research_num.append(research[0])



  pub_info=epa_soup.find_all("div", attrs={"class": "publications__panel__content"})
  # Find link to download
  publication_link = [link.get('href') for link in epa_soup.find_all('a', attrs={"aria-label": "Download this publication"})]
  pub_link.append(publication_link[0])


  table = [data.text for data in epa_soup.find_all("div", attrs={"class": "publications__panel__content"})]

  mylist = table[0].split('\n')[1:-4]

  mylist=[mylist[i].split(': ') for i in range(len(mylist))]

  year.append(mylist[0][1])
  ISBN.append(mylist[1][1])
  pages.append(mylist[2][1])
  filesize.append(mylist[3][1])
  format.append(mylist[4][1])


In [None]:
pub_title

['Detection, Toxicology, Environmental Fate and Risk Assessment of Nanoparticles in the Aquatic Environment (DeTER)',
 'National Risk Assessment of Impacts of Climate Change: Bridging the Gap to Adaptation Action',
 'Building Coastal and Marine Resilience in Ireland',
 'CIViC: Critical Infrastructure Vulnerability to Climate Change',
 'Evaluating Ireland’s Climate Policy Performance',
 'Methodologies for Financing and Costing of Climate Impacts and Future Adaptation Actions: Transport Networks in Ireland',
 'Ireland’s Atmospheric Composition and Climate Change Network',
 'Synthesis of Literature and Preliminary Modelling Relevant to Society-wide Scenarios for Effective Climate Change Mitigation in Ireland']

In [None]:
pub_link

['https://www.epa.ie/publications/research/environment--health/Research_Report_259.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_346.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_429.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_369.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_362.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_360.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_357.pdf',
 'https://www.epa.ie/publications/research/climate-change/Research_Report_352.pdf']

In [None]:
'''
Pandas DataFrame will be transformed from dictionary
'''

mydict = dict({'title': pub_title,
               'research_number': research_num,
               'year': year,
               'ISBN': ISBN,
               'filesize': filesize,
               'format': format,
               'link': pub_link
               })

print(mydict)

{'title': ['Detection, Toxicology, Environmental Fate and Risk Assessment of Nanoparticles in the Aquatic Environment (DeTER)', 'National Risk Assessment of Impacts of Climate Change: Bridging the Gap to Adaptation Action', 'Building Coastal and Marine Resilience in Ireland', 'CIViC: Critical Infrastructure Vulnerability to Climate Change', 'Evaluating Ireland’s Climate Policy Performance', 'Methodologies for Financing and Costing of Climate Impacts and Future Adaptation Actions: Transport Networks in Ireland', 'Ireland’s Atmospheric Composition and Climate Change Network', 'Synthesis of Literature and Preliminary Modelling Relevant to Society-wide Scenarios for Effective Climate Change Mitigation in Ireland'], 'research_number': ['Research 259', 'Research 346', 'Research 429', 'Research 369', 'Research 362', 'Research 360', 'Research 357', 'Research 352'], 'year': ['2018', '2020', '2023', '2021', '2021', '2021', '2020', '2020'], 'ISBN': ['978-1-84095-796-9', '978-1-84095-948-2', '978-

In [None]:
'''
transforme dictionary to dataframe
pub_df is dataframe of publications
'''
pub_df= pd.DataFrame(mydict)

In [None]:
pub_df

Unnamed: 0,title,research_number,year,ISBN,filesize,format,link
0,"Detection, Toxicology, Environmental Fate and ...",Research 259,2018,978-1-84095-796-9,"2,041 KB",pdf,https://www.epa.ie/publications/research/envir...
1,National Risk Assessment of Impacts of Climate...,Research 346,2020,978-1-84095-948-2,"5,131 KB",pdf,https://www.epa.ie/publications/research/clima...
2,Building Coastal and Marine Resilience in Ireland,Research 429,2023,978-1-80009-091-0,"5,085 KB",pdf,https://www.epa.ie/publications/research/clima...
3,CIViC: Critical Infrastructure Vulnerability t...,Research 369,2021,978-1-84095-986-4,"4,036 KB",pdf,https://www.epa.ie/publications/research/clima...
4,Evaluating Ireland’s Climate Policy Performance,Research 362,2021,978-1-84095-969-7,"1,432 KB",pdf,https://www.epa.ie/publications/research/clima...
5,Methodologies for Financing and Costing of Cli...,Research 360,2021,978-1-84095-967-3,"4,075 KB",pdf,https://www.epa.ie/publications/research/clima...
6,Ireland’s Atmospheric Composition and Climate ...,Research 357,2020,978-1-84095-962-8,"8,278 KB",pdf,https://www.epa.ie/publications/research/clima...
7,Synthesis of Literature and Preliminary Modell...,Research 352,2020,978-1-84095-956-7,"4,106 KB",pdf,https://www.epa.ie/publications/research/clima...


In [None]:

# saving the dataframe
pub_df.to_csv('webscraper-publications.csv')


# Part 2. Extract references from PDF

There are plenty of packages that extract plain text or references from pdf file.
Unfortunately,  refextractor didn`t work well with the reference formating style that mostly all epa researches have.
pdfextract, pdftotext, pdfx, PDF2text all works but not great for this task.
So I chose tika package that extracts content of pdf as plain text.

In [None]:
!pip install tika
from tika import parser



In [None]:
links = pub_df['link']

In [None]:
result=[]

In [None]:



'''main fuction to extract references that unites small functions
  receives @refstr that is content of pdf file and is presented as plain text
'''
def reference_extractor(refstr):

    reflist = find_references(refstr).split('\n\n')
    source_title = find_source_report_name(refstr)
    reflist = delete_n(reflist)
    create_not_ref = create_list_not_references(reflist)
    #delete from newlist all items that are in not_references
    reflist = [item for item in reflist if item not in create_not_ref]


    return reflist, source_title


def find_references(refstr):
  '''
  To scrape references from the file at first we should find them in the text.
  Usually, word 'references appear in text two times (in the List of Content and as Title of references chapter)
  So we are looking for list of references only in the second part of our text
  Important!
  It`s just an assumption that works for mostly all epa publication.
  '''
  new_str =''

  ref_start = refstr.find('References', len(refstr)//2,len(refstr))
  ref_end = refstr.find('Glossary',len(refstr)//2,len(refstr))
  ref_end2 = refstr.find('Acronyms and Annotations',len(refstr)//2,len(refstr))
  ref_end3 = refstr.find('Abbreviations',len(refstr)//2,len(refstr))
  '''
  When index of
  '''

  if ref_end!=-1:
    new_str = refstr[ref_start+ len('References '):ref_end]
  elif ref_end2!=-1:
    new_str = refstr[ref_start+ len('References '):ref_end2]
  elif ref_end3!=-1:
    new_str= refstr[ref_start+ len('References '):ref_end3]


  return new_str

def delete_n(reflist):
  newlist = []
  for i in range(len(reflist)):
    reflist_split=reflist[i].split('\n')
    reflist_join = " ".join(reflist_split)
    newlist.append(reflist_join)
  return newlist

def create_list_not_references(newlist):
  not_references = []
  for i in range(len(newlist)-1):
    if((len(newlist[i])<4) and (len(newlist[i])>0)):
      not_references.append(newlist[i])
      not_references.append(newlist[i+1])
      not_references.append(newlist[i-1])
  return not_references


def find_source_report_name(str_link1):
  str_link1=str_link1.split('\n\n')
  str_link1 = " ".join(str_link1)
  source_name_begin = str_link1.find('Report')
  source_name_end = str_link1.find('Authors')
  source_name= str_link1[source_name_begin:source_name_end]
  return source_name

Changes:
Now it works for any amount of links given

In [None]:
pub_df

Unnamed: 0,title,research_number,year,ISBN,filesize,format,link
0,"Detection, Toxicology, Environmental Fate and ...",Research 259,2018,978-1-84095-796-9,"2,041 KB",pdf,https://www.epa.ie/publications/research/envir...
1,National Risk Assessment of Impacts of Climate...,Research 346,2020,978-1-84095-948-2,"5,131 KB",pdf,https://www.epa.ie/publications/research/clima...
2,Building Coastal and Marine Resilience in Ireland,Research 429,2023,978-1-80009-091-0,"5,085 KB",pdf,https://www.epa.ie/publications/research/clima...
3,CIViC: Critical Infrastructure Vulnerability t...,Research 369,2021,978-1-84095-986-4,"4,036 KB",pdf,https://www.epa.ie/publications/research/clima...
4,Evaluating Ireland’s Climate Policy Performance,Research 362,2021,978-1-84095-969-7,"1,432 KB",pdf,https://www.epa.ie/publications/research/clima...
5,Methodologies for Financing and Costing of Cli...,Research 360,2021,978-1-84095-967-3,"4,075 KB",pdf,https://www.epa.ie/publications/research/clima...
6,Ireland’s Atmospheric Composition and Climate ...,Research 357,2020,978-1-84095-962-8,"8,278 KB",pdf,https://www.epa.ie/publications/research/clima...
7,Synthesis of Literature and Preliminary Modell...,Research 352,2020,978-1-84095-956-7,"4,106 KB",pdf,https://www.epa.ie/publications/research/clima...


## Regular expression



In [None]:
source_target = []
target_count= []
source_titles = []
for i in range(len(links)):
  ref = reference_extractor(parser.from_file(links[i])['content'])
  #every source are concatened with target and 000 (marker) is added between for easier division later
  [ source_target.append(ref[1]+ '000' + j) for j in ref[0]]
  target_count.append(str(len(ref[0])))
  source_titles.append(ref[1])
  df1 = pd.DataFrame(source_target, columns=['targetdf'])
  #split source from targetdf with help of 000 marker
  df1['Source'] = df1['targetdf'].str.split('000', 1).str[0].str.strip()
  #preparation rows before extracting authors names from references
  df1['authors'] = df1['targetdf'].str.split('000', 1).str[1].str.strip()
  #extract 4 numbers that usually are year, in some cases doesnot work properly
  df1['year'] = df1['authors'].str.extract('(\d\d\d\d)', expand=True)
  #author are placed in one column
  df1['authors'] = df1['authors'].str.split('\s\d{4}\w*.\s*', 1).str[0].str.strip()
  df1['titleandextra'] = df1['targetdf'].str.split('\s\d{4}\w*.\s*', 1).str[1].str.strip()

  df1['Target'] = df1['titleandextra'].str.split('[^\d.\d°C]\.\s?', 1).str[0].str.strip()
  df1['extra'] = df1['titleandextra'].str.split('\?|\.\s?', 1).str[1].str.strip()

2023-08-21 13:07:36,101 [MainThread  ] [INFO ]  Retrieving https://www.epa.ie/publications/research/environment--health/Research_Report_259.pdf to /tmp/publications-research-environment-health-research_report_259.pdf.
INFO:tika.tika:Retrieving https://www.epa.ie/publications/research/environment--health/Research_Report_259.pdf to /tmp/publications-research-environment-health-research_report_259.pdf.
  df1['Source'] = df1['targetdf'].str.split('000', 1).str[0].str.strip()
  df1['authors'] = df1['targetdf'].str.split('000', 1).str[1].str.strip()
  df1['authors'] = df1['authors'].str.split('\s\d{4}\w*.\s*', 1).str[0].str.strip()
  df1['titleandextra'] = df1['targetdf'].str.split('\s\d{4}\w*.\s*', 1).str[1].str.strip()
  df1['Target'] = df1['titleandextra'].str.split('[^\d.\d°C]\.\s?', 1).str[0].str.strip()
  df1['extra'] = df1['titleandextra'].str.split('\?|\.\s?', 1).str[1].str.strip()
2023-08-21 13:07:38,931 [MainThread  ] [INFO ]  Retrieving https://www.epa.ie/publications/research/cli

I want to know how many references will be wiped out during cleaning.
To the first dataframe that consists of web scraped information I added num_of_ref.
num_of_ref - is variabe that holds how many references source publication had.
Because of nature of Bibliometric analysis, it`s not effective to leave all references, especially those that lead to websites, articles of goverment regulations and law.


In [None]:
pub_df['num_of_ref'] = target_count
pub_df['isEPA'] = 1
pub_df['source_title_from_ref'] = source_titles #this variable to check if tika module works correcrtly

In [None]:
pub_df

Unnamed: 0,title,research_number,year,ISBN,filesize,format,link,num_of_ref,isEPA,source_title_from_ref
0,"Detection, Toxicology, Environmental Fate and ...",Research 259,2018,978-1-84095-796-9,"2,041 KB",pdf,https://www.epa.ie/publications/research/envir...,134,1,"Report No. Detection, Toxicology, Environmenta..."
1,National Risk Assessment of Impacts of Climate...,Research 346,2020,978-1-84095-948-2,"5,131 KB",pdf,https://www.epa.ie/publications/research/clima...,174,1,Report No.346 National Risk Assessment of \nIm...
2,Building Coastal and Marine Resilience in Ireland,Research 429,2023,978-1-80009-091-0,"5,085 KB",pdf,https://www.epa.ie/publications/research/clima...,177,1,Report No. Building Coastal and Marine \nResi...
3,CIViC: Critical Infrastructure Vulnerability t...,Research 369,2021,978-1-84095-986-4,"4,036 KB",pdf,https://www.epa.ie/publications/research/clima...,107,1,Report No.369 CIViC: Critical \nInfrastructure...
4,Evaluating Ireland’s Climate Policy Performance,Research 362,2021,978-1-84095-969-7,"1,432 KB",pdf,https://www.epa.ie/publications/research/clima...,67,1,Report No.362 Evaluating Ireland’s Climate \nP...
5,Methodologies for Financing and Costing of Cli...,Research 360,2021,978-1-84095-967-3,"4,075 KB",pdf,https://www.epa.ie/publications/research/clima...,57,1,Report No.360 Methodologies for Financing and ...
6,Ireland’s Atmospheric Composition and Climate ...,Research 357,2020,978-1-84095-962-8,"8,278 KB",pdf,https://www.epa.ie/publications/research/clima...,14,1,Report No.357 Ireland’s Atmospheric Compositio...
7,Synthesis of Literature and Preliminary Modell...,Research 352,2020,978-1-84095-956-7,"4,106 KB",pdf,https://www.epa.ie/publications/research/clima...,247,1,Report No.352 Synthesis of Literature and Prel...


In [None]:
'''
this have no direct connection to other code. I wanted to add small table about sources to the github page

we can create a html table from DataFrame to add to website where graph will be published
'''

pubdf1 = pub_df[['title', 'year', 'num_of_ref', 'research_number', 'link']]
pubdf1.to_html()


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>title</th>\n      <th>year</th>\n      <th>num_of_ref</th>\n      <th>research_number</th>\n      <th>link</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Detection, Toxicology, Environmental Fate and Risk Assessment of Nanoparticles in the Aquatic Environment (DeTER)</td>\n      <td>2018</td>\n      <td>134</td>\n      <td>Research 259</td>\n      <td>https://www.epa.ie/publications/research/environment--health/Research_Report_259.pdf</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>National Risk Assessment of Impacts of Climate Change: Bridging the Gap to Adaptation Action</td>\n      <td>2020</td>\n      <td>174</td>\n      <td>Research 346</td>\n      <td>https://www.epa.ie/publications/research/climate-change/Research_Report_346.pdf</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Building Coastal and Marine Resilience in Irela

In [None]:
df1

Unnamed: 0,targetdf,Source,authors,year,titleandextra,Target,extra
0,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Abou El-Nour, K.M.M.M., Eftaiha, A.A., Al-Wart...",2010,Synthesis and applications of silver nanopart...,Synthesis and applications of silver nanopart...,Arabian Journal of Chemistry 3: 135–140.
1,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Akaighe, N., Maccuspie, R.I., Navarro, D.A., A...",2011,Humic acid-induced silver nanoparticle format...,Humic acid-induced silver nanoparticle format...,Environmental Science & Technology 45: 3895–3...
2,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Alexander, J.W.,",2009,History of the medical use of silver. Surgica...,History of the medical use of silve,Surgical infections 10: 289–292.
3,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Amato, E., Diaz-Fernandez, Y.A., Taglietti, A....",2011,"Synthesis, characterization and antibacterial ...","Synthesis, characterization and antibacterial ...",Langmuir 27: 9165–9173.
4,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Anderson, R.A.,",2005,Algal Culturing Techniques. Elsevier Academic...,Algal Culturing Technique,Elsevier Academic Press/Phycological Society o...
...,...,...,...,...,...,...,...
972,Report No.352 Synthesis of Literature and Prel...,Report No.352 Synthesis of Literature and Prel...,"World Bank,",2019,World GDP data. Available online: http://api....,World GDP dat,Available online: http://api.worldbank.org/v2...
973,Report No.352 Synthesis of Literature and Prel...,Report No.352 Synthesis of Literature and Prel...,"Zoloth, L.,",2017,At the last well on Earth: climate change is ...,At the last well on Earth: climate change is ...,Journal of Feminist Studies in Religion 33: 1...
974,Report No.352 Synthesis of Literature and Prel...,Report No.352 Synthesis of Literature and Prel...,"Zscheischler, J., Westra, S., van den Hurk, B....",2018,Future climate risk from compound events. Nat...,Future climate risk from compound event,Nature Climate Change 8: 469–477. https://doi....
975,Report No.352 Synthesis of Literature and Prel...,Report No.352 Synthesis of Literature and Prel...,AD Anaerobic digestion BAU Business as usual C...,2019,Climate Action Plan CBA Cost–benefit analysis ...,Climate Action Plan CBA Cost–benefit analysis ...,


In [None]:
'''
is_EPA - boolean that True if Target are EPA funded, False if not (or maybe we don`t have this informatin in the reference )

Is true if Environmental Protection
                EPA
                Johnstown Castle
will be found in the last part of references after authors.
'''
reg_exp = r'(Environmental\sProtection)|(EPA)|(Johnstown\sCastle)'
df1['isEPA'] = np.where(df1['extra'].str.match(reg_exp)!=True, 0, 1)

In [None]:
df1[df1['isEPA']==1]

Unnamed: 0,targetdf,Source,authors,year,titleandextra,Target,extra,isEPA
18,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","Cunningham, S. and Joshi, L.,",2015,Assessment of Exposure of Marine and Freshwat...,Assessment of Exposure of Marine and Freshwat...,"Environmental Protection Agency, Johnstown Ca...",1
33,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","EPA (Environmental Protection Agency),",2002,"Water Treatment Manuals: Coagulation, Floccul...","Water Treatment Manuals: Coagulation, Floccul...","EPA, Johnstown Castle, Ireland.",1
34,"Report No. Detection, Toxicology, Environmenta...","Report No. Detection, Toxicology, Environmenta...","EPA (Environmental Protection Agency),",2011,Water Framework Status Update Based on Monito...,Water Framework Status Update Based on Monito...,"EPA, Johnstown Castle, Ireland.",1
142,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Bullock, C., Fealy, R., Clinch, J.P. and O’She...",2015,ADAPT: Quantifying the Costs and Benefits Ass...,ADAPT: Quantifying the Costs and Benefits Ass...,"Environmental Protection Agency, Johnstown Cas...",1
152,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Coll, J. and Sweeney, J.,",2013,Current and Future Vulnerabilities to Climate...,Current and Future Vulnerabilities to Climate...,"Environmental Protection Agency, Johnstown Cas...",1
153,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Coll, J., Bourke, D., Gormally, M., Sheehy Ske...",2012,Winners and Losers: Climate Change Impacts on...,Winners and Losers: Climate Change Impacts on...,"Environmental Protection Agency, Johnstown Cas...",1
177,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Desmond, M., O’Brien, P. and McGovern, F.,",2017,A Summary of the State of Knowledge on Climate...,A Summary of the State of Knowledge on Climate...,"Environmental Protection Agency, Johnstown Ca...",1
182,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Donnelly, A. and O’Neill, B.,",2013,Climate Change Impacts on Phenology: Implicat...,Climate Change Impacts on Phenology: Implicat...,"Environmental Protection Agency, Johnstown Ca...",1
187,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Dwyer, N.,",2012,"The Status of Ireland’s Climate, 2012. Enviro...","The Status of Ireland’s Climate, 2012. Enviro...","Environmental Protection Agency, Johnstown Ca...",1
217,Report No.346 National Risk Assessment of \nIm...,Report No.346 National Risk Assessment of \nIm...,"Hall, J., Murphy, C. and Sweeney, J.,",2012,Robust Adaptation to Climate Change in the Wa...,Robust Adaptation to Climate Change in the Wa...,"Environmental Protection Agency, Johnstown Ca...",1


In [None]:
df1 = df1[['Source', 'Target', 'isEPA', 'authors', 'extra', 'targetdf','year' ]]
df1

Unnamed: 0,Source,Target,isEPA,authors,extra,targetdf,year
0,"Report No. Detection, Toxicology, Environmenta...",Synthesis and applications of silver nanopart...,0,"Abou El-Nour, K.M.M.M., Eftaiha, A.A., Al-Wart...",Arabian Journal of Chemistry 3: 135–140.,"Report No. Detection, Toxicology, Environmenta...",2010
1,"Report No. Detection, Toxicology, Environmenta...",Humic acid-induced silver nanoparticle format...,0,"Akaighe, N., Maccuspie, R.I., Navarro, D.A., A...",Environmental Science & Technology 45: 3895–3...,"Report No. Detection, Toxicology, Environmenta...",2011
2,"Report No. Detection, Toxicology, Environmenta...",History of the medical use of silve,0,"Alexander, J.W.,",Surgical infections 10: 289–292.,"Report No. Detection, Toxicology, Environmenta...",2009
3,"Report No. Detection, Toxicology, Environmenta...","Synthesis, characterization and antibacterial ...",0,"Amato, E., Diaz-Fernandez, Y.A., Taglietti, A....",Langmuir 27: 9165–9173.,"Report No. Detection, Toxicology, Environmenta...",2011
4,"Report No. Detection, Toxicology, Environmenta...",Algal Culturing Technique,0,"Anderson, R.A.,",Elsevier Academic Press/Phycological Society o...,"Report No. Detection, Toxicology, Environmenta...",2005
...,...,...,...,...,...,...,...
972,Report No.352 Synthesis of Literature and Prel...,World GDP dat,0,"World Bank,",Available online: http://api.worldbank.org/v2...,Report No.352 Synthesis of Literature and Prel...,2019
973,Report No.352 Synthesis of Literature and Prel...,At the last well on Earth: climate change is ...,0,"Zoloth, L.,",Journal of Feminist Studies in Religion 33: 1...,Report No.352 Synthesis of Literature and Prel...,2017
974,Report No.352 Synthesis of Literature and Prel...,Future climate risk from compound event,0,"Zscheischler, J., Westra, S., van den Hurk, B....",Nature Climate Change 8: 469–477. https://doi....,Report No.352 Synthesis of Literature and Prel...,2018
975,Report No.352 Synthesis of Literature and Prel...,Climate Action Plan CBA Cost–benefit analysis ...,0,AD Anaerobic digestion BAU Business as usual C...,,Report No.352 Synthesis of Literature and Prel...,2019


In [None]:
#There is 42 epa funded publications

df1[df1['isEPA']==1]['Source'].count()

42

In [None]:
df1.groupby([df1['isEPA'], 'Source'])['Target'].count()[1][0:]

Source
Report No.  Building Coastal and Marine \nResilience in Ireland                                                                                            6
Report No. Detection, Toxicology, Environmental Fate \nand Risk Assessment of Nanoparticles in the  Aquatic Environment (DeTER)                            3
Report No.346 National Risk Assessment of \nImpacts of Climate Change: Bridging  the Gap to Adaptation Action                                             14
Report No.352 Synthesis of Literature and Preliminary Modelling\nRelevant to Society-wide Scenarios for Effective Climate Change Mitigation in Ireland     6
Report No.357 Ireland’s Atmospheric Composition \nand Climate Change Network                                                                               1
Report No.360 Methodologies for Financing and Costing \nof Climate Impacts and Future Adaptation  Actions: Transport Networks in Ireland                   4
Report No.362 Evaluating Ireland’s Climate \nPolicy

Important note!
Sometimes Gephi ignore headers from csv files, so it's better to open csv in Excel, then save again as scv utf-8 (maybe try freeze header if still not visible in Gephi).
Sometimes Excel doesn't read some spanish names or unusual signs, so adding utf-8-sig may be helpful.

In [None]:
df1.to_csv('citation_test_dataset_one_cycle.csv',encoding='utf-8-sig', header=df1.columns)

In [None]:
'''

List of warning sent bu gephi:


Issue{message=Node id='Report No.  Building Coastal and Marine , level=WARNING}	WARNING
Issue{message=Node id='Report No. Detection, Toxicology, Environmental Fate , level=WARNING}	WARNING
Issue{message=Node id='Report No.346 National Risk Assessment of , level=WARNING}	WARNING
Issue{message=Node id='Report No.352 Synthesis of Literature and Preliminary Modelling, level=WARNING}	WARNING
Issue{message=Node id='Report No.357 Ireland’s Atmospheric Composition , level=WARNING}	WARNING
Issue{message=Node id='Report No.360 Methodologies for Financing and Costing , level=WARNING}	WARNING
Issue{message=Node id='Report No.362 Evaluating Ireland’s Climate , level=WARNING}	WARNING
Issue{message=Node id='Report No.369 CIViC: Critical , level=WARNING}	WARNING
Issue{message=Parallel edges detected, remember to choose a merge strategy, level=INFO}	INFO
Issue{message=[Record #550] Ignoring edge due to empty source and/or target node ids, level=SEVERE}	SEVERE
Issue{message=[Record #659] Ignoring edge due to empty source and/or target node ids, level=SEVERE}	SEVERE
Issue{message=[Record #721] Ignoring edge due to empty source and/or target node ids, level=SEVERE}	SEVERE
Issue{message=[Record #977] Ignoring edge due to empty source and/or target node ids, level=SEVERE}	SEVERE

'''




Air
Biodiversity
Circular economy
Climate Change
Environment & Health
Environmental Technologies
Land use, soils and transport
Socio-economics
Waste
Water

## Step3: Build a test network of the citation
 2) An authors network, that shows what authors are cited the most
 Unfortunately, I can't solve it now. Some papers have 7+ authors and it`s hard to find nice way to built graph from them to whom? From a

In [None]:
df1

Unnamed: 0,Source,Target,isEPA,authors,extra,targetdf,year
0,"Report No. Detection, Toxicology, Environmenta...",Synthesis and applications of silver nanopart...,0,"Abou El-Nour, K.M.M.M., Eftaiha, A.A., Al-Wart...",Arabian Journal of Chemistry 3: 135–140.,"Report No. Detection, Toxicology, Environmenta...",2010
1,"Report No. Detection, Toxicology, Environmenta...",Humic acid-induced silver nanoparticle format...,0,"Akaighe, N., Maccuspie, R.I., Navarro, D.A., A...",Environmental Science & Technology 45: 3895–3...,"Report No. Detection, Toxicology, Environmenta...",2011
2,"Report No. Detection, Toxicology, Environmenta...",History of the medical use of silve,0,"Alexander, J.W.,",Surgical infections 10: 289–292.,"Report No. Detection, Toxicology, Environmenta...",2009
3,"Report No. Detection, Toxicology, Environmenta...","Synthesis, characterization and antibacterial ...",0,"Amato, E., Diaz-Fernandez, Y.A., Taglietti, A....",Langmuir 27: 9165–9173.,"Report No. Detection, Toxicology, Environmenta...",2011
4,"Report No. Detection, Toxicology, Environmenta...",Algal Culturing Technique,0,"Anderson, R.A.,",Elsevier Academic Press/Phycological Society o...,"Report No. Detection, Toxicology, Environmenta...",2005
...,...,...,...,...,...,...,...
972,Report No.352 Synthesis of Literature and Prel...,World GDP dat,0,"World Bank,",Available online: http://api.worldbank.org/v2...,Report No.352 Synthesis of Literature and Prel...,2019
973,Report No.352 Synthesis of Literature and Prel...,At the last well on Earth: climate change is ...,0,"Zoloth, L.,",Journal of Feminist Studies in Religion 33: 1...,Report No.352 Synthesis of Literature and Prel...,2017
974,Report No.352 Synthesis of Literature and Prel...,Future climate risk from compound event,0,"Zscheischler, J., Westra, S., van den Hurk, B....",Nature Climate Change 8: 469–477. https://doi....,Report No.352 Synthesis of Literature and Prel...,2018
975,Report No.352 Synthesis of Literature and Prel...,Climate Action Plan CBA Cost–benefit analysis ...,0,AD Anaerobic digestion BAU Business as usual C...,,Report No.352 Synthesis of Literature and Prel...,2019


In [None]:
#show and delete where Target = NaN
df1 = df1.dropna(subset='Target')

Report No.352 Synthesis of Literature and Preliminary Modelling Relevant to Society-wide Scenarios for Effective Climate Change Mitigation in Ireland 000Gusenbauer, M., 2019. Google Scholar to overshadow them all? Comparing the sizes of 12 academic search engines and bibliographic databases. Scientometrics 118: 177–214. https://doi.org/10.1007/ s11192-018-2958-5

-------------------
Target: Google Scholar to overshadow them all
Target must be: Google Scholar to overshadow them all? Comparing the sizes of 12 academic search engines and bibliographic databases.





In [None]:
df1["Target_Length"]= df1["Target"].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Target_Length"]= df1["Target"].str.len()


In [None]:
df1

Unnamed: 0,Source,Target,isEPA,authors,extra,targetdf,year,Target_Length
0,"Report No. Detection, Toxicology, Environmenta...",Synthesis and applications of silver nanopart...,0,"Abou El-Nour, K.M.M.M., Eftaiha, A.A., Al-Wart...",Arabian Journal of Chemistry 3: 135–140.,"Report No. Detection, Toxicology, Environmenta...",2010,50
1,"Report No. Detection, Toxicology, Environmenta...",Humic acid-induced silver nanoparticle format...,0,"Akaighe, N., Maccuspie, R.I., Navarro, D.A., A...",Environmental Science & Technology 45: 3895–3...,"Report No. Detection, Toxicology, Environmenta...",2011,91
2,"Report No. Detection, Toxicology, Environmenta...",History of the medical use of silve,0,"Alexander, J.W.,",Surgical infections 10: 289–292.,"Report No. Detection, Toxicology, Environmenta...",2009,36
3,"Report No. Detection, Toxicology, Environmenta...","Synthesis, characterization and antibacterial ...",0,"Amato, E., Diaz-Fernandez, Y.A., Taglietti, A....",Langmuir 27: 9165–9173.,"Report No. Detection, Toxicology, Environmenta...",2011,150
4,"Report No. Detection, Toxicology, Environmenta...",Algal Culturing Technique,0,"Anderson, R.A.,",Elsevier Academic Press/Phycological Society o...,"Report No. Detection, Toxicology, Environmenta...",2005,25
...,...,...,...,...,...,...,...,...
971,Report No.352 Synthesis of Literature and Prel...,Mental maps of the future: an intuitive logic...,0,"Wilson, I.,","In Fahey, L. and Robert M Randall, R.M. (eds)...",Report No.352 Synthesis of Literature and Prel...,1998,68
972,Report No.352 Synthesis of Literature and Prel...,World GDP dat,0,"World Bank,",Available online: http://api.worldbank.org/v2...,Report No.352 Synthesis of Literature and Prel...,2019,13
973,Report No.352 Synthesis of Literature and Prel...,At the last well on Earth: climate change is ...,0,"Zoloth, L.,",Journal of Feminist Studies in Religion 33: 1...,Report No.352 Synthesis of Literature and Prel...,2017,61
974,Report No.352 Synthesis of Literature and Prel...,Future climate risk from compound event,0,"Zscheischler, J., Westra, S., van den Hurk, B....",Nature Climate Change 8: 469–477. https://doi....,Report No.352 Synthesis of Literature and Prel...,2018,39


delete all Titles of publications that are shorter than 5 characters or longer then 350 characters
Usually, it is exactly that rows that were hard to divide into columns correctly.
So it`s basically cleaning
But for future it would better to find the way to prepare that data.

In [None]:
df1.drop(df1.loc[df1['Target_Length']<5].index, inplace=True)
df1.drop(df1.loc[df1['Target_Length']>350].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(df1.loc[df1['Target_Length']<5].index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(df1.loc[df1['Target_Length']>350].index, inplace=True)


In [None]:
df1

Unnamed: 0,Source,Target,isEPA,authors,extra,targetdf,year,Target_Length
0,"Report No. Detection, Toxicology, Environmenta...",Synthesis and applications of silver nanopart...,0,"Abou El-Nour, K.M.M.M., Eftaiha, A.A., Al-Wart...",Arabian Journal of Chemistry 3: 135–140.,"Report No. Detection, Toxicology, Environmenta...",2010,50
1,"Report No. Detection, Toxicology, Environmenta...",Humic acid-induced silver nanoparticle format...,0,"Akaighe, N., Maccuspie, R.I., Navarro, D.A., A...",Environmental Science & Technology 45: 3895–3...,"Report No. Detection, Toxicology, Environmenta...",2011,91
2,"Report No. Detection, Toxicology, Environmenta...",History of the medical use of silve,0,"Alexander, J.W.,",Surgical infections 10: 289–292.,"Report No. Detection, Toxicology, Environmenta...",2009,36
3,"Report No. Detection, Toxicology, Environmenta...","Synthesis, characterization and antibacterial ...",0,"Amato, E., Diaz-Fernandez, Y.A., Taglietti, A....",Langmuir 27: 9165–9173.,"Report No. Detection, Toxicology, Environmenta...",2011,150
4,"Report No. Detection, Toxicology, Environmenta...",Algal Culturing Technique,0,"Anderson, R.A.,",Elsevier Academic Press/Phycological Society o...,"Report No. Detection, Toxicology, Environmenta...",2005,25
...,...,...,...,...,...,...,...,...
970,Report No.352 Synthesis of Literature and Prel...,Chair’s Final Report – UK Net-Zero Advisory G...,0,"Watson, J.,",UK Net-Zero Advisory Group to the Committee o...,Report No.352 Synthesis of Literature and Prel...,2019,84
971,Report No.352 Synthesis of Literature and Prel...,Mental maps of the future: an intuitive logic...,0,"Wilson, I.,","In Fahey, L. and Robert M Randall, R.M. (eds)...",Report No.352 Synthesis of Literature and Prel...,1998,68
972,Report No.352 Synthesis of Literature and Prel...,World GDP dat,0,"World Bank,",Available online: http://api.worldbank.org/v2...,Report No.352 Synthesis of Literature and Prel...,2019,13
973,Report No.352 Synthesis of Literature and Prel...,At the last well on Earth: climate change is ...,0,"Zoloth, L.,",Journal of Feminist Studies in Religion 33: 1...,Report No.352 Synthesis of Literature and Prel...,2017,61


##4 Creating dynamic graph to show many old or new sources are used

In [None]:
#convert year to data year and sort by year



In [None]:
df1.to_csv('21082023citation_test_dataset.csv',encoding='utf-8-sig', header=df1.columns)

In [None]:
df1['year'].unique()

array(['2010', '2011', '2009', '2005', '2013', '2014', '2012', '2008',
       '2015', '2016', '2006', '2017', '2018', '2002', '2007', '1996',
       '2001', '2019', '1997', '1998', '2020', '1992', '1995', '1991',
       '2000', '2003', '1942', '1972', '1976', '1999', '2004', '1904',
       '2021', '2022', '1973', '2030', '1983', '1990', '1921', '1993'],
      dtype=object)

In [None]:
df1['Source'].unique()

array(['Report No. Detection, Toxicology, Environmental Fate \nand Risk Assessment of Nanoparticles in the  Aquatic Environment (DeTER)',
       'Report No.346 National Risk Assessment of \nImpacts of Climate Change: Bridging  the Gap to Adaptation Action',
       'Report No.  Building Coastal and Marine \nResilience in Ireland',
       'Report No.369 CIViC: Critical \nInfrastructure Vulnerability  to Climate Change',
       'Report No.362 Evaluating Ireland’s Climate \nPolicy Performance',
       'Report No.360 Methodologies for Financing and Costing \nof Climate Impacts and Future Adaptation  Actions: Transport Networks in Ireland',
       'Report No.357 Ireland’s Atmospheric Composition \nand Climate Change Network',
       'Report No.352 Synthesis of Literature and Preliminary Modelling\nRelevant to Society-wide Scenarios for Effective Climate Change Mitigation in Ireland'],
      dtype=object)