In [2]:
# Module metadata: author (name plus ORCID, Twitter and GitHub links), license,
# version, contact e-mail and development status of this script.
__author__ = "Patrick Toschka https://orcid.org/0000-0003-2470-4590 https://twitter.com/PToschka https://github.com/Tishwings"
__license__ = "CCBY4.0 Patrick Toschka"
__version__ = "1.0"
__email__ = "patrick.toschka@adwmainz.de"
__status__ = "Development"

"""
This script parses the https://dhd2019.org/programm/ pages and returns all found events that are "valid" in one .ICS file that can be imported e.g. in Google Calendar.

Please feel free to use and improve!

Known Issues:
- Since not every div with class="programmblock" is built in the same way, the ICS only contains events that could be made up of 1 title, 1 URL and 1 place. E.g. the break events are not caught by this and some titles are missing.
- the regular expression to get the titles is not precise enough, e.g. a very long title might not be recognized
- the UIDs of the events are generated from the current time, so running the script again will produce new UIDs although most (or all) of the events will be the same. Improving this would be very helpful, e.g. Google Calendar would then only import new/updated events when the ICS is imported several times
- The timezone is not specified, so times will only be correct for visitors in the German timezone
- the for loop that composes VEVENT string is not very efficient
- all \n should be appended as \r\n right from the beginning
- current date is hard coded, could also be parsed from website
- maybe google calendar has an api that allows the script to directly push events into a calendar?
- URLs in VEVENTS still contain spaces and " at beginning and end, but they are functional
"""

#used for saving and downloading the .ICS-file in the end from Google Colab 
# from google.colab import files

#composes the VEVENTS for a link and a hardcoded date (YYYYMMDD)
def calculateDay(url,day):
  """Scrape one programme page and return its events as iCalendar VEVENT text.

  The page at ``url`` is downloaded and every ``div.programmblock`` element is
  scanned with regular expressions for a time span ("H:MM - H:MM"), titles
  (``<p>`` elements), places (the text directly in front of ``</span>``) and
  links (``href`` attribute values). A block only yields events when it has
  both a start and an end time and the same non-zero number of titles, places
  and links; these are then paired up by position. All other blocks are
  skipped.

  Args:
    url: Address of the programme page for a single day.
    day: Date of that day as ``YYYYMMDD``; used verbatim in DTSTART/DTEND.

  Returns:
    A string of zero or more VEVENT components. Lines are separated by a
    single line feed (the caller converts them to CRLF) and every component
    ends with a line feed. Times are written without a time zone.

  Raises:
    requests.RequestException: if the page cannot be fetched, including when
      the request times out.
  """
  import datetime
  import hashlib
  import re

  import requests
  from bs4 import BeautifulSoup

  def clock_digits(pattern, text):
    # First time matching `pattern` in `text` as zero-padded "HHMM", or None
    # when the block carries no such time.
    found = re.search(pattern, text)
    if found is None:
      return None
    return (found.group(1) + found.group(2)).zfill(4)

  def ics_text(value):
    # Escape the characters that RFC 5545 reserves inside TEXT values.
    # The backslash has to be handled first so the escapes are not doubled.
    value = value.replace('\\', '\\\\').replace(';', '\\;').replace(',', '\\,')
    return value.replace('\r', '').replace('\n', '\\n')

  # A timeout keeps the script from hanging forever on an unresponsive server.
  page = requests.get(url, timeout=30)

  ## soup only isolates the programme blocks; their content is picked apart with
  ## REGEX because the HTML5 contains non-closing tags
  soup = BeautifulSoup(page.content,"html.parser")

  # DTSTAMP has to be a UTC date-time; one value per run is sufficient.
  dtstamp = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')

  lines = []

  #iterate through all programblocks
  for programmblock in soup.select('div.programmblock'):
    block_html = str(programmblock)

    ### Search for the start- and endtime of the block
    starttime = clock_digits(r'(\d{1,2}):(\d{2}) -', block_html)
    endtime = clock_digits(r'- (\d{1,2}):(\d{2})', block_html)
    if starttime is None or endtime is None:
      # Without a time span no valid event can be built from this block.
      continue

    ### Search for all titles in the block
    titles = []
    for title in re.findall(r'<p>[\s\S]{,200}</p>', block_html):
      for fragment in ('<p>', '</p>', '<em>', '</em>', '<br>', '<br/>', '\n'):
        title = title.replace(fragment, '')
      titles.append(title.replace('  ',' ').replace('  ',' '))

    ### Search for all places (=Locations) in the block
    places = [
      hit.replace('</span>','').replace('>','')
      for hit in re.findall(r'>\S{1,}</span>', block_html)
    ]

    ### Search for all URLs (=Descriptions) in the block
    # The capture group yields the bare attribute value without its quotes,
    # and excluding '"' stops the match at the end of the attribute.
    links = re.findall(r'href="([^"\s]*)"', block_html)

    # Titles, places and links can only be paired when they line up 1:1:1.
    if not titles or not (len(titles) == len(places) == len(links)):
      continue

    ## compose VEVENTS
    for title, place, link in zip(titles, places, links):
      # Hashing the event's own data gives a UID that stays the same across
      # runs, so re-importing the calendar updates events instead of
      # duplicating them.
      fingerprint = '|'.join((day, starttime, endtime, title, place, link))
      uid = hashlib.sha1(fingerprint.encode('utf-8')).hexdigest()
      lines.extend((
        'BEGIN:VEVENT',
        'SUMMARY:' + ics_text(title),
        'UID:' + uid + 'PatrickToschka',
        'DTSTART:' + day + 'T' + starttime + '00',
        'DTEND:' + day + 'T' + endtime + '00',
        'DTSTAMP:' + dtstamp,
        'Location:' + ics_text(place),
        'DESCRIPTION:' + ics_text(link),
        'END:VEVENT',
      ))

  #returns set of VEVENTS in a string
  return ''.join(line + '\n' for line in lines)


#Call function for every day and page of the DHd week, results are strings that contain VEVENTS 
# Each call scrapes the programme page of one conference day (Monday to
# Friday) and returns that day's VEVENTs as a string.
ical1 = calculateDay('https://dhd2019.org/programm/mo','20190325')
ical2 = calculateDay('https://dhd2019.org/programm/di','20190326')
ical3 = calculateDay('https://dhd2019.org/programm/mi','20190327')
ical4 = calculateDay('https://dhd2019.org/programm/do','20190328')
ical5 = calculateDay('https://dhd2019.org/programm/fr','20190329')

#compose ical with header information and end tag
# The trailing line break matters: every iCalendar content line, including
# the last one, has to be terminated.
icalFinal = (
  'BEGIN:VCALENDAR\nPRODID:PatrickToschka\nVERSION:2.0\nMETHOD:PUBLISH\nCALSCALE:GREGORIAN\n'
  + ical1 + ical2 + ical3 + ical4 + ical5
  + 'END:VCALENDAR\n'
)

#replace all \n with \r\n, which is a requirement for ICS-files
icalFinal = icalFinal.replace('\n','\r\n')

#write file
# newline='' stops Python from translating the line breaks again (on Windows
# '\r\n' would otherwise be written as '\r\r\n'); UTF-8 is the default charset
# for iCalendar and keeps umlauts intact regardless of the platform locale.
with open("DHd2019_website_to_ics.ICS", "w", encoding="utf-8", newline="") as text_file:
  text_file.write(icalFinal)
print('test')

#download file
#files.download("DHd2019.ICS")
#print(icalFinal)

test
