# CAO Points Analysis

https://www.cao.ie/index.php?page=points&p=2021

In [1]:
# CSV reading and writing
import csv
# Dates and times
import datetime as dt
# Creating directories
import os
# Regular expressions
import re

# Package for making HTTP requests
import requests as rq

In [2]:
# Fetch the CAO points page. The timeout (in seconds) stops the notebook
# from hanging forever if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# Raise an HTTPError on a 4xx/5xx status so that an error page is never
# saved or parsed as if it were the points data.
resp.raise_for_status()
# Have a look
resp

<Response [200]>

<br>

## Save original data set

In [3]:
# Capture the moment of the download so each saved file gets its own name.
now = dt.datetime.now()

# Render the timestamp as YYYYMMDD_HHMMSS for use inside file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [4]:
# Timestamped location, inside the data folder, for the untouched html page.
path = f'data/cao2021_{nowstr}.html'

<br>

## Charset error on server


Technically, the server states the encoding as:
```
Content-Type: text/html; charset=iso-8859-1
```
However, one line uses `\x96`, which isn't defined in iso-8859-1.
Therefore, we decode with cp1252 instead, which is very similar
to iso-8859-1 but includes `\x96`.

In [5]:
# The server declares the wrong encoding for this page.
# Keep a record of the encoding currently set on the response before overriding it.
original_encoding = resp.encoding
# Override it with cp1252 so that the text of the response is decoded
# with the codec that includes the \x96 character used on the page.
resp.encoding = 'cp1252'

In [6]:
# Make sure the output folder named in the path exists; without this the
# open() call below fails when the folder has not been created yet.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Save the original html file.
# The encoding is given explicitly so the saved bytes do not depend on the
# platform's default locale encoding (which may be unable to encode some
# of the decoded characters).
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [7]:
# Shape of a course line:
#   group 1 - course code: two capital letters followed by three digits
#   (exactly two spaces)
#   group 2 - everything up to the final three digits (the course title)
#   group 3 - three digits (the points)
#   group 4 - an optional asterisk
#   then any number of trailing spaces.
course_line_pattern = r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *'

# Compiled once here so it can be reused for every line of the page.
re_course = re.compile(course_line_pattern)

In [8]:
# The file path for the csv file.
path = 'data/cao2021_csv_' + nowstr + '.csv'

# Counter for the course lines written to the csv file.
no_lines = 0

# Open the csv file for writing.
# - newline='' hands line-ending control to the csv writer.
# - encoding='utf-8' keeps the output independent of the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # The csv writer quotes any field that contains a comma or a quote, so
    # such a field cannot spill into extra columns. Rows end with '\n'.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through the lines of the response.
    for line in resp.iter_lines():
        # The lines arrive as bytes; decode them with the corrected charset.
        dline = line.decode('cp1252')
        # Match only the lines we want - the ones representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # Split the line on two or more spaces.
            linesplit = re.split('  +', dline)
            # Write the substrings as one csv row.
            writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.
