# CAO Points Analysis

***

In [1]:
# Regular expressions
import re

# Convenient HTTP request.
import requests as rq

# Dates and Times
import datetime as dt

# Data Frame
import pandas as pd

# For downloading
import urllib.request as urlrq

In [2]:
# Get the current data and time
now = dt.datetime.now()

# Format as a string.
nowstr = now.strftime('%y%m%d_%H%M%S')

<br>

## 2021 CAO POINTS

http://www2.cao.ie/points/l8.php

***

In [3]:
# Fetch the CAO points url
resp = rq.get('http://www2.cao.ie/points/l8.php')
resp

<Response [200]>

# Save Original Data Set

***

In [4]:
# Create a file path for the original data
pathhtml = 'data/cao2021_' + nowstr + '.html'

# Error on Server

Technically, the server says we should decode as per:

'''
Content-Type: text/html; charset=iso-8859-1
'''

However, one line uses \x96 which isn't defined in iso-8859-1.

Therefore we use the similar decoding standard cp1252, which is very similar but includes #x96.

***

In [5]:
# The server uses the wrong encoding, fix it
original_encoding = resp.encoding

#Change to cp1252
resp.encoding = 'cp1252'

In [6]:
# Save the orignal html file
with open(pathhtml, 'w') as f:
    f.write(resp.text)

## Use regular expressions to slect lines we want

In [7]:
# Compile the regular expression for matching lines
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)[0-9]{3}(\*?) *')

### Loop through the lines of the repsonse

In [9]:
# The file path for the csv file
path = 'data/cao2021_csv_' + nowstr + '.csv'

In [15]:
# Keep track of how many courses we process
no_lines = 0

# Open the csv file for writing
with open(path, 'w') as f:
    # Write a header row
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')
    # Loop through the lines of the response content
    for line in resp.iter_lines():
        #Decode the line, using the wrong encoding
        dline = line.decode('cp1252')
        # Match only the lines we want - the ones representing courses.
        if re_course.fullmatch(dline):
            #Add one to the lines counter
            no_lines = no_lines + 1
            #The course code
            course_code = dline[:5]
            #The course Title
            course_title = dline[7:57].strip()
            # Round one points
            course_points = re.split(' +', dline[60:])
            if len(course_points) != 2:
                course_points = course_points[:2]
            #join the fields using a comma
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
           
            #Rejoin the substrings with commas inbetween
            f.write(','.join(linesplit) + '\n')
       
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


#### NB: It was verified as of 03/11/2021 that there were 949 courses exactly in the CA 2021 points list

In [19]:
df2021 = pd.read_csv(path, encoding='cp1252')

In [20]:
df2021

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,
...,...,...,...,...
917,WD211,Creative Computing,270,
918,WD212,Recreation and Sport Management,262,
919,WD230,Mechanical and Manufacturing Engineering,230,230
920,WD231,Early Childhood Care and Education,266,


<br>

## 2020 points

http://www.cao.ie/index.php?page=points&p=2020

***

In [21]:
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

### Save Original File

In [23]:
# Create a file path for the original data
pathxlsx = 'data/cao2020_' + nowstr + '.xlsx'

In [24]:
urlrq.urlretrieve(url2020, pathxlsx)

('data/cao2020_211106_101614.xlsx', <http.client.HTTPMessage at 0x1ed4e424c10>)

<br>

#### Load Spreadsheet using pandas

***

In [31]:
# Download and parse the Excel spreadsheet
df = pd.read_excel(url2020, skiprows=10)

In [32]:
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [30]:
# spot check random row
df.iloc[753]

ADMISSION DATA 2020    Engineering and engineering trades
Unnamed: 1                         Electrical Engineering
Unnamed: 2                                          LC271
Unnamed: 3                                            261
Unnamed: 4                                            NaN
Unnamed: 5                                            NaN
Unnamed: 6                                            NaN
Unnamed: 7                                            261
Unnamed: 8                                            NaN
Unnamed: 9                                            348
Unnamed: 10                                             7
Unnamed: 11              Limerick Institute of Technology
Unnamed: 12                                           NaN
Unnamed: 13                                           NaN
Unnamed: 14                                           NaN
Unnamed: 15                                           NaN
Unnamed: 16                                           NaN
Unnamed: 17   

In [33]:
# spot check last row
df.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [34]:
# Create a file path for the panda data.
path 2020 = 'data/cao2020_' + nowstr + '.csv'

SyntaxError: invalid syntax (<ipython-input-34-11bfc6568df9>, line 2)

In [None]:
# Save pandas data frame to disk
