# CAO Points Notebook 

*** 

The first section of this notebook loads the CAO points for the years 2021, 2020 and 2019, and gives a clear, concise overview of how to do so.

***

## Loading the data

In [1]:
# Standard library
import csv
import datetime as dt
# Importing regular expressions
import re
import urllib.request as urlrq

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
# Convenient HTTP requests
import requests as rq


In [2]:
# Capture the moment the notebook is run.
now = dt.datetime.now()

# Render it as a compact YYYYMMDD_HHMMSS stamp; it is appended to the
# names of the files saved below.
nowstr = f'{now:%Y%m%d_%H%M%S}'

## 2021 Data 

In [3]:
# Use 'requests' to pull the 2021 points page.
# The timeout stops the cell from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Stop here on an HTTP error status rather than saving and parsing an error page.
resp.raise_for_status()

# Display the response object as a quick check.
resp

<Response [200]>

### Saving the original data set 

***

In [4]:
# Timestamped file name for the untouched 2021 page.
path = f'data/original_cao2021_{nowstr}.html'

In [5]:
# The server reports the wrong encoding. Remember what it reported, then
# switch to cp1252 so that resp.text decodes correctly from here on.
original_encoding = resp.encoding
resp.encoding = 'cp1252'

# Save the original file as the raw bytes that were received. Writing text
# without an explicit encoding would re-encode the page using the platform
# default, so the saved "original" would differ between machines (and could
# fail on characters the default encoding cannot represent).
with open(path, 'wb') as f:
    f.write(resp.content)

### Using regular expressions

In [6]:
# Pre-compile the pattern used to recognise course lines, so the loop that
# uses it does not have to re-parse the pattern for every line.
# Adjacent raw-string literals are concatenated into one pattern.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: two capital letters followed by three digits
    r'  '                  # exactly two spaces
    r'(.*)'                # group 2: everything up to the final three digits
    r'([0-9]{3})'          # group 3: three digits
    r'(\*?)'               # group 4: optional asterisk
    r' *'                  # optional trailing spaces
)

In [7]:
path = 'data/cao2021_' +  nowstr + '.csv'

# Number of lines recognised as course lines.
no_lines = 0

# newline='' lets the csv module control the line endings, and the file is
# written with the same cp1252 codec the lines are decoded with, rather than
# whatever the platform default happens to be.
with open(path, 'w', newline='', encoding='cp1252') as f:
    # csv.writer quotes any field that itself contains a comma, so such
    # fields do not break the column layout.
    writer = csv.writer(f)

    # Loop through the lines of the response content.
    for line in resp.iter_lines():
        dline = line.decode('cp1252')

        # Keep only the lines that match the course pattern in full.
        match = re_course.fullmatch(dline)
        if match is None:
            continue

        no_lines = no_lines + 1

        # Write the four captured groups as one CSV row. The second group is
        # greedy and captures the padding before the final digits, so the
        # surrounding whitespace is stripped from every field.
        writer.writerow([field.strip() for field in match.groups()])

print(f"The total number of lines is {no_lines}.")

The total number of lines is 922.


<br>

## 2020 Points

### Saving original file

In [8]:
# Timestamped file name for the untouched 2020 spreadsheet.
path = f'data/original_cao2020_{nowstr}.xlsx'

In [9]:
# Download the 2020 spreadsheet straight to disk; the returned
# (filename, headers) tuple is displayed as the cell output.
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('data/original_cao2020_20211122_121117.xlsx',
 <http.client.HTTPMessage at 0x221a0c943a0>)

In [10]:
# Download the 2020 Excel spreadsheet and parse it into a DataFrame.
# The first 10 rows of the sheet are skipped.
# NOTE(review): this fetches the workbook from the server again instead of
# reading the copy saved above - confirm that is intended.
df = pd.read_excel(
    io='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    skiprows=10,
)

In [11]:
# Display the parsed 2020 data; the notebook renders a truncated preview.
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [12]:
# Spot check: show the row at position 754 so it can be compared with the source dataset.
df.iloc[754] 

CATEGORY (i.e.ISCED description)    Engineering and engineering trades
COURSE TITLE                                    Mechanical Engineering
COURSE CODE2                                                     LC288
R1 POINTS                                                          347
R1 Random *                                                        NaN
R2 POINTS                                                          346
R2 Random*                                                         NaN
EOS                                                                346
EOS Random *                                                       NaN
EOS Mid-point                                                      415
LEVEL                                                                8
HEI                                   Limerick Institute of Technology
Test/Interview #                                                   NaN
avp                                                                NaN
v     

In [13]:
# Spot check: show the row at position 1463 (presumably the last row - confirm against the spreadsheet).
df.iloc[1463]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [14]:
# Timestamped file name for the CSV copy of the 2020 DataFrame.
path = f'data/original_cao2020_{nowstr}.csv'

In [15]:
# Save the 2020 DataFrame to disk as CSV at the path built above.
df.to_csv(path_or_buf=path)