# Fundamentals of Data Analysis 2021
---
### Sam Tracey
### December 2021
### Analysis of CAO points 2019 / 2020 / 2021
---

In [1]:
# Import Necessary Python Libraries.

# Regular Expressions
import re
# Convenient HTTP Requests
import requests as rq
import csv
import pandas as pd
# Efficient working with datetimes
import datetime as dt

<br>

# Import CAO 2021 Points

Reference: http://www.cao.ie/index.php?page=points&p=2021

***

In [2]:
# Retrieve the CAO points page.
# The timeout stops the notebook hanging indefinitely on an unresponsive
# server, and raise_for_status() fails fast on an HTTP error instead of
# letting an error page be saved and parsed as if it were the points data.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
resp.raise_for_status()

<br>

## Save Original CAO 2021 Data Set.

***


In [3]:
# strftime layout for the timestamp embedded in saved file names:
# year, month, day, an underscore, then hour, minute, second.
TIMESTAMP_FORMAT = '%Y%m%d_%H%M%S'

# Capture the moment of this run once.
now = dt.datetime.now()

# Render that moment as text.
nowstr = now.strftime(TIMESTAMP_FORMAT)

In [4]:
# File path under which the untouched 2021 download is stored;
# the run timestamp sits between the fixed prefix and the extension.
path = ''.join(['data/cao2021_', nowstr, '.html'])

In [5]:
# The encoding reported for this response is considered wrong by the author,
# so it is overridden before resp.text is read.
# Keep a copy of the value that was reported before overriding it.
original_encoding = resp.encoding
# From here on, resp.text decodes the body as Windows code page 1252.
resp.encoding = 'cp1252'

In [6]:
# Save the original HTML page.
# The encoding is given explicitly: without it open() falls back to the
# platform default, which differs between machines and can raise
# UnicodeEncodeError for characters that default cannot represent.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

## Use Regular Expressions to Select Correct Lines

***

In [7]:
# Pattern for one course line:
#   group 1 - two capital letters followed by three digits, then two spaces
#   group 2 - any text, matched lazily
#   group 3 - a run of three to five digits
#   group 4 - an optional asterisk
# followed by any number of trailing spaces.
COURSE_LINE_PATTERN = r'([A-Z]{2}[0-9]{3})  (.*?)([0-9]{3,5})(\*?) *'
re_course = re.compile(COURSE_LINE_PATTERN)


<br>

## Loop Through the Lines of the Response and Write to .csv file

***
 

In [8]:
# Destination for the extracted 2021 course lines.
path = 'data/cao2021_csv_' + nowstr + '.csv'
# Write one comma-separated line per course.
with open(path, 'w', encoding='utf-8') as file:
    for raw_line in resp.iter_lines():
        # Each line arrives as bytes; decode it once as cp1252.
        text = raw_line.decode('cp1252')
        # Skip anything that is not, in its entirety, a course line.
        if not re_course.fullmatch(text):
            continue
        # Put a comma between the captured groups. sub() is kept (rather than
        # reading the fullmatch groups) because it matches by searching and can
        # therefore split the line differently from fullmatch.
        delimited = re_course.sub(r'\1, \2, \3, \4', text)
        # Squeeze whitespace runs to single spaces, then drop '#' and '*'.
        csv_ver = re.sub('[#*]', '', ' '.join(delimited.split()))
        file.write(csv_ver + '\n')
            

<br>

## Reading 2020 CAO Points From Messy Excel File


Reference: http://www.cao.ie/index.php?page=points&p=2020&bb=points
***

In [9]:
# Where the untouched 2020 Excel download will be stored;
# the run timestamp sits between the fixed prefix and the extension.
path = ''.join(['data/cao2020_xlsx_', nowstr, '.xlsx'])

In [10]:
# Address of the Excel workbook from which the CAO 2020 points are read
Cao2020_Url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

In [11]:
# Save an original version of the 2020 CAO Excel file directly from the URL.
# reference https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
# The timeout avoids hanging on an unresponsive server; raise_for_status()
# stops an HTTP error page from being written to disk as if it were the workbook.
resp = rq.get(Cao2020_Url, timeout=30)
resp.raise_for_status()
# Open the file exactly once, inside the with-statement, so the handle is
# always closed. A second bare open() on the same path would stay open.
with open(path, 'wb') as output:
    output.write(resp.content)

In [12]:
# Read the 2020 CAO points from the .xlsx URL into a DataFrame.
df = pd.read_excel(
    Cao2020_Url,
    sheet_name='PointsCharts2020_V2',
    # An integer skips that many rows from the top of the sheet (rows 0-9).
    skiprows=10,
    # Only spreadsheet columns A through O are loaded.
    usecols="A:O",
    index_col=None,
)


In [13]:
# Write the DataFrame to its own .csv file.
# `path` still holds the location of the saved .xlsx download at this point;
# reusing it would overwrite that original Excel file with CSV text, so a
# dedicated .csv path is built first.
path = 'data/cao2020_csv_' + nowstr + '.csv'
df.to_csv(path, encoding='utf-8', index=False)

In [14]:
# Display the DataFrame as the cell output; the rendered output below
# abbreviates the middle rows with "...".
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,7,Waterford Institute of Technology,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,8,Waterford Institute of Technology,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,8,Waterford Institute of Technology,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,8,Waterford Institute of Technology,,,


In [15]:
# Spot check that DataFrame row 765 corresponds to Excel row 777.
# Offset of 12 = 10 skipped rows + 1 header row + 1 because iloc counts from 0
# (presumably Excel row numbers start at 1 - confirm against the workbook).
df.iloc[765]

CATEGORY (i.e.ISCED description)                                Arts
COURSE TITLE                                         Interior Design
COURSE CODE2                                                   LC340
R1 POINTS                                                        350
R1 Random *                                                      NaN
R2 POINTS                                                        NaN
R2 Random*                                                       NaN
EOS                                                              350
EOS Random *                                                     NaN
EOS Mid-point                                                    391
LEVEL                                                              8
HEI                                 Limerick Institute of Technology
Test/Interview #                                                 NaN
avp                                                              NaN
v                                 

## References

[1:Real-Python_REGEX](https://realpython.com/python-web-scraping-practical-introduction/)

[2:StackOverFlow-Iter_lines](https://stackoverflow.com/questions/16870648/python-read-website-data-line-by-line-when-available)

[3:REGEX_Syntax](https://docs.python.org/3/library/re.html)

[4:StackOverFlow-utf-8](https://stackoverflow.com/questions/13110629/decoding-utf-8-strings-in-python)

[5:Understanding_ISO-8859-1](https://mincong.io/2019/04/07/understanding-iso-8859-1-and-utf-8/)
