# Fundamentals of Data Analysis 2021
---
### Sam Tracey
### December 2021
### Analysis of CAO points 2019 / 2020 / 2021
---

In [1]:
# Regular Expressions
import re
# Convenient HTTP Requests
import requests as rq
import csv
import pandas as pd
# Efficient working with datetimes
import datetime as dt

In [2]:
# Retrieve the CAO points page.
# A timeout is supplied because requests waits indefinitely by default, which
# would hang the notebook if the server never answers. raise_for_status()
# stops the run on an HTTP error (4xx/5xx) instead of letting an error page be
# saved and parsed as though it were course data.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
resp.raise_for_status()

<br>

## Save Original CAO 2021 Data Set.

***


In [3]:
# Take one timestamp for this run so every output file shares the same suffix.
now = dt.datetime.now()

# Render the timestamp as YYYYMMDD_HHMMSS text for use in file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [4]:
# Timestamped location for the untouched copy of the downloaded HTML.
path = f'data/cao2021_{nowstr}.html'

In [5]:
# The server reports an incorrect encoding for this page, so it is overridden.
# Keep a record of the encoding that was originally set on the response
# before replacing it.
original_encoding = resp.encoding
# requests decodes resp.text using resp.encoding, so from here on the page
# text is decoded as CP1252.
resp.encoding = 'cp1252'

In [6]:
# Save the text of the original HTML page.
# The encoding is stated explicitly: without it open() uses the platform's
# locale encoding, so the bytes written would differ from machine to machine
# and any character not representable in that encoding would raise
# UnicodeEncodeError. UTF-8 matches the encoding used for the CSV files
# written further down the notebook.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

## Use Regular Expressions to Select Correct Lines

***

In [7]:
# Pattern for one course line of the points listing. In order it matches:
#   group 1 - two capital letters followed by three digits,
#             then exactly two spaces,
#   group 2 - any text, matched lazily,
#   group 3 - a run of three to five digits,
#   group 4 - an optional asterisk,
# followed by any number of trailing spaces.
course_line_pattern = r'([A-Z]{2}[0-9]{3})  (.*?)([0-9]{3,5})(\*?) *'
re_course = re.compile(course_line_pattern)


<br>

## Loop Through the Lines of the Response and Write to .csv file

***
 

In [8]:
# Timestamped destination for the extracted 2021 course lines.
path = f'data/cao2021_csv_{nowstr}.csv'

with open(path, 'w', encoding='utf-8') as file:
    for raw_line in resp.iter_lines():
        # iter_lines() yields bytes; decode each line once as CP1252, the
        # same encoding that was set on the response earlier.
        text = raw_line.decode('cp1252')

        # Keep only lines that match the course pattern from start to end.
        if not re_course.fullmatch(text):
            continue

        # Put ", " between the captured groups.
        # NOTE(review): sub() does not have to reach the end of the line, so
        # its lazy match can split a line differently from fullmatch() above;
        # any text after the first match is left in place at the end.
        row = re_course.sub(r'\1, \2, \3, \4', text)
        # Collapse every run of whitespace to a single space.
        row = ' '.join(row.split())
        # Remove every '#' and '*' character from the row.
        row = re.sub('[#*]', '', row)
        # NOTE(review): fields are not quoted, so a title that itself
        # contains a comma produces extra columns - consider csv.writer.
        file.write(row + '\n')
            

In [9]:
# Location of the 2020 CAO points spreadsheet (.xlsx).
Cao2020_Url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'
# Timestamped destination for the 2020 data.
path = f'data/cao2020_csv_{nowstr}.csv'

# Load columns B, C, D and F of the points sheet, skipping its first 11 rows.
# No header row is read, so the columns are referred to by integer labels.
cao2020_raw = pd.read_excel(
    Cao2020_Url,
    sheet_name='PointsCharts2020_V2',
    skiprows=range(11),
    usecols='B,C,D,F',
    header=None,
    index_col=None,
)

# Re-order the columns to follow the layout of the 2021 csv file.
df = cao2020_raw[[2, 1, 3, 5]]
df.to_csv(path, encoding='utf-8', index=False)

In [10]:
# Display the re-ordered 2020 DataFrame (pandas abbreviates the middle rows).
df

Unnamed: 0,2,1,3,5
0,AC120,International Business,209,
1,AC137,Liberal Arts,252,
2,AD101,"First Year Art & Design (Common Entry,portfolio)",#+matric,
3,AD102,Graphic Design and Moving Image Design (portfo...,#+matric,
4,AD103,Textile & Surface Design and Jewellery & Objec...,#+matric,
...,...,...,...,...
1459,WD208,Manufacturing Engineering,188,
1460,WD210,Software Systems Development,279,
1461,WD211,Creative Computing,271,
1462,WD212,Recreation and Sport Management,270,


## References

[1:Real-Python_REGEX](https://realpython.com/python-web-scraping-practical-introduction/)

[2:StackOverFlow-Iter_lines](https://stackoverflow.com/questions/16870648/python-read-website-data-line-by-line-when-available)

[3:REGEX_Syntax](https://docs.python.org/3/library/re.html)

[4:StackOverFlow-utf-8](https://stackoverflow.com/questions/13110629/decoding-utf-8-strings-in-python)

[5:Understanding_ISO-8859-1](https://mincong.io/2019/04/07/understanding-iso-8859-1-and-utf-8/)
