## 1. Load the Ryanair Reviews into AWS S3

In [2]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

### List existing S3 buckets

In [3]:
!aws s3 ls

2024-02-18 20:23:31 211125778552personalizepocvod
2024-02-18 01:27:18 aws-athena-query-results-211125778552-us-east-1
2024-04-13 17:52:38 aws-athena-query-results-us-east-1-211125778552
2024-03-27 00:35:44 aws-glue-assets-211125778552-us-east-1
2024-03-27 00:28:12 aws-glue-assets-211125778552-us-east-2
2024-02-15 21:19:44 sagemaker-studio-12jvao34qlkn
2024-02-15 22:38:05 sagemaker-studio-211125778552-3pjkfc2ijfr
2024-02-17 02:02:09 sagemaker-studio-211125778552-4dcj21sopi3
2024-02-19 03:02:29 sagemaker-studio-211125778552-4rfwbx1bibn
2024-02-15 20:23:46 sagemaker-studio-211125778552-4yhhjbuzjdq
2024-02-17 02:02:35 sagemaker-studio-211125778552-8xxlet4bnrv
2024-02-17 02:02:08 sagemaker-studio-211125778552-rfcwvtinree
2024-02-20 00:38:45 sagemaker-studio-211125778552-yu1t8p5304s
2024-03-21 19:20:47 sagemaker-studio-uyd2sz3oy3
2024-03-08 03:11:27 sagemaker-team11-stanford-dogs
2024-03-18 01:49:41 sagemaker-team6-distracted-drivers
2024-02-15 20:23:48 sagemaker-us-east-1-211125778552
2024-

### Create our own S3 bucket

In [4]:
!aws s3 mb s3://sagemaker-team4-bucket

make_bucket: sagemaker-team4-bucket


### Upload the Ryanair customer feedback dataset to the S3 bucket

In [5]:
s3_private_path = "s3://sagemaker-team4-bucket/ryanair-data/"

In [6]:
!aws s3 cp ryanair_reviews.csv s3://sagemaker-team4-bucket/ryanair-data/

upload: ./ryanair_reviews.csv to s3://sagemaker-team4-bucket/ryanair-data/ryanair_reviews.csv


### Create an Athena database to store the customer feedback

In [7]:
from pyathena import connect
import pandas as pd

In [8]:
database_name = "team4"
table_name = "ryanair_reviews"
bucket = "sagemaker-team4-bucket"

In [9]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [10]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [11]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS team4


In [12]:
pd.read_sql(statement, conn)

In [13]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sagemaker_featurestore
3,team-8-fec-db
4,team3-court-data


### Load the Ryanair data into the database

In [14]:
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        record_id int,
        date_published date,
        overall_rating int, 
        passenger_country string,
        trip_verified string,
        comment_title string,
        comment string,
        aircraft string,
        type_of_traveller string,
        seat_type string,
        origin string,  
        destination string,
        date_flown date,
        seat_comfort int,
        cabin_staff_service int,
        food_and_beverages int,
        ground_service int,
        value_for_money int,
        recommended int,    
        inflight_entertainment int, 
        wifi_and_connectivity int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\,' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(
    database_name, table_name, s3_private_path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS team4.ryanair_reviews(
        record_id int,
        date_published date,
        overall_rating int, 
        passenger_country string,
        trip_verified string,
        comment_title string,
        comment string,
        aircraft string,
        type_of_traveller string,
        seat_type string,
        origin string,  
        destination string,
        date_flown date,
        seat_comfort int,
        cabin_staff_service int,
        food_and_beverages int,
        ground_service int,
        value_for_money int,
        recommended int,    
        inflight_entertainment int, 
        wifi_and_connectivity int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-team4-bucket/ryanair-data/'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


In [15]:
pd.read_sql(statement, conn)

In [16]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,ryanair_reviews


In [17]:
statement = """SELECT * FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name
)

print(statement)

SELECT * FROM team4.ryanair_reviews
    LIMIT 100


In [18]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,record_id,date_published,overall_rating,passenger_country,trip_verified,comment_title,comment,aircraft,type_of_traveller,seat_type,...,destination,date_flown,seat_comfort,cabin_staff_service,food_and_beverages,ground_service,value_for_money,recommended,inflight_entertainment,wifi_and_connectivity
0,0,2024-02-03,10,United Kingdom,Not Verified,"""""""bang on time and smooth flights""""""",Flew back from Faro to London Luton Friday 2nd...,Boeing 737 900,Family Leisure,Economy Class,...,Luton,,4.0,5.0,3.0,4.0,4.0,,,
1,1,2024-01-26,10,United Kingdom,Trip Verified,"""""""Another good affordable flight""""""","""Another good affordable flight with Ryanair. ...",pleasant staff at check-in and on board. We u...,,Couple Leisure,...,Belfast,,,3.0,5.0,3.0,5.0,5.0,,
2,2,2024-01-20,10,United Kingdom,Trip Verified,“Really impressed!”,"""Really impressed! You get what you pay for",this flight only cost £19.99. The seats were ...,and there was tons of legroom! (not in an eme...,flies almost everywhere,...,Boeing 737-800,,,,,,5.0,5.0,4.0,5.0
3,3,2024-01-07,6,United Kingdom,Trip Verified,“a decent offering from Ryanair”,"""I should like to review my flight from Faro t...",I was able to check-in my bag within less tha...,there was no communication with the passenger...,"a decent offering from Ryanair.""",...,Solo Leisure,,,,,3.0,2.0,1.0,3.0,3.0
4,4,2024-01-06,10,Israel,Trip Verified,“cabin crew were welcoming and friendly”,"""Flight left the gate ahead of schedule",fare was really cheap and cabin crew were wel...,Boeing 737-800,Solo Leisure,...,Dublin,,,4.0,5.0,,4.0,5.0,,


### Create Parquet Files

In [19]:
table_name_parquet = "ryanair_reviews_parquet"
s3_path_parquet = "s3://{}/ryanair-data/parquet".format(bucket)

In [20]:
# SQL statement to execute
statement = """CREATE TABLE IF NOT EXISTS {}.{}
WITH (format = 'PARQUET', external_location = '{}', partitioned_by = ARRAY['year']) AS
SELECT date_published,
        record_id,
        overall_rating, 
        passenger_country,
        trip_verified,
        comment_title,
        comment,
        aircraft,
        type_of_traveller,
        seat_type,
        origin,  
        destination,
        seat_comfort,
        cabin_staff_service,
        food_and_beverages,
        ground_service,
        value_for_money,
        recommended,    
        inflight_entertainment, 
        wifi_and_connectivity,
        DATE(date_flown) AS date_flown,
        CAST(YEAR(DATE(date_flown)) AS INTEGER) AS year
FROM {}.{}""".format(
    database_name, table_name_parquet, s3_path_parquet, database_name, table_name
)

print(statement)

CREATE TABLE IF NOT EXISTS team4.ryanair_reviews_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-team4-bucket/ryanair-data/parquet', partitioned_by = ARRAY['year']) AS
SELECT date_published,
        record_id,
        overall_rating, 
        passenger_country,
        trip_verified,
        comment_title,
        comment,
        aircraft,
        type_of_traveller,
        seat_type,
        origin,  
        destination,
        seat_comfort,
        cabin_staff_service,
        food_and_beverages,
        ground_service,
        value_for_money,
        recommended,    
        inflight_entertainment, 
        wifi_and_connectivity,
        DATE(date_flown) AS date_flown,
        CAST(YEAR(DATE(date_flown)) AS INTEGER) AS year
FROM team4.ryanair_reviews


In [21]:
pd.read_sql(statement, conn)

Unnamed: 0,rows


### Load the parquet partitions

In [22]:
statement = "MSCK REPAIR TABLE {}.{}".format(database_name, table_name_parquet)

print(statement)

MSCK REPAIR TABLE team4.ryanair_reviews_parquet


### Show the parquet partitions

In [23]:
statement = "SHOW PARTITIONS {}.{}".format(database_name, table_name_parquet)

print(statement)

SHOW PARTITIONS team4.ryanair_reviews_parquet


In [24]:
df_partitions = pd.read_sql(statement, conn)
df_partitions.head(5)

Unnamed: 0,partition
0,year=__HIVE_DEFAULT_PARTITION__


In [25]:
statement = """SELECT * FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name_parquet
)

print(statement)

SELECT * FROM team4.ryanair_reviews_parquet
    LIMIT 100


In [26]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,date_published,record_id,overall_rating,passenger_country,trip_verified,comment_title,comment,aircraft,type_of_traveller,seat_type,...,seat_comfort,cabin_staff_service,food_and_beverages,ground_service,value_for_money,recommended,inflight_entertainment,wifi_and_connectivity,date_flown,year
0,2024-02-03,0,10,United Kingdom,Not Verified,"""""""bang on time and smooth flights""""""",Flew back from Faro to London Luton Friday 2nd...,Boeing 737 900,Family Leisure,Economy Class,...,4.0,5.0,3.0,4.0,4.0,,,,,
1,2024-01-26,1,10,United Kingdom,Trip Verified,"""""""Another good affordable flight""""""","""Another good affordable flight with Ryanair. ...",pleasant staff at check-in and on board. We u...,,Couple Leisure,...,,3.0,5.0,3.0,5.0,5.0,,,,
2,2024-01-20,2,10,United Kingdom,Trip Verified,“Really impressed!”,"""Really impressed! You get what you pay for",this flight only cost £19.99. The seats were ...,and there was tons of legroom! (not in an eme...,flies almost everywhere,...,,,,,5.0,5.0,4.0,5.0,,
3,2024-01-07,3,6,United Kingdom,Trip Verified,“a decent offering from Ryanair”,"""I should like to review my flight from Faro t...",I was able to check-in my bag within less tha...,there was no communication with the passenger...,"a decent offering from Ryanair.""",...,,,,3.0,2.0,1.0,3.0,3.0,,
4,2024-01-06,4,10,Israel,Trip Verified,“cabin crew were welcoming and friendly”,"""Flight left the gate ahead of schedule",fare was really cheap and cabin crew were wel...,Boeing 737-800,Solo Leisure,...,,4.0,5.0,,4.0,5.0,,,,


In [27]:
statement = """SELECT RECOMMENDED FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name_parquet
)

print(statement)
pd.read_sql(statement, conn)

SELECT RECOMMENDED FROM team4.ryanair_reviews_parquet
    LIMIT 100


Unnamed: 0,RECOMMENDED
0,
1,5.0
2,5.0
3,1.0
4,5.0
...,...
95,5.0
96,
97,
98,
