In [1]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-11-11
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-11-11
@Title : Athena BOTO3

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-11-11\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-11-11\n@Title : Athena BOTO3\n\n'

In [2]:
import boto3
import time
import pandas as pd

In [3]:
import os
# Region used by every boto3 client created in this notebook.
os.environ['AWS_DEFAULT_REGION'] = 'ap-south-1'

# Copy the project-specific credential variables into the AWS_* names.
# os.environ only accepts strings, so assigning the None that os.environ.get()
# returns for an unset variable raises TypeError. Unset sources are skipped,
# leaving any AWS_* value already present in the environment untouched.
# NOTE(review): with neither variable set, boto3 presumably falls back to its
# other credential providers - confirm this is acceptable for this notebook.
for _aws_name, _source_name in (
    ('AWS_ACCESS_KEY_ID', 'test1_access_key'),
    ('AWS_SECRET_ACCESS_KEY', 'test1_secret_access_key'),
):
    _source_value = os.environ.get(_source_name)
    if _source_value is not None:
        os.environ[_aws_name] = _source_value

### Athena queries through Python

In [4]:
# Athena database targeted by the queries in this notebook.
DATABASE_NAME = "athena_db"
# S3 location passed as the OutputLocation of every started query.
RESULT_OUTPUT_LOCATION = 's3://boto3-athena-output/'
# Table (inside DATABASE_NAME) that the queries read from and write to.
TABLE_NAME = "employee"
# S3 location holding the table's input data.
INPUT_LOCATION = "s3://csv-athena-bucket/"

#### 1. CREATE TABLE

In [12]:
def connect_to_athena():
    """

    Description:
        Create the boto3 client used to talk to the AWS Athena service.
    Parameter:
        No parameters
    Return:
        Athena client returned by boto3.client('athena')

    """
    return boto3.client('athena')

def create_database(athena):
    """

    Description:
        Start a "create database if not exists" query for DATABASE_NAME.
    Parameter:
        athena: Athena client exposing start_query_execution
    Return:
        string QueryExecutionId of the started query

    """
    ddl = f"create database if not exists {DATABASE_NAME}"
    output_config = {"OutputLocation": RESULT_OUTPUT_LOCATION}

    started = athena.start_query_execution(
        QueryString=ddl,
        ResultConfiguration=output_config,
    )
    return started["QueryExecutionId"]

def has_query_succeeded(athena,execution_id,max_execution=5,poll_interval=30):
    """

    Description:
        To poll the status of a query until it succeeds, reaches another
        final state, or the maximum number of polls is used up.
    Parameter:
        athena: Athena client exposing get_query_execution,
        string execution_id,
        int max_execution: maximum number of status polls (default 5),
        number poll_interval: seconds to wait between two polls (default 30)
    Return:
        bool: True if the query reached SUCCEEDED, False otherwise
        (any other final state, or still RUNNING/QUEUED after the last poll)

    """
    for attempt in range(max_execution):
        response = athena.get_query_execution(QueryExecutionId=execution_id)
        state = response.get("QueryExecution", {}).get("Status", {}).get("State")

        if state == "SUCCEEDED":
            return True

        # Any reported state other than RUNNING/QUEUED is final, so waiting
        # any longer cannot change the outcome.
        if state is not None and state not in ("RUNNING", "QUEUED"):
            return False

        # Wait only when another poll will actually follow.
        if attempt < max_execution - 1:
            time.sleep(poll_interval)

    return False

def create_table(athena):
    """

    Description:
        Start a CREATE EXTERNAL TABLE IF NOT EXISTS query for the table
        DATABASE_NAME.TABLE_NAME over the comma-delimited text files stored
        at INPUT_LOCATION.
    Parameter:
        athena: Athena client exposing start_query_execution
    Return:
        string QueryExecutionId of the started query

    """
    # Build the DDL from the shared constants so it cannot drift from the
    # database/table that the other queries in this notebook target.
    query = f"""CREATE EXTERNAL TABLE IF NOT EXISTS `{DATABASE_NAME}`.`{TABLE_NAME}` (
            `EmpID` string,
            `FirstName` string,
            `LastName` string,
            `Role` string,
            `Dept` string,
            `MobNo` bigint
            )
            ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
            WITH SERDEPROPERTIES ('field.delim' = ',')
            STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
            LOCATION '{INPUT_LOCATION}'
            TBLPROPERTIES ('classification' = 'csv');"""

    response = athena.start_query_execution(
        QueryString=query,
        ResultConfiguration={"OutputLocation": RESULT_OUTPUT_LOCATION}
    )

    return response["QueryExecutionId"]



def get_all_rows(athena):
    """

    Description:
        Start a query selecting every column of DATABASE_NAME.TABLE_NAME.
    Parameter:
        athena: Athena client exposing start_query_execution
    Return:
        string QueryExecutionId of the started query

    """
    select_all = f"SELECT * from {DATABASE_NAME}.{TABLE_NAME}"
    output_config = {"OutputLocation": RESULT_OUTPUT_LOCATION}

    started = athena.start_query_execution(
        QueryString=select_all,
        ResultConfiguration=output_config,
    )
    return started["QueryExecutionId"]

def get_query_results(athena,execution_id):
    """

    Description:
        To fetch all result rows of a query, following 'NextToken' so that
        results spanning several response pages are returned completely.
    Parameter:
        athena: Athena client exposing get_query_results,
        string execution_id
    Return:
        list results: the 'Rows' entries of every page, in the order received

    """
    results = []
    request = {"QueryExecutionId": execution_id}

    while True:
        response = athena.get_query_results(**request)
        results.extend(response['ResultSet']['Rows'])

        # A missing/empty NextToken means this was the last page.
        next_token = response.get('NextToken')
        if not next_token:
            return results
        request['NextToken'] = next_token

def query_to_df(query_result):
    """

    Description:
        To extract the records from 'query_result' and display them as a
        Pandas DataFrame. The first row supplies the column names, the
        remaining rows supply the data.
    Parameter:
        list query_result: rows shaped like {'Data': [{'VarCharValue': ...}, ...]}
    Return:
        DataFrame that was printed (an empty DataFrame when 'query_result'
        is empty)

    """
    if not query_result:
        print("No records exist in table or table is dropped.")
        return pd.DataFrame()

    def _cell_values(row):
        # A cell without 'VarCharValue' becomes None instead of being skipped,
        # so the values that follow it stay under their own column.
        return [cell.get('VarCharValue') for cell in row.get('Data', [])]

    header = _cell_values(query_result[0])
    data = [_cell_values(row) for row in query_result[1:]]

    # Create the pandas DataFrame
    df = pd.DataFrame(data, columns=header)
    # Rows containing any missing value are dropped before display.
    df = df.dropna()
    # print dataframe.
    print(df)
    return df

def main():
    """

    Description:
        Create the database and the table, then query the table and display
        all of its rows as a Pandas DataFrame.
    Parameter:
        No parameters
    Return:
        No values returned.

    """

    athena = connect_to_athena()

    # 1. Create the database if it does not already exist
    execution_id = create_database(athena)
    print(f"Checking query execution for: {execution_id}")

    # 2. Check query execution
    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # 3. Create Table
    execution_id = create_table(athena)
    print(f"Create Table execution id: {execution_id}")

    # 4. Check query execution
    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # 5. Query Athena table
    execution_id = get_all_rows(athena)
    print(f"Get all Rows execution id: {execution_id}")

    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # Only a query that succeeded has results worth fetching; stop with a
    # clear message instead of requesting results for a failed/unfinished query.
    if not query_status:
        print(f"Query {execution_id} did not succeed; no results to display.")
        return

    # 6. Query Results
    query_result = get_query_results(athena,execution_id=execution_id)

    # 7. Print query result as a pandas dataframe
    print("Reading all records of the table:")
    query_to_df(query_result)


if __name__ == "__main__":
    main()


Checking query execution for: 550eb233-80e5-4f36-9fa3-566af323a32f
Query state: True
Create Table execution id: 748a4d03-6122-4157-8108-52a5473895b8
Query state: True
Get all Rows execution id: 33e2e4e6-5f36-4c38-a0c0-0bb640536612
Query state: True
Reading all records of the table:
    empid firstname lastname                  role             dept  \
0  EMP006     Priya     Nair                    HR  Human Resources   
1  EMP007    Rachel     Zane            Accountant          Finance   
2  EMP001    George    Smith         Data Engineer               IT   
3  EMP002     Rahul    Kumar    FrontEnd Developer               IT   
4  EMP003    Priyal    Nayak        Sales Engineer        Marketing   
5  EMP004     Rahul    Kumar  Business Development              CEO   
6  EMP005  Vighnesh    Bilgi            BI Analyst        Marketing   

        mobno  
0  7711992233  
1  7766554433  
2  9911223344  
3  8220858443  
4  9876543210  
5  9988776655  
6  9049480396  


#### 2. READ TABLE

In [10]:
def connect_to_athena():
    """

    Description:
        Create the boto3 client used to talk to the AWS Athena service.
    Parameter:
        No parameters
    Return:
        Athena client returned by boto3.client('athena')

    """
    return boto3.client('athena')
  

def has_query_succeeded(athena,execution_id,max_execution=5,poll_interval=30):
    """

    Description:
        To poll the status of a query until it succeeds, reaches another
        final state, or the maximum number of polls is used up.
    Parameter:
        athena: Athena client exposing get_query_execution,
        string execution_id,
        int max_execution: maximum number of status polls (default 5),
        number poll_interval: seconds to wait between two polls (default 30)
    Return:
        bool: True if the query reached SUCCEEDED, False otherwise
        (any other final state, or still RUNNING/QUEUED after the last poll)

    """
    for attempt in range(max_execution):
        response = athena.get_query_execution(QueryExecutionId=execution_id)
        state = response.get("QueryExecution", {}).get("Status", {}).get("State")

        if state == "SUCCEEDED":
            return True

        # Any reported state other than RUNNING/QUEUED is final, so waiting
        # any longer cannot change the outcome.
        if state is not None and state not in ("RUNNING", "QUEUED"):
            return False

        # Wait only when another poll will actually follow.
        if attempt < max_execution - 1:
            time.sleep(poll_interval)

    return False


def run_query(athena,query):
    """

    Description:
        Start the given query; its result is stored in the output bucket
        location RESULT_OUTPUT_LOCATION.
    Parameter:
        athena: Athena client exposing start_query_execution,
        string query
    Return:
        string QueryExecutionId of the started query

    """
    output_config = {"OutputLocation": RESULT_OUTPUT_LOCATION}

    started = athena.start_query_execution(
        QueryString=query,
        ResultConfiguration=output_config,
    )
    return started["QueryExecutionId"]

def get_query_results(athena,execution_id):
    """

    Description:
        To fetch the records of the query as a list of dictionaries with each
        dictionary being one record including header. 'NextToken' is followed
        so that results spanning several response pages are returned completely.
    Parameter:
        athena: Athena client exposing get_query_results,
        string execution_id
    Return:
        list results: the 'Rows' entries of every page, in the order received

    """
    results = []
    request = {"QueryExecutionId": execution_id}

    while True:
        response = athena.get_query_results(**request)
        results.extend(response['ResultSet']['Rows'])

        # A missing/empty NextToken means this was the last page.
        next_token = response.get('NextToken')
        if not next_token:
            return results
        request['NextToken'] = next_token

def query_to_df(query_result):
    """

    Description:
        To extract the records from 'query_result' and display them as a
        Pandas DataFrame. The first row supplies the column names, the
        remaining rows supply the data.
    Parameter:
        list query_result: rows shaped like {'Data': [{'VarCharValue': ...}, ...]}
    Return:
        DataFrame that was printed (an empty DataFrame when 'query_result'
        is empty)

    """
    if not query_result:
        print("No records exist in table or table is dropped.")
        return pd.DataFrame()

    def _cell_values(row):
        # A cell without 'VarCharValue' becomes None instead of being skipped,
        # so the values that follow it stay under their own column.
        return [cell.get('VarCharValue') for cell in row.get('Data', [])]

    header = _cell_values(query_result[0])
    data = [_cell_values(row) for row in query_result[1:]]

    # Create the pandas DataFrame
    df = pd.DataFrame(data, columns=header)
    # Rows containing any missing value are dropped before display.
    df = df.dropna()
    # print dataframe.
    print(df)
    return df

def _run_and_display(athena, query):
    """Run one query, report its id and state, and display its rows if it succeeded."""
    execution_id = run_query(athena,query)
    print(f"Get query execution id: {execution_id}")

    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # Only a query that succeeded has results worth fetching; stop with a
    # clear message instead of requesting results for a failed/unfinished query.
    if not query_status:
        print(f"Query {execution_id} did not succeed; no results to display.")
        return

    query_result = get_query_results(athena,execution_id=execution_id)
    query_to_df(query_result)


def main():
    """

    Description:
        Run two read queries against the table (a column selection and a
        count of Marketing employees) and display each result.
    Parameter:
        No parameters
    Return:
        No values returned.

    """

    athena = connect_to_athena()

    # Selecting FirstName, LastName and Mobile Number columns from the table
    print("Selecting FirstName, LastName and Mobile Number columns from the table")
    query = f"SELECT firstname, lastname, mobno from {DATABASE_NAME}.{TABLE_NAME}"
    _run_and_display(athena, query)

    # Counting number of employees in Marketing Department from the table
    print("Counting number of employees in Marketing Department from the table")
    query = f"SELECT count(dept) as NumberOfMarketingEMployees from {DATABASE_NAME}.{TABLE_NAME} where dept = 'Marketing'"
    _run_and_display(athena, query)


if __name__ == "__main__":
    main()

Selecting FirstName, LastName and Mobile Number columns from the table
Get query execution id: cd258455-c807-4872-8b68-33d6b578ece4
Query state: True
  firstname lastname       mobno
0    George    Smith  9911223344
1     Rahul    Kumar  8220858443
2    Priyal    Nayak  9876543210
3     Rahul    Kumar  9988776655
4  Vighnesh    Bilgi  9049480396
Counting number of employees in Marketing Department from the table
Get query execution id: 29a7e32c-6603-4931-ae71-5b56b108dcf9
Query state: True
  NumberOfMarketingEMployees
0                          2


#### 3. UPDATE TABLE

In [11]:
def connect_to_athena():
    """

    Description:
        Create the boto3 client used to talk to the AWS Athena service.
    Parameter:
        No parameters
    Return:
        Athena client returned by boto3.client('athena')

    """
    return boto3.client('athena')
  

def has_query_succeeded(athena,execution_id,max_execution=5,poll_interval=30):
    """

    Description:
        To poll the status of a query until it succeeds, reaches another
        final state, or the maximum number of polls is used up.
    Parameter:
        athena: Athena client exposing get_query_execution,
        string execution_id,
        int max_execution: maximum number of status polls (default 5),
        number poll_interval: seconds to wait between two polls (default 30)
    Return:
        bool: True if the query reached SUCCEEDED, False otherwise
        (any other final state, or still RUNNING/QUEUED after the last poll)

    """
    for attempt in range(max_execution):
        response = athena.get_query_execution(QueryExecutionId=execution_id)
        state = response.get("QueryExecution", {}).get("Status", {}).get("State")

        if state == "SUCCEEDED":
            return True

        # Any reported state other than RUNNING/QUEUED is final, so waiting
        # any longer cannot change the outcome.
        if state is not None and state not in ("RUNNING", "QUEUED"):
            return False

        # Wait only when another poll will actually follow.
        if attempt < max_execution - 1:
            time.sleep(poll_interval)

    return False


def run_query(athena,query):
    """

    Description:
        Start the given query; its result is stored in the output bucket
        location RESULT_OUTPUT_LOCATION.
    Parameter:
        athena: Athena client exposing start_query_execution,
        string query
    Return:
        string QueryExecutionId of the started query

    """
    output_config = {"OutputLocation": RESULT_OUTPUT_LOCATION}

    started = athena.start_query_execution(
        QueryString=query,
        ResultConfiguration=output_config,
    )
    return started["QueryExecutionId"]

def get_query_results(athena,execution_id):
    """

    Description:
        To fetch the records of the query as a list of dictionaries with each
        dictionary being one record including header. 'NextToken' is followed
        so that results spanning several response pages are returned completely.
    Parameter:
        athena: Athena client exposing get_query_results,
        string execution_id
    Return:
        list results: the 'Rows' entries of every page, in the order received

    """
    results = []
    request = {"QueryExecutionId": execution_id}

    while True:
        response = athena.get_query_results(**request)
        results.extend(response['ResultSet']['Rows'])

        # A missing/empty NextToken means this was the last page.
        next_token = response.get('NextToken')
        if not next_token:
            return results
        request['NextToken'] = next_token

def query_to_df(query_result):
    """

    Description:
        To extract the records from 'query_result' and display them as a
        Pandas DataFrame. The first row supplies the column names, the
        remaining rows supply the data.
    Parameter:
        list query_result: rows shaped like {'Data': [{'VarCharValue': ...}, ...]}
    Return:
        DataFrame that was printed (an empty DataFrame when 'query_result'
        is empty)

    """
    if not query_result:
        print("No records exist in table or table is dropped.")
        return pd.DataFrame()

    def _cell_values(row):
        # A cell without 'VarCharValue' becomes None instead of being skipped,
        # so the values that follow it stay under their own column.
        return [cell.get('VarCharValue') for cell in row.get('Data', [])]

    header = _cell_values(query_result[0])
    data = [_cell_values(row) for row in query_result[1:]]

    # Create the pandas DataFrame
    df = pd.DataFrame(data, columns=header)
    # Rows containing any missing value are dropped before display.
    df = df.dropna()
    # print dataframe.
    print(df)
    return df

def main():
    """

    Description:
        Insert two employee records into the table, then select and display
        all of its rows.
    Parameter:
        No parameters
    Return:
        No values returned.

    """

    athena = connect_to_athena()

    # Inserting 2 new employee details into the records.
    print("Inserting 2 new employee details into the records")
    query = f"""INSERT INTO {DATABASE_NAME}.{TABLE_NAME} VALUES ('EMP006', 'Priya', 'Nair', 'HR', 'Human Resources', 7711992233), 
                ('EMP007', 'Rachel', 'Zane', 'Accountant', 'Finance', 7766554433);"""
    execution_id = run_query(athena,query)
    print(f"Get query execution id: {execution_id}")

    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # Read the table back to show its current contents.
    query = f"SELECT * FROM {DATABASE_NAME}.{TABLE_NAME} ;"
    execution_id = run_query(athena,query)
    print(f"Get query execution id: {execution_id}")

    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # Only a query that succeeded has results worth fetching; stop with a
    # clear message instead of requesting results for a failed/unfinished query.
    if not query_status:
        print(f"Query {execution_id} did not succeed; no results to display.")
        return

    query_result = get_query_results(athena,execution_id=execution_id)
    query_to_df(query_result)


if __name__ == "__main__":
    main()

Inserting 2 new employee details into the records
Get query execution id: 77fd7acb-e1b7-4f5b-82a1-a159c5f2a261
Query state: True
Get query execution id: 084e2eb7-9416-4e34-b9fb-aa5a43be86f0
Query state: True
    empid firstname lastname                  role             dept  \
0  EMP006     Priya     Nair                    HR  Human Resources   
1  EMP007    Rachel     Zane            Accountant          Finance   
2  EMP001    George    Smith         Data Engineer               IT   
3  EMP002     Rahul    Kumar    FrontEnd Developer               IT   
4  EMP003    Priyal    Nayak        Sales Engineer        Marketing   
5  EMP004     Rahul    Kumar  Business Development              CEO   
6  EMP005  Vighnesh    Bilgi            BI Analyst        Marketing   

        mobno  
0  7711992233  
1  7766554433  
2  9911223344  
3  8220858443  
4  9876543210  
5  9988776655  
6  9049480396  


#### 4. DELETE TABLE

In [16]:
def connect_to_athena():
    """

    Description:
        Create the boto3 client used to talk to the AWS Athena service.
    Parameter:
        No parameters
    Return:
        Athena client returned by boto3.client('athena')

    """
    return boto3.client('athena')
  

def has_query_succeeded(athena,execution_id,max_execution=5,poll_interval=30):
    """

    Description:
        To poll the status of a query until it succeeds, reaches another
        final state, or the maximum number of polls is used up.
    Parameter:
        athena: Athena client exposing get_query_execution,
        string execution_id,
        int max_execution: maximum number of status polls (default 5),
        number poll_interval: seconds to wait between two polls (default 30)
    Return:
        bool: True if the query reached SUCCEEDED, False otherwise
        (any other final state, or still RUNNING/QUEUED after the last poll)

    """
    for attempt in range(max_execution):
        response = athena.get_query_execution(QueryExecutionId=execution_id)
        state = response.get("QueryExecution", {}).get("Status", {}).get("State")

        if state == "SUCCEEDED":
            return True

        # Any reported state other than RUNNING/QUEUED is final, so waiting
        # any longer cannot change the outcome.
        if state is not None and state not in ("RUNNING", "QUEUED"):
            return False

        # Wait only when another poll will actually follow.
        if attempt < max_execution - 1:
            time.sleep(poll_interval)

    return False


def run_query(athena,query):
    """

    Description:
        Start the given query; its result is stored in the output bucket
        location RESULT_OUTPUT_LOCATION.
    Parameter:
        athena: Athena client exposing start_query_execution,
        string query
    Return:
        string QueryExecutionId of the started query

    """
    output_config = {"OutputLocation": RESULT_OUTPUT_LOCATION}

    started = athena.start_query_execution(
        QueryString=query,
        ResultConfiguration=output_config,
    )
    return started["QueryExecutionId"]

def get_query_results(athena,execution_id):
    """

    Description:
        To fetch all result rows of a query, following 'NextToken' so that
        results spanning several response pages are returned completely.
    Parameter:
        athena: Athena client exposing get_query_results,
        string execution_id
    Return:
        list results: the 'Rows' entries of every page, in the order received

    """
    results = []
    request = {"QueryExecutionId": execution_id}

    while True:
        response = athena.get_query_results(**request)
        results.extend(response['ResultSet']['Rows'])

        # A missing/empty NextToken means this was the last page.
        next_token = response.get('NextToken')
        if not next_token:
            return results
        request['NextToken'] = next_token

def query_to_df(query_result):
    """

    Description:
        To extract the records from 'query_result' and display them as a
        Pandas DataFrame. The first row supplies the column names, the
        remaining rows supply the data.
    Parameter:
        list query_result: rows shaped like {'Data': [{'VarCharValue': ...}, ...]}
    Return:
        DataFrame that was printed (an empty DataFrame when 'query_result'
        is empty)

    """

    if not query_result:
        print("No records exist in table or table is dropped.")
        return pd.DataFrame()

    def _cell_values(row):
        # A cell without 'VarCharValue' becomes None instead of being skipped,
        # so the values that follow it stay under their own column.
        return [cell.get('VarCharValue') for cell in row.get('Data', [])]

    header = _cell_values(query_result[0])
    data = [_cell_values(row) for row in query_result[1:]]

    # Create the pandas DataFrame
    df = pd.DataFrame(data, columns=header)
    # Rows containing any missing value are dropped before display.
    df = df.dropna()
    # print dataframe.
    print(df)
    return df

def main():
    """

    Description:
        Drop the table if it exists and display whatever rows the drop
        query returned.
    Parameter:
        No parameters
    Return:
        No values returned.

    """

    athena = connect_to_athena()

    # DROPPING TABLE.
    print("DROPPING TABLE")
    query = f"""DROP TABLE IF EXISTS {DATABASE_NAME}.{TABLE_NAME}"""
    execution_id = run_query(athena,query)
    print(f"Get query execution id: {execution_id}")

    query_status = has_query_succeeded(athena,execution_id=execution_id)
    print(f"Query state: {query_status}")

    # Only a query that succeeded has results worth fetching; stop with a
    # clear message instead of requesting results for a failed/unfinished query.
    if not query_status:
        print(f"Query {execution_id} did not succeed; no results to display.")
        return

    query_result = get_query_results(athena,execution_id=execution_id)
    query_to_df(query_result)


if __name__ == "__main__":
    main()

DROPPING TABLE
Get query execution id: 5c53f2f4-062b-472d-a27d-1fd47da2e886
Query state: True
No records exist in table or table is dropped.
