# ETL Pipeline for Pterodactyl Application

### Index

- Install requirements
- Import libraries and setup key variables
- Setup directories, functions and folder creation
- Get Pterodactyl Application information
- Upload csv table files into Postgres

## Install requirements

In [None]:
pip install -r requirements.txt

## Import libraries and setup key variables
Remember to add your own credentials to the .env file so they can be loaded here.

In [1]:
import datetime, csv, os
from sqlalchemy import create_engine, text
from pydactyl import PterodactylClient
from dotenv import load_dotenv
import pandas as pd

# Load .env file credentials
load_dotenv()

# Database connection
from urllib.parse import quote

host = os.getenv('POSTGRES_HOST')
port = os.getenv('POSTGRES_PORT')
database = os.getenv('POSTGRES_DATABASE')
username = os.getenv('POSTGRES_USERNAME')
password = os.getenv('POSTGRES_PASSWORD')

# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':' or '/' in a username or password would otherwise be read
# as URL delimiters and corrupt the connection string. Unset variables are
# left untouched so a missing credential still surfaces when connecting.
url_username = quote(username, safe='') if username else username
url_password = quote(password, safe='') if password else password
connection = f'postgresql://{url_username}:{url_password}@{host}:{port}/{database}'

# Pterodactyl panel URL and API keys, read from the environment
pterodactyl_url = os.environ.get('PTERODACTYL_URL')
application_api_key = os.environ.get('PTERODACTYL_APP_KEY')
client_api_key = os.environ.get('PTERODACTYL_CLI_KEY')

# Connect to the Pterodactyl Application API (api_app) and to the
# Pterodactyl Client API (api_cli); both use the same panel URL and differ
# only in the key they are given.
api_app, api_cli = (
    PterodactylClient(pterodactyl_url, api_key, debug=False)
    for api_key in (application_api_key, client_api_key)
)

## Setup directories, functions and folder creation

In [4]:
# Setup directories
# The current working directory is used as the base path; in a .py file
# os.path.dirname(os.path.realpath(__file__)) would be used instead.
pwd = os.getcwd()
# Data folder located directly under the base path
server_app_folder = os.path.join(pwd, 'server_app_data')

# Function definition
from functions import save_to_csv, sort_list_logs, flatten_list

# Create new folder if not exists
def mkdir(folder_dir):
    """Create `folder_dir`, including missing parent folders, if it is absent.

    A relative `folder_dir` is resolved against `pwd`; an absolute path is
    used as given (os.path.join discards `pwd` in that case). Calling this
    for a folder that already exists does nothing.
    """
    # exist_ok=True replaces the separate existence check, so the path that
    # is tested and the path that is created are always the same one, and a
    # folder appearing between check and creation no longer raises.
    os.makedirs(os.path.join(pwd, folder_dir), exist_ok=True)

# Create folder if not exist
mkdir(server_app_folder)

## Get Pterodactyl Application information
About: locations, nodes, nests, eggs, servers, clients

In [5]:
# Extracting data from Pterodactyl App
# Every list_* call below returns a response object that is then iterated.
# Most comprehensions treat each iterated element as a list of resources and
# keep the 'attributes' dict of every resource, producing a list of lists.
# NOTE(review): presumably each iterated element is one page of results —
# confirm against the pydactyl response type.
list_of_clients = api_app.user.list_users()
all_clients = [[client['attributes'] for client in clients]for clients in list_of_clients]

list_of_locations = api_app.locations.list_locations()
all_locations = [[location['attributes'] for location in locations]for locations in list_of_locations]

list_of_nodes = api_app.nodes.list_nodes()
all_nodes = [[node['attributes']for node in nodes] for nodes in list_of_nodes]

# Nodes are requested again with their allocations included; every allocation
# dict is extended with the id of the node it belongs to.
# The trailing [0] keeps only the first outer element of the result.
list_of_nodes_and_allocations = api_app.nodes.list_nodes(includes=['allocations'])
all_allocations = [[[{'node_id': node['attributes']['id'], **allocations['attributes']} for allocations in node['attributes']['relationships']['allocations']['data']] for node in nodes] for nodes in list_of_nodes_and_allocations][0]

list_of_nests_and_eggs = api_app.nests.list_nests(includes=['eggs'])
all_nests = [[nest['attributes'] for nest in nests] for nests in list_of_nests_and_eggs]
# NOTE(review): all_nests iterates each element of list_of_nests_and_eggs as a
# list, while the next line indexes each element with ['attributes'] as if it
# were a single nest. Both forms cannot fit the same iterable — confirm which
# shape the response yields. The response object is also iterated a second
# time here; verify that it can be re-iterated.
all_eggs = [[eggs['attributes'] for eggs in nests['attributes']['relationships']['eggs']['data']] for nests in list_of_nests_and_eggs]

list_of_servers_and_clients = api_app.servers.list_servers(includes=['subusers'])
all_servers = [[server['attributes'] for server in servers] for servers in list_of_servers_and_clients]
# NOTE(review): same shape mismatch as for the eggs — all_servers iterates each
# element as a list, the next line indexes it as a single server — and the
# response object is again iterated twice. Confirm before relying on it.
all_client_server = [[client_server['attributes'] for client_server in servers['attributes']['relationships']['subusers']['data']] for servers in list_of_servers_and_clients]

# Get the current timestamp with timezone information (UTC)
last_update = datetime.datetime.now(datetime.timezone.utc)

# Cleaning and filtering columns
def _select_columns(records, columns):
    """Return a DataFrame built from `records`, restricted to `columns` in order.

    An empty `records` list yields an empty frame that still carries
    `columns`, instead of the column selection failing with a KeyError.
    Non-empty records that lack one of the columns still raise KeyError.
    """
    frame = pd.DataFrame(records)
    if frame.empty:
        return pd.DataFrame(columns=columns)
    return frame[columns]


def _first_present(mapping, *keys):
    """Return the value of the first of `keys` found in `mapping`, else None."""
    for key in keys:
        if key in mapping:
            return mapping[key]
    return None


# NOTE(review): only the first element of each all_* list is used below. If
# the API response spans several pages, later pages are not exported — confirm.
df_clients = _select_columns(
    all_clients[0],
    ['id', 'uuid', 'username', 'email', 'first_name', 'last_name', 'root_admin', '2fa', 'created_at', 'updated_at'],
).rename(columns={'username': 'client_name', 'root_admin': 'admin'})

df_locations = _select_columns(all_locations[0], ['id', 'short', 'long', 'created_at', 'updated_at'])

# Nodes: lift the nested 'allocated_resources' values into flat columns
df_nodes = pd.DataFrame(all_nodes[0])
for resource in ('memory', 'disk'):
    df_nodes[f'allocated_{resource}'] = df_nodes['allocated_resources'].apply(
        lambda allocated, key=resource: allocated.get(key, None)
    )
df_nodes = df_nodes[['id', 'uuid', 'public', 'name', 'description', 'location_id', 'fqdn', 'scheme', 'behind_proxy', 'maintenance_mode', 'memory', 'disk', 'allocated_memory', 'allocated_disk', 'upload_size', 'daemon_listen', 'daemon_sftp', 'daemon_base', 'created_at', 'updated_at']]

df_allocations = _select_columns(flatten_list(all_allocations), ['id', 'port', 'assigned', 'node_id'])

df_nests = _select_columns(all_nests[0], ['id', 'uuid', 'name', 'description', 'author', 'created_at', 'updated_at'])

df_eggs = _select_columns(
    all_eggs[0],
    ['id', 'uuid', 'name', 'description', 'nest', 'author', 'created_at', 'updated_at'],
).rename(columns={'nest': 'nest_id'})

# Servers: lift the nested 'limits' / 'feature_limits' values into flat columns.
# Each target column maps to (source column, candidate keys tried in order).
# NOTE(review): the panel's server payload appears to name these keys
# 'oom_disabled', 'databases', 'allocations' and 'backups'. The original
# singular keys are kept as a fallback — confirm against a real API response.
df_servers = pd.DataFrame(all_servers[0])
server_limit_columns = {
    'limit_memory': ('limits', ('memory',)),
    'limit_disk': ('limits', ('disk',)),
    'limit_io': ('limits', ('io',)),
    'limit_cpu': ('limits', ('cpu',)),
    'limit_oom_disable': ('limits', ('oom_disabled', 'oom_disable')),
    'limit_database': ('feature_limits', ('databases', 'database')),
    'limit_allocation': ('feature_limits', ('allocations', 'allocation')),
    'limit_backup': ('feature_limits', ('backups', 'backup')),
}
for limit_column, (source_column, candidate_keys) in server_limit_columns.items():
    df_servers[limit_column] = df_servers[source_column].apply(
        lambda limits, keys=candidate_keys: _first_present(limits, *keys)
    )
df_servers = df_servers[['id', 'uuid', 'identifier', 'name', 'description', 'limit_memory', 'limit_disk', 'limit_io', 'limit_cpu', 'limit_oom_disable', 'limit_database', 'limit_allocation', 'limit_backup', 'user', 'node', 'allocation', 'nest', 'egg', 'created_at', 'updated_at']].rename(columns={'user': 'client_id', 'node': 'node_id', 'allocation': 'allocation_id', 'nest': 'nest_id', 'egg': 'egg_id'})

# Subusers: one inner list per server is flattened into a single list of records.
# The list is empty when no server has subusers; _select_columns covers that case.
flattened_client_server = [item for sublist in all_client_server for item in sublist]
df_clients_server = _select_columns(
    flattened_client_server,
    ['id', 'user_id', 'server_id', 'created_at', 'updated_at'],
).rename(columns={'user_id': 'client_id'})

# Exporting data into .csv files
# Target file name -> DataFrame to write, in the order the files are created
csv_exports = {
    'clients.csv': df_clients,
    'locations.csv': df_locations,
    'nodes.csv': df_nodes,
    'allocations.csv': df_allocations,
    'nests.csv': df_nests,
    'eggs.csv': df_eggs,
    'servers.csv': df_servers,
    'clients_server.csv': df_clients_server,
}

# Each table is written into server_app_folder without the DataFrame index
for csv_name, csv_frame in csv_exports.items():
    csv_frame.to_csv(os.path.join(server_app_folder, csv_name), index=False)

## Upload csv table files into Postgres

In [None]:
# Setup database variables
ID = 'id'
IS_ACTIVE_TABLE = 'is_active_table'
SCHEMA = 'pterodactyl'
SCHEMA_UPDATE = 'pterodactyl_update'

engine = create_engine(connection)

for file_table in os.listdir(server_app_folder):

    # Only .csv files are tables; skip any other entry found in the folder
    if not file_table.endswith('.csv'):
        continue

    # Table names are derived per file: '<name>.csv' -> '<name>' and
    # '<name>_update'. This has to happen inside the loop, where file_table
    # is defined and changes with every file.
    TABLE = file_table.split('.')[0]
    TABLE_UPDATE = TABLE + '_update'

    # Reading of the file_table
    df = pd.read_csv(os.path.join(server_app_folder, file_table))

    # Start connection with database
    with engine.connect() as conn:
        # Start a new transaction
        trans = conn.begin()

        try:
            # Load ID from database
            result = conn.execute(text(f'SELECT "{ID}" FROM {SCHEMA}.{TABLE}'))
            db = pd.DataFrame(result.fetchall(), columns=result.keys())

            # Compare ID: rows of the file whose id is already stored are
            # updates, the remaining rows are new, and stored ids that are
            # missing from the file are flagged as inactive further below
            in_database = df[ID].isin(db[ID])
            toUpdate = df[in_database]
            toIngest = df[~in_database]
            toDelete = db[~db[ID].isin(df[ID])]

            # Insert the DataFrame into a table
            toIngest.to_sql(TABLE, conn, schema=SCHEMA, if_exists='append', index=False)

            # Insert the updatable DataFrame into the TABLE_UPDATE table
            toUpdate.to_sql(TABLE_UPDATE, conn, schema=SCHEMA_UPDATE, if_exists='append', index=False)

            # Replace the stored rows with their staged versions, then empty the staging table
            conn.execute(text(f'DELETE FROM {SCHEMA}.{TABLE} WHERE "{ID}" IN (SELECT "{ID}" FROM {SCHEMA_UPDATE}.{TABLE_UPDATE});'))
            conn.execute(text(f'INSERT INTO {SCHEMA}.{TABLE} SELECT * FROM {SCHEMA_UPDATE}.{TABLE_UPDATE};'))
            conn.execute(text(f'TRUNCATE TABLE {SCHEMA_UPDATE}.{TABLE_UPDATE};'))

            # Update column "is_active" from tables when data is deleted from Pterodactyl App
            toDelete.to_sql(IS_ACTIVE_TABLE, conn, schema=SCHEMA_UPDATE, if_exists='append', index=False)
            conn.execute(text(f'UPDATE {SCHEMA}.{TABLE} SET is_active = false WHERE "{ID}" IN (SELECT * FROM {SCHEMA_UPDATE}.{IS_ACTIVE_TABLE});'))
            conn.execute(text(f'TRUNCATE TABLE {SCHEMA_UPDATE}.{IS_ACTIVE_TABLE};'))

            # Commit the transaction
            trans.commit()

        except Exception as e:
            # Rollback the transaction on exception, report it and re-raise
            print('!!! [ERROR IN DATABASE QUERIES] !!!')
            trans.rollback()
            print('Transaction has been rolled back')
            print(f'Error occurred during transaction:\n{e}')
            raise

with engine.connect() as conn:
    # Start a new transaction
    trans = conn.begin()

    try:
        # Move the stored date forward to last_update when last_update is newer
        old_last_update = conn.execute(text(f'SELECT date FROM {SCHEMA}.last_update')).fetchall()[0][0]
        new_last_update = last_update
        if new_last_update > old_last_update:
            # The timestamp is passed as a bound parameter, so the driver
            # handles its quoting and typing instead of string formatting
            conn.execute(
                text(f'UPDATE {SCHEMA}.last_update SET date = :new_date;'),
                {'new_date': new_last_update},
            )

        # Commit the transaction
        trans.commit()

    except Exception as e:
        # Rollback the transaction on exception, report it and re-raise
        print('!!! [ERROR IN DATABASE QUERIES] !!!')
        trans.rollback()
        print('Transaction has been rolled back')
        print(f'Error occurred during transaction:\n{e}')
        raise