# Introduction
You can find a web interface for MinIO at [console.share.pads.fim.uni-passau.de](https://console.share.pads.fim.uni-passau.de/).
Create a service account under /account to obtain an **access key** and a **secret key**. These have to immediately noted down (but you can create multiple such pairs). They are used as access credentials with a MinIO client.

# Imports

In [65]:
import os
import json
import s3fs
import uuid
import requests
import pandas as pd

from tqdm import tqdm
from minio import Minio
from minio.error import S3Error

# Obtaining an MinIO Access Token
- links to the infrastructure
- link to the wiki explaining that as well
- OAuth2 reference

In [77]:
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")
config_access = {
    "accesskey": MINIO_ACCESS_KEY,
    "secretkey": MINIO_SECRET_KEY,
    "endpoint": "https://share.innkube.fim.uni-passau.de/",
}

In [78]:
config_access["accesskey"]

'Swbbp1T9AMXHrqzwteHU'

In [79]:
path_pads_config = os.path.join(os.path.expanduser("~"), ".pypads", "access-minio.json")

In [80]:
if True:  # Set this to true to remove your local configuration file
    os.remove(path_pads_config)
    print("Do not forget to deactivate this cell again via the above `if True` condition.")

Do not forget to deactivate this cell again via the above `if True` condition.


In [81]:
overwrite = False
if not os.path.exists(path_pads_config):
    overwrite = True
    path_pads_base = os.path.dirname(path_pads_config)
    if not os.path.exists(path_pads_base):
        os.makedirs(path_pads_base)
else:
    with open(path_pads_config, "r") as handle_read_pads_config:
        config_access = json.load(handle_read_pads_config)

print("Endpoint in your access configuration is set to\n\t%s" % config_access["endpoint"])

if config_access["accesskey"] is None or config_access["accesskey"] == "":
    print("WARNING: your accesskey is currently not set in %s" % path_pads_config)
    overwrite = True
if config_access["secretkey"] is None or config_access["secretkey"] == "":
    print("WARNING: your secretkey is currently not set in %s" % path_pads_config)
    overwrite = True
    
if overwrite:
    with open(path_pads_config, "w+") as handle_write_pads_config:
        json.dump(config_access, handle_write_pads_config)
        print("Written access config to '%s'." % path_pads_config)

accesskey = config_access["accesskey"]
secretkey = config_access["secretkey"]

Endpoint in your access configuration is set to
	https://console.share.innkube.fim.uni-passau.de/
Written access config to '/root/.pypads/access-minio.json'.


# Connect to MinIO with S3FileSystem Client

In [91]:
s3 = s3fs.S3FileSystem(
    key=accesskey,
    secret=secretkey,
    use_ssl=True,
    client_kwargs={
      "region_name"       : "Europe",
      "endpoint_url": "https://share.innkube.fim.uni-passau.de/"
    }
)

In [92]:
files = s3.ls("public")

PermissionError: Access Denied.

In [83]:
s3.ls("/")

OSError: [Errno 22] S3 API Requests must be made to API port.

In [13]:
s3.ls("/homes/stier/")

['homes/stier/abc']

In [14]:
def delete_path(path):
    name_base = os.path.dirname(path)
    if name_base is None or name_base == "" or name_base == "/":
        raise ValueError("Do not delete base buckets easily. You passed %s" % path)
    
    if not s3.exists(path):
        raise FileNotFoundError("No such bucket/path %s" % path)

    if s3.isfile(path):
        try:
            s3.rm(path)
        except FileNotFoundError:
            pass
    else:
        for path_file in s3.ls(path, refresh=True):
            delete_path(path_file)
        if s3.exists(path):
            from botocore.exceptions import ParamValidationError
            try:
                # The bucket might be already removed, then we catch 
                s3.rmdir(path)
            except ParamValidationError:
                #print("WARNING: failed removing s3 path '%s'" % path)
                s3.rm(path)

In [15]:
s3.ls("datasets")

[]

In [16]:
s3.info("datasets")

{'CreationDate': datetime.datetime(2021, 12, 10, 12, 42, 47, 785000, tzinfo=tzutc()),
 'Key': 'datasets',
 'Size': 0,
 'StorageClass': 'BUCKET',
 'size': 0,
 'type': 'directory',
 'name': 'datasets'}

In [17]:
path_base = "/homes/stier/"  # Adapt this to your desired base bucket path

In [18]:
# Let's create a demo base folder
name_demo = str(uuid.uuid4())
path_base_demo = os.path.join(path_base, "demofolder-" + name_demo)
path_base_demo

'/homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82'

In [19]:
# Execute the creation of the base folder and show infos of the bucket
try:
    s3.mkdir(path_base_demo)
    print("Bucket %s created just now." % path_base_demo)
except FileExistsError:
    print("Bucket %s already created. Did you execute the cell twice?" % path_base_demo)

Bucket /homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82 created just now.


In [20]:
# A tool to check whether a specific path exists
print("Does", path_base_demo, "exist?", s3.exists(path_base_demo))

Does /homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82 exist? False


In [21]:
# Create an empty file by 'touching' it
file_empty = os.path.join(path_base_demo, "empty_file.txt")
print(file_empty)
s3.touch(file_empty)
print("Does", file_empty, "exist?", s3.exists(file_empty))

/homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/empty_file.txt
Does /homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/empty_file.txt exist? True


In [22]:
# Check whether the path now exists
print("Does", path_base_demo, "exist?", s3.exists(path_base_demo))
s3.info(path_base_demo), s3.ls(path_base_demo)

Does /homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82 exist? True


({'Key': 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82',
  'name': 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82',
  'type': 'directory',
  'Size': 0,
  'size': 0,
  'StorageClass': 'DIRECTORY'},
 ['homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/empty_file.txt'])

In [23]:
# Creating a file with UTF-8 encoded json text in it
path_file_hyperparams = os.path.join(path_base_demo, "hyperparams.json")
path_file_hyperparams

'/homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/hyperparams.json'

In [24]:
hyperparams = {
    "model": "ResNet18",
    "learningrate": 0.01,
    "numberepochs": 100
}
with s3.open(path_file_hyperparams, "wb") as handle_hyperparams:
    written_json = json.dumps(hyperparams, indent=2)
    handle_hyperparams.write(bytes(written_json.encode("utf-8")))

In [25]:
# .. let's read back the file and print its content
try:
    with s3.open(path_file_hyperparams, "rb") as handle_hyperparams:
        read_bytes = handle_hyperparams.read()
        print(json.loads(read_bytes.decode("utf-8")))
except FileNotFoundError:
    print("The file %s does not exist on MinIO anymore!" % path_file_hyperparams)

{'model': 'ResNet18', 'learningrate': 0.01, 'numberepochs': 100}


In [26]:
# .. and removing the file again
s3.rm(path_file_hyperparams)
print("Does", os.path.basename(path_file_hyperparams), "still exist?", s3.exists(path_file_hyperparams))

Does hyperparams.json still exist? False


In [27]:
# Let's fill it with some auto-generated trash files
for ix in range(20):
    file_tmp_ix = os.path.join(path_base_demo, "%s.json" % ix)
    dict_content = {
        "file_idx": ix
    }
    with s3.open(file_tmp_ix, "wb") as handle_tmp_write:
        written_json = json.dumps(dict_content, indent=2)
        handle_tmp_write.write(bytes(written_json.encode("utf-8")))

In [28]:
s3.ls("/homes/stier/")

['homes/stier/abc',
 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82']

In [29]:
s3.ls(path_base_demo, refresh=True)[:5]

['homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/0.json',
 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/1.json',
 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/10.json',
 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/11.json',
 'homes/stier/demofolder-c73a373e-8800-4796-8712-1fcc9674af82/12.json']

In [30]:
delete_path(path_base_demo)

In [31]:
s3.exists("/homes/stier/demofolder-43640d8b-ae82-4c81-b0dd-aec06e92fa41")

False

In [32]:
s3.ls("/homes/stier/demofolder-43640d8b-ae82-4c81-b0dd-aec06e92fa41", refresh=True)

[]

In [33]:
s3.rm("/homes/stier/demofolder-43640d8b-ae82-4c81-b0dd-aec06e92fa41")

In [34]:
url_data_digits_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra"
url_data_digits_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes"
file_data_digits_train = os.path.join(path_base_demo, "optdigits.tra")
file_data_digits_test = os.path.join(path_base_demo, "optdigits.tes")

In [35]:
%%time
# Download digits data from UCI database if it does not exist in MinIO, yet
if not s3.exists(file_data_digits_train):
    response = requests.get(url_data_digits_train, stream=True)
    with s3.open(file_data_digits_train, "wb") as handle:
        for data in tqdm(response.iter_content()):
            handle.write(data)


with s3.open(file_data_digits_train, "rb") as handle_data_read:
    df = pd.read_csv(handle_data_read)

df.describe()

563639it [00:03, 165161.98it/s]


CPU times: user 3.43 s, sys: 30.3 ms, total: 3.46 s
Wall time: 4.72 s


Unnamed: 0,0,1,6,15,12,1.1,0.1,0.2,0.3,7,...,0.21,0.22,0.23,6.3,14.1,7.4,1.3,0.24,0.25,0.26
count,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,...,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0,3822.0
mean,0.0,0.301151,5.481685,11.805076,11.451334,5.506541,1.387755,0.142334,0.002093,1.959184,...,0.148352,0.000262,0.283098,5.855835,11.942439,11.462323,6.701988,2.106227,0.20225,4.49843
std,0.0,0.867026,4.632199,4.260055,4.538141,5.613322,3.37181,1.051733,0.088583,3.051663,...,0.767858,0.016175,0.928156,4.980663,4.334947,4.992066,5.775834,4.028649,1.15084,2.869284
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,10.0,9.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,5.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,13.0,13.0,6.0,0.0,0.0,4.0
75%,0.0,0.0,9.0,15.0,15.0,10.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,10.0,15.0,16.0,12.0,2.0,0.0,7.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,16.0,5.0,15.0,...,12.0,1.0,10.0,16.0,16.0,16.0,16.0,16.0,16.0,9.0


In [36]:
# Delete cache for digits train file on MinIO
s3.rm(file_data_digits_train)

In [37]:
%%time
# Download digits data from UCI database if it does not exist in MinIO, yet
if not s3.exists(file_data_digits_test):
    response = requests.get(url_data_digits_test, stream=True)
    with s3.open(file_data_digits_test, "wb") as handle:
        for data in tqdm(response.iter_content()):
            handle.write(data)


with s3.open(file_data_digits_test, "rb") as handle_data_read:
    df = pd.read_csv(handle_data_read)

df.describe()

264712it [00:01, 145009.21it/s]


CPU times: user 1.77 s, sys: 30 µs, total: 1.77 s
Wall time: 2.65 s


Unnamed: 0,0,0.1,5,13,9,1,0.2,0.3,0.4,0.5,...,0.23,0.24,0.25,6,13.2,10.2,0.26,0.27,0.28,0.29
count,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,...,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0,1796.0
mean,0.0,0.304009,5.2049,11.835189,11.849666,5.784521,1.363029,0.129733,0.005568,1.994989,...,0.20657,0.000557,0.27951,5.55735,12.08853,11.810134,6.767817,2.069042,0.364699,4.493318
std,0.0,0.907416,4.756148,4.249936,4.288055,5.666871,3.326546,1.037667,0.094248,3.196704,...,0.984663,0.023596,0.934539,5.10443,4.37586,4.935137,5.900105,4.091396,1.86062,2.86414
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0,4.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0,7.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0,9.0


In [38]:
# Delete cache for digits train file on MinIO
s3.rm(file_data_digits_test)

In [39]:
s3.ls("/homes/stier/")

['homes/stier/abc']

In [40]:
s3.glob("/homes/stier/*")

[]

# Variant B: MinIO Python Client

In [41]:
client = Minio(
    "share.pads.fim.uni-passau.de",
    access_key=accesskey,
    secret_key=secretkey,
)

- [ ] test connection with minio
- [ ] list an accessible bucket
- [ ] list all content in a bucket
- [ ] create a new bucket
- [ ] delete a bucket
- [ ] upload a file to minio
- [ ] read / download the content of a file from minio
- [ ] handling large files on minio

In [42]:
client.bucket_exists("homes")

True

In [43]:
client.bucket_exists("homes/stier")

ValueError: Bucket name does not follow S3 standards. Bucket: homes/stier

In [None]:
buckets = client.list_buckets()
for bucket in buckets:
    print(bucket.name, bucket.creation_date)

In [None]:
objs = client.list_objects("homes", prefix="st")
for obj in objs:
    print(obj.object_name)

In [None]:
objs = client.list_objects("homes/stier")
for obj in objs:
    print(obj.object_name)

In [None]:
bucket_home = client.list_objects("homes")
for obj in bucket_home:
    print(obj)

# OAuth2 Client Authentication
- [ ] local config file format? -- reconstruct previous code
- [ ] clean up access classes
- [ ] perform authentication with user credentials?
- [ ] perform auth with access key + secret key?
- [ ] refreshin session

*~/.pypads/access.cfg*
```cfg
[DEFAULT]
padim-client-id = julian
padim-client-secret = pypads
padim-username = stier
padim-password = foo
padim-region-name = eu-de
padim-endpoint = "https://share.pads.fim.uni-passau.de"
```

In [None]:
import padaccs
from padaccs import get_refreshable_session, padas_kwargs, get_s3fs
from padaccs import _get_w_def

In [None]:
padaccs._load_pypads_config(padaccs._config)

In [None]:
_get_w_def("padim-client-id", "foo")

In [None]:
padaccs._config["DEFAULT"]["padim-client-id"]

In [None]:

# https://s3fs.readthedocs.io/en/latest/ for documentation. Works with pandas, csv, parquet and many more
# parquet is recommended, as it can reduce the amount of network traffic

fs = s3fs.S3FileSystem(anon=False, use_ssl=True, 
                       session= await get_refreshable_session().make_refreshable_proxy(),
                       client_kwargs=padas_kwargs)

print(fs.ls("public/"))

In [None]:
from padaccs import get_s3fs, get_refreshable_session
import asyncio

In [None]:
s3_session = get_refreshable_session()
loop = asyncio.get_event_loop()
if loop.is_running():
    loop.

In [None]:
loop

In [None]:
asyncio.get_event_loop()


In [None]:
import s3fs
fs = s3fs.S3FileSystem(anon=False, use_ssl=True, session=await s3_session.make_refreshable_proxy(),
                  client_kwargs={
                        "region_name"       : "eu-de",
                        "endpoint_url"      : "https://share.pads.fim.uni-passau.de/"
                    }, profile="pypads"
                )

In [None]:
fs.ls("public/")