In [1]:
import pandas as pd
import os
import sys
from io import BytesIO

import osfclient
from osfclient.utils import norm_remote_path

See [osfclient documentation](https://osfclient.readthedocs.io/en/latest/cli-usage.html) for details.

OSF storage limits:
* private components: 5 GB
* public components: 50 GB
* [Possible providers](https://help.osf.io/hc/en-us/articles/360019737894-FAQs#:~:text=OSF%20supports%20many%20third%2Dparty,connect%20to%20Mendeley%20and%20Zotero.) of storage add-ons: Amazon S3, Bitbucket, Box, Dataverse, Dropbox, Figshare, GitHub, GitLab, Google Drive, OneDrive, ownCloud

In [2]:
# Load the OSF credentials from a file in the working directory with the format
# username=XXXX
# password=XXXX
# The result is a dict mapping each key (text before the first "=") to its value.
osf_credentials = {}
with open("osf_credentials.txt", "r") as credfile:
    for line in credfile:
        # drop only the line ending, so the value does not carry a trailing
        # newline into the authentication request
        key, separator, value = line.rstrip("\r\n").partition("=")
        # skip blank lines and lines without a "key=value" structure
        if not separator:
            continue
        # partition() splits on the first "=" only, so values that contain
        # "=" themselves (e.g. passwords) stay intact
        osf_credentials[key.strip()] = value

# Read a file into a data frame

In [4]:
remote_path = "data/test_csv.csv" # remote file path & file name
storage = "osfstorage" # seems to be the name of the default OSF storage provider
project_ID = "2eyms" # get this from the URL of the project/component in the browser

# initialize the client and authenticate with the API
osf = osfclient.OSF(
    username=osf_credentials["username"],
    password=osf_credentials["password"]
)
project = osf.project(project_ID)

# get a handle to the project's storage provider with the given name
# NOTE(review): presumably this only selects the provider and does not
# transfer any file content yet -- confirm against the osfclient docs
store = project.storage(storage)

# iterate through all files in the remote storage
for file_ in store.files:
    # if the file matches the wanted file, read it
    if norm_remote_path(file_.path) == remote_path:
        # create a binary file buffer
        fp = BytesIO()
        # manually tell the BytesIO object that it's in "binary" mode since the
        # osfclient.models.file.File object expects the "file" handle to have
        # a "mode" attribute
        fp.mode = "b"
        # write the remote content to the buffer
        file_.write_to(fp)
        # reset the buffer content pointer (points to the end of the file after
        # writing the remote content)
        fp.seek(0)
        # read the buffer into a data frame
        df = pd.read_csv(fp)
        # only reading one file, so stop listing the remaining remote files
        break
else:
    # the loop finished without a match: fail loudly instead of leaving `df`
    # undefined (or silently showing a stale `df` from an earlier run)
    raise FileNotFoundError(
        "Remote file %s not found in storage %s of project %s."
        % (remote_path, storage, project_ID)
    )

df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1.39M/1.39M [00:00<00:00, 19.5Mbytes/s]


Unnamed: 0,__twarc.retrieved_at,__twarc.url,__twarc.version,attachments.media,attachments.media_keys,attachments.poll.duration_minutes,attachments.poll.end_datetime,attachments.poll.id,attachments.poll.options,attachments.poll.voting_status,...,reply_settings,source,text,type,withheld.scope,withheld.copyright,withheld.country_codes,hashtags,reference_type,wanted_tag
0,,,,,,,,,,,...,everyone,Twitter Web App,Für live Tweets folgt bitte u.a. @karolinedoer...,quoted,,,,"['entfristethanna', 'ichbinhanna']",no_reference,False
1,2021-10-01T07:00:45+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.3.10,,,,,,,,...,everyone,Twitter Web App,"Moin! @AndreasBovensc1, @SWH_HB, @SPDLandBreme...",,,,,['HannaWählt'],quoted,True
2,,,,,,,,,,,...,everyone,Twitter for Android,"Wir sind heute morgen auf einen # gestoßen, d...",quoted,,,,,no_reference,False
3,2021-10-01T07:00:45+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.3.10,,,,,,,,...,everyone,Twitter Web App,"Okay, @AndreasBovensc1, ich habe meinen heutig...",,,,,"['HannaWählt', 'IchBinHanna']",quoted,True
4,,,,,,,,,,,...,everyone,Twitter for Android,#IchbinHanna und ich hab kleine Kinder. Liebe ...,quoted,,,,['IchbinHanna'],no_reference,False


# Download a file

In [25]:
local_path = "../data/osf_test/testfile.txt" # local file path & file name
remote_path = "data/testfile.txt" # remote file path & file name
storage = "osfstorage" # seems to be the name of the default OSF storage provider
project_ID = "2eyms" # get this from the URL of the project in the browser

# if no local path is given, save under the remote file name in the working directory
if local_path is None:
    _, local_path = os.path.split(remote_path)

# never overwrite an existing local file; raise instead of calling sys.exit(),
# which is meant for scripts and not for notebook cells
if os.path.exists(local_path):
    raise FileExistsError(
        "Local file %s already exists, not overwriting." % local_path
    )

directory, _ = os.path.split(local_path)

# initialize the client
osf = osfclient.OSF(
    username=osf_credentials["username"],
    password=osf_credentials["password"]
)
# initialize the project (can also be a "component" of a larger project)
project = osf.project(project_ID)

store = project.storage(storage)
for file_ in store.files:
    if norm_remote_path(file_.path) == remote_path:
        # create the target directory only once the remote file is known to
        # exist, so a failed lookup does not leave empty folders behind
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(local_path, 'wb') as fp:
            file_.write_to(fp)

        # only fetching one file so we are done
        break
else:
    # the loop finished without a match: report it instead of ending silently
    raise FileNotFoundError(
        "Remote file %s not found in storage %s of project %s."
        % (remote_path, storage, project_ID)
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10.0/10.0 [00:00<00:00, 40.6kbytes/s]
