In [7]:
from pathlib import Path 
from typing import List
from openpecha_data_cataloger.utility import remove_duplicates_and_empty_elements

"""Get the folders in a directory with starting with char"""
def list_folders(directory_path: Path, starting_char: str) -> List[Path]:
    """Return the sub-directories located directly inside ``directory_path``.

    Args:
        directory_path: Directory whose immediate children are inspected.
        starting_char: Accepted so existing callers keep working, but it is
            NOT applied: no name-prefix filtering happens and every
            sub-directory is returned regardless of its first character.

    Returns:
        One ``Path`` object (not a string) per child of ``directory_path``
        that is a directory, in the order produced by ``Path.iterdir()``.
        Regular files are skipped.
    """
    return [folder for folder in directory_path.iterdir() if folder.is_dir()]


# Prefix handed to list_folders, and the local folder holding the cloned pechas.
starting_char = "P"
directory_path = Path("/home/tenzin3/.openpecha/pechas/others")
folders = list_folders(directory_path=directory_path, starting_char=starting_char)


In [8]:
# Reduce every downloaded folder to its bare name, then pass the names through
# the project helper (presumably drops duplicates and blanks, per its name).
folder_names = [entry.name for entry in folders]
downloaded_opfs = remove_duplicates_and_empty_elements(folder_names)
print(f"Already downloaded opfs: {len(downloaded_opfs)}")

Already downloaded opfs: 335


In [9]:
from openpecha_data_cataloger.config import DATA_DIR

# Load the batch file (one entry per line) that lists the expected OPF ids,
# then pass the lines through the project helper (presumably drops duplicates
# and blanks, per its name).
opfs_list_path = Path(DATA_DIR / "opf_batches" / "others_batch.txt")
batch_lines = opfs_list_path.read_text().split("\n")
opf_list = remove_duplicates_and_empty_elements(batch_lines)
print(f"opfs list: {len(opf_list)}")

opfs list: 335


In [13]:
# Print every expected OPF id that has no matching downloaded folder.
for opf in opf_list:
    if opf in downloaded_opfs:
        continue
    print(opf)

In [4]:
# Quick sanity check: show the first ten downloaded ids.
first_ten_opfs = downloaded_opfs[:10]
print(first_ten_opfs)

['I09408022', 'I48005610', 'I62DFC622', 'IE6FB74D8', 'IEE2B6DB6', 'IAB7A647F', 'I3A83745B', 'IB9B7D29A', 'I167D35DF', 'IB3824773']


In [7]:
import subprocess


def _mkdir(path):
    if path.is_dir():
        return path
    path.mkdir(exist_ok=True, parents=True)
    return path


def clone_github_repo(
    repository,
    destination_folder: Path,
    organization: str = "OpenPecha-Data",
):
    """Clone ``organization/repository`` from GitHub into ``destination_folder``.

    Nothing is done (and ``None`` is returned) when ``destination_folder``
    already exists.

    Authentication uses the personal access token found in the
    ``GITHUB_TOKEN`` environment variable; when the variable is unset or empty
    the clone is attempted anonymously.

    NOTE(security): an earlier revision embedded a real token in this source.
    That token must be treated as leaked and revoked on GitHub.

    Args:
        repository: Name of the repository to clone.
        destination_folder: Directory the repository is cloned into. Missing
            parent directories are created.
        organization: GitHub organization (or user) that owns the repository.

    Raises:
        RuntimeError: If ``git clone`` exits with a non-zero status or any
            other error occurs. The message never contains the token.
    """
    # Function-scope imports keep this cell self-contained.
    import os
    import shutil

    github_token = os.environ.get("GITHUB_TOKEN", "")
    repo_url = f"https://github.com/{organization}/{repository}.git"
    if github_token:
        auth_repo_url = repo_url.replace("https://", f"https://{github_token}@", 1)
    else:
        auth_repo_url = repo_url

    clone_attempted = False
    try:
        if destination_folder.exists():
            return

        # Only the parent is created up front: pre-creating the destination
        # would leave an empty directory behind after a failed clone, and the
        # exists() check above would then skip the repository forever.
        destination_folder.parent.mkdir(parents=True, exist_ok=True)

        command = ["git", "clone", auth_repo_url, str(destination_folder)]
        clone_attempted = True
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        shutil.rmtree(destination_folder, ignore_errors=True)
        # str(e) embeds the full command line, i.e. the token-bearing URL, so
        # only the exit status is reported; `from None` keeps the original
        # exception (and the token) out of rendered tracebacks.
        raise RuntimeError(
            f"Failed to clone repository {repo_url}: "
            f"git clone exited with status {e.returncode}"
        ) from None
    except Exception as e:
        if clone_attempted:
            shutil.rmtree(destination_folder, ignore_errors=True)
        detail = str(e)
        if github_token:
            detail = detail.replace(github_token, "***")
        raise RuntimeError(
            f"An error occurred while cloning repository {repo_url}: {detail}"
        ) from e

In [1]:
import logging 

# Output files: `progress_fn` receives the ids appended by log_progress_with_id,
# `log_fn` receives the records emitted through the logging module.
progress_fn = "progress.log"
log_fn = "errors.log"

# Root logger: ERROR level and above, timestamped, written to `log_fn`.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.ERROR,
    filename=log_fn,
)

def log_progress_with_id(id_: str):
    """Append ``id_`` on its own line to the progress file named by ``progress_fn``."""
    entry = f"{id_}\n"
    with open(progress_fn, "a") as progress_file:
        progress_file.write(entry)

# Ids to clone in this run. NOTE: this rebinds `opf_list`, replacing any list
# built by an earlier cell.
opf_list = ["10df3300ae84901a884042ed9c68861"]

for opf in opf_list:
    # Record the id before attempting the clone so the progress file shows
    # every id that was tried, including ones that fail.
    log_progress_with_id(opf)
    try:
        # No membership pre-check is needed here: clone_github_repo does
        # nothing when the destination folder already exists.
        clone_github_repo(opf, Path(f"/home/tenzin3/.openpecha/pechas/others/{opf}"))
    except Exception as e:
        # Best effort: log the failure and move on to the next id.
        logging.error(e)