<a href="https://colab.research.google.com/github/Tob-iee/data-experiment_tracking/blob/main/Notebooks/run_train_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Notebook configurations  🏗

#@markdown Is the repository mirrored from GitHub to DagsHub?
MIRROR = True #@param {type:"boolean"}

#@markdown Clone the Git repo to the Colab runtime
CLONE = True #@param {type:"boolean"}

#@markdown Pull the changes from the Git server to the Colab runtime
PULL_GIT = True #@param {type:"boolean"}

#@markdown Initialize DVC in this repository (should only be done once **per repository**)
INIT_DVC = False #@param {type:"boolean"}

#@markdown Set DagsHub storage as DVC's remote (should only be done once **per repository**)
SET_DVC_REMOTE = True #@param {type:"boolean"}

#@markdown Set DVC's user configurations for the DagsHub user (set locally - should be done once **per runtime**)
SET_DVC_USER = True #@param {type:"boolean"}

#@markdown Pull the changes from the DagsHub storage to the Colab runtime
PULL_DVC = True #@param {type:"boolean"}

#@markdown Host the DVC cache directory on Google Drive (avoids pulling the data on every runtime)
GDRIVE_CACHE = False #@param {type:"boolean"}

#@markdown Configure MLflow remote tracking server
MLFLOW = True #@param {type:"boolean"}

#@markdown Use Google Drive as the runtime memory disk (will change the working directory to the Drive)
GDRIVE = False #@param {type:"boolean"}

#@markdown ---

In [2]:
#@title DagsHub Configurations 🐶

#@markdown Enter the DagsHub repository owner name:
DAGSHUB_REPO_OWNER= "Nwoke" #@param {type:"string"} 

#@markdown Enter the DagsHub repository name:
DAGSHUB_REPO_NAME= "data_model_experiment-tracking" #@param {type:"string"}

#@markdown Enter the username of your DagsHub account:
DAGSHUB_USER_NAME = "Nwoke" #@param {type:"string"}

#@markdown Enter the email for your DagsHub account:
DAGSHUB_EMAIL = "tochukwunwoke1@gmail.com" #@param {type:"string"}

#@markdown Enter the branch name:
BRANCH= "main" #@param {type:"string"}

# Additional information 💡

**DagsHub**

In [6]:
import getpass
# getpass reads the value without echoing it, so the credential is not shown while typing.
DAGSHUB_TOKEN = getpass.getpass('Please enter your DAGsHub token or password: ')

**GitHub**

In [2]:
# GitHub details are only requested when the repository is mirrored from GitHub (MIRROR flag).
if MIRROR:
  # Set GitHub variables
  GITHUB_REPO_OWNER = input("What is the repository owner username?")
  GITHUB_USER_NAME = input("What is your GitHub username?")
  GITHUB_REPO_NAME = input("What is your GitHub repository name?")
  GITHUB_EMAIL = input("What is the email for your GitHub account:")
  # Relies on the `getpass` import from the DagsHub token cell; read without echoing.
  GITHUB_TOKEN = getpass.getpass('Please enter your GitHub token or password: ')

**MLflow**

In [1]:
# Ask for the MLflow experiment name only when MLflow tracking is enabled.
if MLFLOW:
  # Strip the answer so an empty or whitespace-only reply falls back to "default"
  # instead of creating an experiment whose name is blank.
  MLFLOW_EXPERIMENT_NAME = input("Please enter the MLFlow experiment name or skip to use 'default'").strip() or "default"
  print("MLFlow experiment name: ",MLFLOW_EXPERIMENT_NAME)

# Helper Functions 🚁

In [None]:
# Imports
import requests
import datetime
import os
from pathlib import Path

In [None]:
# Constants and Flags
# Records whether Google Drive has been mounted in this runtime; mount_gdrive() sets it to True.
# (A `global` statement is a no-op at module level, so the name is simply assigned here.)
MOUNT_GDRIVE = False

In [None]:
def mount_gdrive():
    """Mount Google Drive at ``/content/drive`` and flag it as mounted.

    Sets the module-level ``MOUNT_GDRIVE`` flag to ``True`` once the mount
    call has returned.
    """
    global MOUNT_GDRIVE

    # Imported inside the function so the module is only needed when a mount is requested.
    from google.colab import drive as gdrive

    gdrive.mount('/content/drive')
    MOUNT_GDRIVE = True

In [None]:
def add_prefix_colab_path(base_path):
  """Return ``base_path`` placed under the mounted Google Drive root.

  Args:
    base_path: Path (string or path-like) relative to ``/content/drive/MyDrive``.
      Leading slashes are ignored so the result always stays inside the Drive.

  Returns:
    The joined path as a string.
  """
  # os.path.join drops every earlier component when a later one is absolute,
  # so "/my/cache" would otherwise escape the Drive prefix entirely.
  relative_path = str(base_path).lstrip("/")
  return os.path.join("/content/drive/MyDrive", relative_path)

In [None]:
  def link_gdrive_as_cache(cache_path):
    """Symlink ``cache_path`` to ``.dvc/cache`` in the current working directory.

    Args:
      cache_path: Directory that should back the DVC cache.

    Returns:
      The ``(stdout, stderr)`` tuple from the ``ln`` process, both as bytes.
      ``stderr`` is empty when the link was created without complaints.
    """
    import subprocess

    # Pass the arguments as a list rather than splitting a command string, so a
    # path containing spaces stays a single argument. stderr is piped as well;
    # otherwise the second element of the returned tuple is always None and
    # callers can never report a failure.
    process = subprocess.Popen(
        ["ln", "-s", str(cache_path), ".dvc/cache"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    return process.communicate()

In [None]:
def git_push():
  """Push files to the remote Git server on DagsHub or GitHub.

  Pushes to the GitHub repository when MIRROR is set, otherwise to the DagsHub
  repository. The user name and token are embedded in the push URL, so the
  token is part of the shell command that gets executed.
  """
  if MIRROR:
    # Mirrored setup: push to the GitHub repository.
    !git push https://{GITHUB_USER_NAME}:{GITHUB_TOKEN}@github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}.git 
  else:
    # No mirror: push directly to the DagsHub Git remote.
    !git push https://{DAGSHUB_USER_NAME}:{DAGSHUB_TOKEN}@dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.git 

# Black Magic 🪄 

**Mount Google Drive**

In [None]:
# When Google Drive is used as the runtime disk, mount it (once) and work from the Drive root.
if GDRIVE:
  # MOUNT_GDRIVE is set by mount_gdrive(), so a second mount is skipped.
  if not MOUNT_GDRIVE:
    mount_gdrive()
  %cd /content/drive/MyDrive

**Configure Git**

In [None]:
if MIRROR:
  !git config --global user.email {GITHUB_EMAIL}
  !git config --global user.name {GITHUB_USER_NAME}
else:
  !git config --global user.email {DAGSHUB_EMAIL}
  !git config --global user.name {DAGSHUB_USER_NAME}

**Clone the Repository**

In [None]:
# Clone the configured branch and change into the repository directory.
# The user name and token are embedded in the clone URL.
if CLONE:
  if MIRROR:
    # Mirrored setup: clone from GitHub.
    !git clone -b {BRANCH} https://{GITHUB_USER_NAME}:{GITHUB_TOKEN}@github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}.git
    %cd {GITHUB_REPO_NAME}
  else:
    # No mirror: clone from DagsHub.
    !git clone -b {BRANCH} https://{DAGSHUB_USER_NAME}:{DAGSHUB_TOKEN}@dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.git
    %cd {DAGSHUB_REPO_NAME}
# Runs independently of CLONE: when CLONE is False the pull happens in the current directory.
if PULL_GIT:
  !git pull

**Install Requirements**

In [None]:
from pathlib import Path

!pip install --upgrade pip --quiet

req_path = Path("requirements.txt")
if req_path.is_file():
  !pip install -r requirements.txt --quiet

**Configure DVC**

In [None]:
dvc_installed = !pip list -v | grep dvc
if not dvc_installed:
  print("Installing DVC")
  !pip install dvc>=2.8.1 --quiet

# Import DVC package (relevant only when working in a Colab environment)
import dvc

if INIT_DVC:
  # initialize DVC
  !dvc init

if SET_DVC_REMOTE:
  # Set DVC remote storage as 'DAGsHub storage'
  !dvc remote add origin https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.dvc

if SET_DVC_USER:
  # General DVC user configuration
  !dvc remote modify --local origin auth basic
  !dvc remote modify --local origin user {DAGSHUB_USER_NAME}
  !dvc remote modify --local origin password {DAGSHUB_TOKEN}

if PULL_DVC:
  !dvc pull -r origin >& dev_null

  # Make sure that all files were pulled
  !dvc pull -r origin -vv

**Link cache directory to Google Drive**

In [None]:
# Optionally back the DVC cache with a directory on Google Drive.
if GDRIVE_CACHE:
  # Drive must be mounted before a path under it can be created.
  if not MOUNT_GDRIVE:
    mount_gdrive()
  
  cache_path = input("Please enter the path where you want to store the cache. "
                    "The path doesn't have to exist at the moment")
  
  # Re-root the answer under /content/drive/MyDrive and create it if missing.
  cache_path = add_prefix_colab_path(cache_path)
  Path(cache_path).mkdir(parents=True, exist_ok=True)

  # Symlink the Drive directory to .dvc/cache in the current working directory.
  output, error = link_gdrive_as_cache(cache_path)

  if error:
    print('WARNING:',error)

**Configure MLflow**

In [None]:
# Point MLflow at the DagsHub tracking server of the configured repository.
if MLFLOW:
  
  # Install MLflow only when `pip list` does not already report it.
  mlflow_installed = !pip list -v | grep mlflow
  if not mlflow_installed:
    print("Installing MLflow")
    !pip install mlflow --quiet

  import mlflow

  # Credentials for the tracking server are passed through environment variables.
  os.environ['MLFLOW_TRACKING_USERNAME'] = DAGSHUB_USER_NAME
  os.environ['MLFLOW_TRACKING_PASSWORD'] = DAGSHUB_TOKEN

  # The tracking URI is set both in the environment and on the mlflow module in this kernel.
  # mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.mlflow')
  os.environ['MLFLOW_TRACKING_URI'] = f'https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.mlflow'
  mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
  # Experiment name collected earlier in the "MLFlow" prompt cell.
  os.environ['MLFLOW_EXPERIMENT_NAME'] = MLFLOW_EXPERIMENT_NAME

# DagYard 🦮 🐕 🦮 🐕

From this point, you can treat this Colab notebook as a remote machine, with strong computational power, that holds all of your project components. You can edit the code, modify the data, and train the model - all as part of the project.

Once you reach a valuable result, you can version the project components using Git and DVC (see code below) and push the changes to your DagsHub remotes.

In [None]:
!pwd
%cd /content/data-experiment_tracking

In [None]:
!pip install wget

In [None]:
# The tracking URI does not have to be set inline here: the "Configure MLflow" cell
# stores MLFLOW_TRACKING_URI in os.environ when MLFLOW is enabled.
# !MLFLOW_TRACKING_URI="https://dagshub.com/Nwoke/data_model_experiment-tracking.mlflow"
# Reproduce the DVC pipeline.
!dvc repro

**Push the files to the remotes** 🏁

In [None]:
# NOTE(review): staging is commented out, so the commit below only has content if changes
# were staged elsewhere — confirm whether `git add dvc.lock` should be re-enabled.
# !git add dvc.lock

# `git push` only runs when the commit succeeds (&&).
!git commit -m "Done training" && git push

# Upload the DVC-tracked files to the `origin` DVC remote.
!dvc push -r origin