In [1]:
# Install DVC into the environment of the running kernel. `%pip` (unlike
# `!pip`) always targets the kernel's own interpreter, and the version is pinned
# to the release this notebook was executed with so re-runs stay reproducible.
%pip install -q dvc==3.60.1

Collecting dvc
  Downloading dvc-3.60.1-py3-none-any.whl.metadata (17 kB)
Collecting celery (from dvc)
  Downloading celery-5.5.3-py3-none-any.whl.metadata (22 kB)
Collecting colorama>=0.3.9 (from dvc)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting configobj>=5.0.9 (from dvc)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting dpath<3,>=2.1.0 (from dvc)
  Downloading dpath-2.2.0-py3-none-any.whl.metadata (15 kB)
Collecting dulwich (from dvc)
  Downloading dulwich-0.22.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting dvc-data<3.17,>=3.16.2 (from dvc)
  Downloading dvc_data-3.16.10-py3-none-any.whl.metadata (5.0 kB)
Collecting dvc-http>=2.29.0 (from dvc)
  Downloading dvc_http-2.32.0-py3-none-any.whl.metadata (1.3 kB)
Collecting dvc-objects (from dvc)
  Downloading dvc_objects-5.1.1-py3-none-any.whl.metadata (3.8 kB)
Collecting dvc-render<2,>=1.0.1 (from dvc)
  Downloading dvc_render-1.0.2

In [2]:
# Create a Git repository in the working directory first; DVC is then
# initialised inside it.
!git init
# -f forces (re-)initialisation, so this cell can be re-run when a .dvc/
# directory is already present.
!dvc init -f

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org

In [3]:
# Create the project layout: raw data, trained models and pipeline scripts.
# -p makes the call idempotent, so re-running the cell does not fail when the
# directories already exist.
!mkdir -p data models scripts

In [4]:
%%writefile scripts/download_data.py
import pandas as pd
from sklearn.datasets import load_iris

df = pd.DataFrame(load_iris(as_frame=True).frame)
df.to_csv("data/iris.csv", index=False)

Writing scripts/download_data.py


In [5]:
%%writefile scripts/train_model.py
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

df = pd.read_csv("data/iris.csv")
X = df.drop("target", axis=1)
y = df["target"]
X_train, _, y_train, _ = train_test_split(X, y, test_size=params["train"]["test_size"])

model = LogisticRegression(max_iter=params["train"]["max_iter"])
model.fit(X_train, y_train)

with open("models/model.pkl", "wb") as f:
    pickle.dump(model, f)

Writing scripts/train_model.py


In [6]:
%%writefile scripts/evaluate_model.py
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/iris.csv")
X = df.drop("target", axis=1)
y = df["target"]
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2)

with open("models/model.pkl", "rb") as f:
    model = pickle.load(f)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

with open("metrics.txt", "w") as f:
    f.write(f"Accuracy: {acc:.4f}\n")

Writing scripts/evaluate_model.py


In [7]:
%%writefile params.yaml
# Hyper-parameters read by scripts/train_model.py.
train:
  # Passed as test_size to train_test_split.
  test_size: 0.2
  # Passed as max_iter to LogisticRegression.
  max_iter: 200

Writing params.yaml


In [8]:
# Register the data-export stage in dvc.yaml: it depends on the script and
# produces data/iris.csv. --force overwrites an existing stage of the same
# name, so re-running this cell does not fail with "stage already exists".
!dvc stage add --force -n download_data \
  -d scripts/download_data.py \
  -o data/iris.csv \
  python scripts/download_data.py

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      Added stage 'download_data' in 'dvc.yaml'

To track the changes with git, run:

	git add dvc.yaml data/.gitignore

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [9]:
# Register the training stage. The hyper-parameters are declared with -p (read
# from params.yaml by default) instead of tracking params.yaml as an opaque
# file dependency, so DVC records the individual values in dvc.lock and re-runs
# the stage only when one of them changes. --force overwrites an existing stage
# of the same name, so re-running this cell does not fail.
!dvc stage add --force -n train_model \
  -d scripts/train_model.py -d data/iris.csv \
  -p train.test_size,train.max_iter \
  -o models/model.pkl \
  python scripts/train_model.py

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      Added stage 'train_model' in 'dvc.yaml'

To track the changes with git, run:

	git add models/.gitignore dvc.yaml

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [10]:
# Register the evaluation stage: it depends on the script, the trained model
# and the dataset, and produces metrics.txt. --force overwrites an existing
# stage of the same name, so re-running this cell does not fail.
!dvc stage add --force -n evaluate_model \
  -d scripts/evaluate_model.py -d models/model.pkl -d data/iris.csv \
  -o metrics.txt \
  python scripts/evaluate_model.py

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      Added stage 'evaluate_model' in 'dvc.yaml'

To track the changes with git, run:

	git add .gitignore dvc.yaml

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [11]:
# Show parameter differences for the workspace.
# NOTE(review): the captured output is empty -- presumably because nothing has
# been committed to Git yet, so there is no revision to compare against; confirm.
!dvc params diff

[0m

In [12]:
# First full run of the pipeline: executes download_data, train_model and
# evaluate_model in order and generates dvc.lock.
!dvc repro

Running stage 'download_data':
> python scripts/download_data.py
Generating lock file 'dvc.lock'
Updating lock file 'dvc.lock'

Running stage 'train_model':
> python scripts/train_model.py
Updating lock file 'dvc.lock'

Running stage 'evaluate_model':
> python scripts/evaluate_model.py
Updating lock file 'dvc.lock'

To track the changes with git, run:

	git add dvc.lock

To enable auto staging, run:

	dvc config core.autostage true
Use `dvc push` to send your updates to remote storage.
[0m

In [13]:
%%writefile params.yaml
# Second parameter set: overwrites params.yaml with a larger hold-out fraction
# and a higher iteration cap so the next `dvc repro` has a change to react to.
train:
  # Passed as test_size to train_test_split.
  test_size: 0.25
  # Passed as max_iter to LogisticRegression.
  max_iter: 300

Overwriting params.yaml


In [14]:
# Re-run the pipeline after editing params.yaml; stages whose dependencies did
# not change (download_data, per the output below) are skipped.
!dvc repro

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    Stage 'download_data' didn't change, skipping
!          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,     ?file/s]                                    !          |0.00 [00:00,   

In [15]:
# Collect the pipeline files under lab3/. -p keeps mkdir from failing when the
# directory already exists, and a single mv moves every artefact in one call.
# NOTE(review): the root .gitignore that `dvc stage add` created for
# metrics.txt and the .dvc/ directory stay in the repository root -- confirm
# that is intended.
!mkdir -p lab3
!mv scripts data models params.yaml dvc.yaml dvc.lock metrics.txt lab3/
