# Deploying Iris-detection model using Vertex AI


### Install Vertex AI SDK for Python and other required packages



In [3]:
# Vertex SDK for Python
! pip3 install --upgrade --force-reinstall -r requirements.txt

Collecting google-cloud-aiplatform==1.126.0 (from -r requirements.txt (line 1))
  Using cached google_cloud_aiplatform-1.126.0-py2.py3-none-any.whl.metadata (45 kB)
Collecting pyarrow==21.0.0 (from -r requirements.txt (line 2))
  Using cached pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting mlflow==3.5.1 (from -r requirements.txt (line 3))
  Using cached mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting pandas==2.3.3 (from -r requirements.txt (line 4))
  Using cached pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy==2.1.3 (from -r requirements.txt (line 5))
  Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting scikit-learn==1.7.2 (from -r requirements.txt (line 6))
  Using cached scikit_learn-1.7.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting joblib==1.5.2 (from -r requirements.txt (line 

### Set GCS Information

In [4]:
BUCKET_URI = f"gs://iitmbs-mlops-21f1000344"

### Import the required libraries

In [5]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta
from feast import FeatureStore, Entity, FeatureView, Field, FileSource
from feast.types import Float64, Int64
from sklearn.preprocessing import StandardScaler

### Setup Git Repository

In [114]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /home/jupyter/.git/


In [115]:
!git config --global user.email "chandrakarsatvik@gmail.com"

In [116]:
!git config --global user.name "Satvik Chandrakar"

In [117]:
!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.dvc/[m
	[31m.dvcignore[m
	[31m.gitconfig[m
	[31m.github/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31martifacts/[m
	[31mdata/[m
	[31mfeature_repo/[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657ac.json[m
	[31minference.ipynb[m
	[31mraw_data/[m
	[31mrequirements.txt[m
	[31msrc/[m
	[31mtests/[m

nothing added to commit but untracked files present (use "git add" to track)


### GitHub Actions for CI

#### Required Secrets Keys
- GCP_KEY_JSON
- GCP_KEY_BASE64
- MLFLOW_TRACKING_URI
- MLFLOW_EXPERIMENT_NAME

In [10]:
# Convert key to Base64
!base64 -w 0 iitmbs-mlops-a99d6ce657ac.json > gcp-key.b64.txt

In [78]:
%%bash
mkdir .github/
mkdir .github/workflows/
touch .github/workflows/ci-dev.yml
cat > .github/workflows/ci-dev.yml <<'EOF'
name: CI - Dev Branch

on:
  push:
    branches: [dev]
  pull_request:
    branches: [dev, main]

permissions:
  contents: write
  pull-requests: write

jobs:
  dev-ci:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Configure DVC Remote
        env:
          GOOGLE_APPLICATION_CREDENTIALS: ${{ secrets.GCP_KEY_JSON }}
        run: |
          echo "${GOOGLE_APPLICATION_CREDENTIALS}" > gcp-key.json
          dvc remote modify myremote credentialpath gcp-key.json
        
      - name: Setup GCP credentials
        run: |
          echo "${{ secrets.GCP_KEY_BASE64 }}" | base64 --decode > gcp-key.json
          echo "GCP key file written to gcp-key.json"
        env:
          GOOGLE_APPLICATION_CREDENTIALS: gcp-key.json

      - name: Pull data from DVC
        run: dvc pull -r myremote

      - name: Fetch best model from MLflow
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
          MLFLOW_EXPERIMENT_NAME: ${{ secrets.MLFLOW_EXPERIMENT_NAME }}
        run: |
          echo "Fetching best model from MLflow experiment..."
          python <<'PYCODE'
          import mlflow
          from mlflow.tracking import MlflowClient
          import os, shutil

          client = MlflowClient()
          experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME")
          experiment = client.get_experiment_by_name(experiment_name)
          if not experiment:
              raise SystemExit(f"Experiment '{experiment_name}' not found in MLflow.")
          
          experiment_id = experiment.experiment_id
          print(f"Searching best model from experiment: {experiment_name} (ID: {experiment_id})")

          results = mlflow.search_logged_models(
              experiment_ids=[experiment_id],
              order_by=[{"field_name": "metrics.accuracy", "ascending": False}],
              max_results=1,
              output_format="list"
          )

          if not results:
              raise SystemExit("No logged models found in this experiment.")

          best_model = results[0]
          print(f"Best model ID: {best_model.model_id}")
          print(f"Accuracy: {best_model.metrics[0].value}")

          model_uri = f"models:/{best_model.model_id}"
          output_dir = "fetched_model"
          if os.path.exists(output_dir):
              shutil.rmtree(output_dir)

          os.makedirs(output_dir, exist_ok=True)
          mlflow.artifacts.download_artifacts(artifact_uri=model_uri, dst_path=output_dir)
          print(f"Saved best model locally at '{output_dir}/'")
          PYCODE

      - name: Model sanity check & accuracy evaluation
        run: |
          python <<'PYCODE'
          import mlflow.pyfunc
          import pandas as pd
          from sklearn.model_selection import train_test_split
          from sklearn.metrics import accuracy_score, classification_report

          model = mlflow.pyfunc.load_model("fetched_model")
          df = pd.read_parquet("data/stock_data.parquet")

          X = df[['open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14']]
          y = df["target"]
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

          y_pred = model.predict(X_test)
          acc = accuracy_score(y_test, y_pred)
          report = classification_report(y_test, y_pred, output_dict=False)

          print(f"Test Accuracy: {acc:.4f}")
          print(report)

          with open("accuracy_report.md", "w") as f:
              f.write("## Model Evaluation Report\n\n")
              f.write(f"**Test Accuracy:** {acc:.4f}\n\n")
              f.write("```text\n")
              f.write("### Classification Report\n\n")
              f.write(report)
              f.write("\n```\n\n")
          PYCODE

      - name: Run unit tests and generate Markdown report
        run: |
          pytest --maxfail=1 --disable-warnings --tb=short -q --junitxml=report.xml > pytest_output.txt

          echo "## Dev Branch Pytest Summary Report" > dev_report.md
          echo "" >> dev_report.md
          echo "**Date:** $(date)" >> dev_report.md
          echo "" >> dev_report.md
          echo "### Test Results:" >> dev_report.md
          echo '```' >> dev_report.md
          cat pytest_output.txt >> dev_report.md
          echo '```' >> dev_report.md
          echo "" >> dev_report.md
          pytest --maxfail=1 --disable-warnings --tb=short -q --cov=. --cov-report=term-missing >> pytest_output.txt 2>&1 || true

      - name: Sample Prediction (Feast Online Store)
        env:
          GOOGLE_APPLICATION_CREDENTIALS: gcp-key.json
        run: |
          python <<'PYCODE'
          from feast import FeatureStore
          import pandas as pd
          import mlflow.pyfunc

          # Load Feature Store
          store = FeatureStore(repo_path="feature_repo")

          # Detect data version
          df = pd.read_parquet("data/stock_data.parquet")
          if "stock_v1_id" in df.columns:
              version = "v1"
              id_col = "stock_v1_id"
          elif "stock_v2_id" in df.columns:
              version = "v2"
              id_col = "stock_v2_id"
          else:
              raise ValueError("No version column found (expected stock_v1_id or stock_v2_id).")

          print(f"✅ Detected feature view version: {version}")

          # Load model
          model = mlflow.pyfunc.load_model("fetched_model")

          sample_rows = []
          sample_ids = df[id_col].unique()[:5]
          for entity_id in sample_ids:
              entity_df = pd.DataFrame({id_col: [entity_id]})

              print("\n===============================")
              print("Requesting online features for entities:", entity_df.to_dict(orient='records'))

              feature_vector = store.get_online_features(
                  features=[
                      f"stock_features_{version}:open",
                      f"stock_features_{version}:high",
                      f"stock_features_{version}:low",
                      f"stock_features_{version}:close",
                      f"stock_features_{version}:volume",
                      f"stock_features_{version}:ma_15_min",
                      f"stock_features_{version}:ma_60_min",
                      f"stock_features_{version}:rsi_14",
                  ],
                  entity_rows=entity_df.to_dict(orient="records"),
              ).to_df()

              print("Online features (raw):")
              print(feature_vector)

              X = feature_vector.drop(columns=[id_col])
              pred = model.predict(X)[0]

              # Find true label from df (if available)
              true_label = df.loc[df[id_col] == entity_id, "target"].values[0] if entity_id in df[id_col].values else "N/A"

              print(f"Predicted target: {pred}, True target: {true_label}")
              sample_rows.append({"Entity": entity_id, "True": true_label, "Predicted": pred})

          # Append to Markdown report
          if sample_rows:
              sample_df = pd.DataFrame(sample_rows)
              with open("accuracy_report.md", "a") as f:
                  f.write("### Sample Predictions (Feast Online Store)\n\n")
                  f.write(sample_df.to_markdown(index=False))
                  f.write("\n\n")
          PYCODE

      - name: Set up CML
        uses: iterative/setup-cml@v2
        with:
          version: latest
          vega: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Comment CML Report on commit (push)
        if: github.event_name == 'push'
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cat accuracy_report.md >> dev_report.md
          cml comment create --target=commit --publish dev_report.md

      - name: Comment CML Report on PR (pull request)
        if: github.event_name == 'pull_request'
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cat accuracy_report.md >> dev_report.md
          cml comment create --target=pr --publish dev_report.md
EOF

mkdir: cannot create directory ‘.github/’: File exists
mkdir: cannot create directory ‘.github/workflows/’: File exists


In [79]:
%%bash
touch .github/workflows/ci-main.yml
cat > .github/workflows/ci-main.yml <<'EOF'
name: CI - Main Branch

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

permissions:
  contents: write
  pull-requests: write

jobs:
  main-ci:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Configure DVC Remote
        env:
          GOOGLE_APPLICATION_CREDENTIALS: ${{ secrets.GCP_KEY_JSON }}
        run: |
          echo "${GOOGLE_APPLICATION_CREDENTIALS}" > gcp-key.json
          dvc remote modify myremote credentialpath gcp-key.json
        
      - name: Setup GCP credentials
        run: |
          echo "${{ secrets.GCP_KEY_BASE64 }}" | base64 --decode > gcp-key.json
          echo "GCP key file written to gcp-key.json"
        env:
          GOOGLE_APPLICATION_CREDENTIALS: gcp-key.json

      - name: Pull data from DVC
        run: dvc pull -r myremote

      - name: Fetch best model from MLflow
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
          MLFLOW_EXPERIMENT_NAME: ${{ secrets.MLFLOW_EXPERIMENT_NAME }}
        run: |
          echo "Fetching best model from MLflow experiment..."
          python <<'PYCODE'
          import mlflow
          from mlflow.tracking import MlflowClient
          import os, shutil

          client = MlflowClient()
          experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME")
          experiment = client.get_experiment_by_name(experiment_name)
          if not experiment:
              raise SystemExit(f"Experiment '{experiment_name}' not found in MLflow.")
          
          experiment_id = experiment.experiment_id
          print(f"Searching best model from experiment: {experiment_name} (ID: {experiment_id})")

          results = mlflow.search_logged_models(
              experiment_ids=[experiment_id],
              order_by=[{"field_name": "metrics.accuracy", "ascending": False}],
              max_results=1,
              output_format="list"
          )

          if not results:
              raise SystemExit("No logged models found in this experiment.")

          best_model = results[0]
          print(f"Best model ID: {best_model.model_id}")
          print(f"Accuracy: {best_model.metrics[0].value}")

          model_uri = f"models:/{best_model.model_id}"
          output_dir = "fetched_model"
          if os.path.exists(output_dir):
              shutil.rmtree(output_dir)

          os.makedirs(output_dir, exist_ok=True)
          mlflow.artifacts.download_artifacts(artifact_uri=model_uri, dst_path=output_dir)
          print(f"Saved best model locally at '{output_dir}/'")
          PYCODE

      - name: Model sanity check & accuracy evaluation
        run: |
          python <<'PYCODE'
          import mlflow.pyfunc
          import pandas as pd
          from sklearn.model_selection import train_test_split
          from sklearn.metrics import accuracy_score, classification_report

          model = mlflow.pyfunc.load_model("fetched_model")
          df = pd.read_parquet("data/stock_data.parquet")

          X = df[['open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14']]
          y = df["target"]
          X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

          y_pred = model.predict(X_test)
          acc = accuracy_score(y_test, y_pred)
          report = classification_report(y_test, y_pred, output_dict=False)

          print(f"Test Accuracy: {acc:.4f}")
          print(report)

          with open("accuracy_report.md", "w") as f:
              f.write("## Model Evaluation Report\n\n")
              f.write(f"**Test Accuracy:** {acc:.4f}\n\n")
              f.write("```text\n")
              f.write("### Classification Report\n\n")
              f.write(report)
              f.write("\n```\n\n")
          PYCODE

      - name: Run unit tests and generate Markdown report
        run: |
          pytest --maxfail=1 --disable-warnings --tb=short -q --junitxml=report.xml > pytest_output.txt

          echo "## Dev Branch Pytest Summary Report" > dev_report.md
          echo "" >> dev_report.md
          echo "**Date:** $(date)" >> dev_report.md
          echo "" >> dev_report.md
          echo "### Test Results:" >> dev_report.md
          echo '```' >> dev_report.md
          cat pytest_output.txt >> dev_report.md
          echo '```' >> dev_report.md
          echo "" >> dev_report.md
          pytest --maxfail=1 --disable-warnings --tb=short -q --cov=. --cov-report=term-missing >> pytest_output.txt 2>&1 || true

      - name: Sample Prediction (Feast Online Store)
        env:
          GOOGLE_APPLICATION_CREDENTIALS: gcp-key.json
        run: |
          python <<'PYCODE'
          from feast import FeatureStore
          import pandas as pd
          import mlflow.pyfunc

          # Load Feature Store
          store = FeatureStore(repo_path="feature_repo")

          # Detect data version
          df = pd.read_parquet("data/stock_data.parquet")
          if "stock_v1_id" in df.columns:
              version = "v1"
              id_col = "stock_v1_id"
          elif "stock_v2_id" in df.columns:
              version = "v2"
              id_col = "stock_v2_id"
          else:
              raise ValueError("No version column found (expected stock_v1_id or stock_v2_id).")

          print(f"✅ Detected feature view version: {version}")

          # Load model
          model = mlflow.pyfunc.load_model("fetched_model")

          sample_rows = []
          sample_ids = df[id_col].unique()[:5]
          for entity_id in sample_ids:
              entity_df = pd.DataFrame({id_col: [entity_id]})

              print("\n===============================")
              print("Requesting online features for entities:", entity_df.to_dict(orient='records'))

              feature_vector = store.get_online_features(
                  features=[
                      f"stock_features_{version}:open",
                      f"stock_features_{version}:high",
                      f"stock_features_{version}:low",
                      f"stock_features_{version}:close",
                      f"stock_features_{version}:volume",
                      f"stock_features_{version}:ma_15_min",
                      f"stock_features_{version}:ma_60_min",
                      f"stock_features_{version}:rsi_14",
                  ],
                  entity_rows=entity_df.to_dict(orient="records"),
              ).to_df()

              print("Online features (raw):")
              print(feature_vector)

              X = feature_vector.drop(columns=[id_col])
              pred = model.predict(X)[0]

              # Find true label from df (if available)
              true_label = df.loc[df[id_col] == entity_id, "target"].values[0] if entity_id in df[id_col].values else "N/A"

              print(f"Predicted target: {pred}, True target: {true_label}")
              sample_rows.append({"Entity": entity_id, "True": true_label, "Predicted": pred})

          # Append to Markdown report
          if sample_rows:
              sample_df = pd.DataFrame(sample_rows)
              with open("accuracy_report.md", "a") as f:
                  f.write("### Sample Predictions (Feast Online Store)\n\n")
                  f.write(sample_df.to_markdown(index=False))
                  f.write("\n\n")
          PYCODE

      - name: Set up CML
        uses: iterative/setup-cml@v2
        with:
          version: latest
          vega: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Comment CML Report on commit (push)
        if: github.event_name == 'push'
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cat accuracy_report.md >> dev_report.md
          cml comment create --target=commit --publish dev_report.md

      - name: Comment CML Report on PR (pull request)
        if: github.event_name == 'pull_request'
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cat accuracy_report.md >> dev_report.md
          cml comment create --target=pr --publish dev_report.md
EOF

### Configure DVC

In [55]:
! dvc init

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

### Configure GCS as Remote Storage

In [56]:
!dvc remote add -d myremote {BUCKET_URI}

Setting 'myremote' as a default remote.
[0m

In [57]:
!dvc remote modify myremote credentialpath iitmbs-mlops-a99d6ce657ac.json

[0m

### Data Preprocessing

In [58]:
! python src/process_data_v1.py

Processing: ADANIENT__EQ__NSE__NSE__MINUTE.csv
Processing: ABFRL__EQ__NSE__NSE__MINUTE.csv

✅ Full dataset saved to: data/stock_data_full.parquet
Total rows: 730865
✅ Sampled dataset (1000 rows) saved to: data/stock_data.parquet


In [59]:
local_parquet_data = "data/stock_data.parquet"
df = pd.read_parquet(local_parquet_data)
df.shape

(1000, 11)

In [60]:
df

Unnamed: 0,event_timestamp,stock_v1_id,open,high,low,close,volume,ma_15_min,ma_60_min,rsi_14,target
0,2017-04-06 14:22:00+05:30,ABFRL-v1-24305,159.50,159.55,159.40,159.40,402.0,159.390000,158.860000,58.333333,0
1,2020-04-24 14:10:00+05:30,ABFRL-v1-305149,120.15,120.50,120.00,120.25,2487.0,120.566667,120.926667,29.166667,0
2,2017-05-22 09:40:00+05:30,ADANIENT-v1-35275,121.65,121.65,121.45,121.50,9190.0,121.956667,121.984167,36.842105,0
3,2019-12-16 13:00:00+05:30,ABFRL-v1-272079,229.35,229.35,229.35,229.35,333.0,229.333333,229.262500,55.555556,0
4,2017-05-05 11:33:00+05:30,ABFRL-v1-31261,167.30,167.30,167.20,167.20,100.0,167.520000,168.154167,36.734694,0
...,...,...,...,...,...,...,...,...,...,...,...
995,2018-11-15 11:20:00+05:30,ADANIENT-v1-172867,165.00,165.20,164.95,165.05,4058.0,164.976667,164.920000,52.173913,0
996,2017-12-05 14:53:00+05:30,ABFRL-v1-86075,163.05,163.40,163.05,163.25,270.0,163.143333,163.290000,58.620690,0
997,2018-08-24 12:53:00+05:30,ABFRL-v1-153077,192.30,192.75,192.30,192.75,984.0,192.543333,191.920000,60.256410,0
998,2019-10-31 11:04:00+05:30,ADANIENT-v1-260286,199.15,199.15,198.80,198.85,13169.0,199.373333,198.677500,27.659574,0


### Track Data with DVC

In [61]:
! dvc add {local_parquet_data}

 [?25l[32m⠋[0m Checking graph
Adding...                                                                       
![A
Collecting files and computing hashes in data/stock_data.parquet |0.00 [00:00,  [A
                                                                                [A
![A
  0% Checking cache in '/home/jupyter/.dvc/cache/files/md5'| |0/? [00:00<?,    ?[A
                                                                                [A
![A
  0%|          |Adding data/stock_data.parquet to cach0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/jupyter/data/stock_0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 11.03file/s][A

To track the changes with git, run:

	git add data/stock_data.parquet.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [62]:
! dvc push

Collecting                                            |1.00 [00:00,  185entry/s]
Pushing
![A
  0% Checking cache in 'iitmbs-mlops-21f1000344/files/md5'| |0/? [00:00<?,    ?f[A
                                                                                [A
![A
  0% Checking cache in '/home/jupyter/.dvc/cache/files/md5'| |0/? [00:00<?,    ?[A
                                                                                [A
![A
  0%|          |Pushing to gs                         0/1 [00:00<?,     ?file/s][A

![A[A

  0%|          |/home/jupyter/.dvc/cache/files/0.00/67.6k [00:00<?,        ?B/s][A[A

100%|██████████|/home/jupyter/.dvc/cache/f67.6k/67.6k [00:00<00:00,     554kB/s][A[A

                                                                                [A[A
100%|██████████|Pushing to gs                     1/1 [00:00<00:00,  5.48file/s][A
Pushing                                                                         [A
1 file pushed
[0m

In [110]:
# --------------------------
# Define Feast Feature Store config
# --------------------------
entity = Entity(name="stock_v1_id", join_keys=["stock_v1_id"])

stock_source = FileSource(
    path=os.path.abspath(local_parquet_data), # offline store data location
    timestamp_field="event_timestamp",
)

stock_fv = FeatureView(
    name="stock_features_v1",
    entities=[entity],
    ttl=timedelta(days=3650),
    schema=[
        Field(name="open", dtype=Float64),
        Field(name="high", dtype=Float64),
        Field(name="low", dtype=Float64),
        Field(name="close", dtype=Float64),
        Field(name="volume", dtype=Float64),
        Field(name="ma_15_min", dtype=Float64),
        Field(name="ma_60_min", dtype=Float64),
        Field(name="rsi_14", dtype=Float64),
        Field(name="target", dtype=Int64),
    ],
    online=True,
    source=stock_source,
    tags={
        "stock_data_version": "v1",
        "description": "Features derived from v1 of stock dataset",
    },
)

  entity = Entity(name="stock_v1_id", join_keys=["stock_v1_id"])


In [111]:
# --------------------------
# Initialize Feast Store and apply definitions
# --------------------------
store = FeatureStore(repo_path="feature_repo")
store.apply([entity, stock_fv])

In [112]:
# Materialize stock_features_v1 into online store
end_time = df['event_timestamp'].max()
print("Materializing features...")
store.materialize_incremental(
    end_date=end_time,
    feature_views=["stock_features_v1"]
)

Materializing features...
Materializing [1m[32m1[0m feature views to [1m[32m2021-01-01 10:52:00+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mstock_features_v1[0m from [1m[32m2015-11-10 12:41:30+00:00[0m to [1m[32m2021-01-01 10:52:00+05:30[0m:


### Setup MLFlow Server

In [23]:
# SSH into the VM and run the following command to start MLFlow server
# pip install mlflow
# mlflow server --host 0.0.0.0 --port 8100 --allowed-hosts '*'  --cors-allowed-origins '*'

## Simple RandomForestClassifier model
Build a RandomForestClassifier model on iris data and log parameters, metrics, model, and artifacts to MLflow.

In [66]:
! python src/train.py --n_estimators 10 --max_depth 3 --random_state 42 --version "v1" --stratify YES
! gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=10, max_depth=3, version=v1

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v1_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.38 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v1
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp       stock_v1_id  ...   rsi_14__  target__
0 2017-01-02 04:47:00+00:00       ABFRL-v1-62  ...  54.545455         0
1 2017-01-04 06:43:00+00:00      ABFRL-v1-928  ...  41.666667         0
2 2017-01-05 08:31:00+00:00     ABFRL-v1-1411  ...  41.176471         0
3 2017-01-06 06:17:00+00:00  ADANIENT-v1-165

In [67]:
! python src/train.py --n_estimators 20 --max_depth 3 --random_state 7 --version "v1" --stratify YES
!gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=20, max_depth=3, version=v1

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v1_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.38 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v1
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp       stock_v1_id  ...   rsi_14__  target__
0 2017-01-02 04:47:00+00:00       ABFRL-v1-62  ...  54.545455         0
1 2017-01-04 06:43:00+00:00      ABFRL-v1-928  ...  41.666667         0
2 2017-01-05 08:31:00+00:00     ABFRL-v1-1411  ...  41.176471         0
3 2017-01-06 06:17:00+00:00  ADANIENT-v1-165

In [68]:
! python src/train.py --n_estimators 100 --max_depth 5 --random_state 7 --version "v1" --stratify NO
!gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=100, max_depth=5, version=v1

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v1_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.38 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v1
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp       stock_v1_id  ...   rsi_14__  target__
0 2017-01-02 04:47:00+00:00       ABFRL-v1-62  ...  54.545455         0
1 2017-01-04 06:43:00+00:00      ABFRL-v1-928  ...  41.666667         0
2 2017-01-05 08:31:00+00:00     ABFRL-v1-1411  ...  41.176471         0
3 2017-01-06 06:17:00+00:00  ADANIENT-v1-16

In [69]:
! python src/train.py --n_estimators 250 --max_depth 3 --random_state 7 --version "v1" --stratify NO
!gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=250, max_depth=3, version=v1

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v1_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.38 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v1
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp       stock_v1_id  ...   rsi_14__  target__
0 2017-01-02 04:47:00+00:00       ABFRL-v1-62  ...  54.545455         0
1 2017-01-04 06:43:00+00:00      ABFRL-v1-928  ...  41.666667         0
2 2017-01-05 08:31:00+00:00     ABFRL-v1-1411  ...  41.176471         0
3 2017-01-06 06:17:00+00:00  ADANIENT-v1-16

### Local Tests

In [70]:
! pytest tests/test_data_validation.py -v

platform linux -- Python 3.10.18, pytest-8.4.2, pluggy-1.6.0 -- /opt/conda/bin/python3
cachedir: .pytest_cache
rootdir: /home/jupyter
plugins: anyio-4.11.0, typeguard-4.4.4, hydra-core-1.3.2
collected 1 item                                                               [0m[1m

tests/test_data_validation.py::test_parquet_data_integrity [32mPASSED[0m[32m        [100%][0m



In [71]:
! pytest tests/test_model_evaluation.py -v

platform linux -- Python 3.10.18, pytest-8.4.2, pluggy-1.6.0 -- /opt/conda/bin/python3
cachedir: .pytest_cache
rootdir: /home/jupyter
plugins: anyio-4.11.0, typeguard-4.4.4, hydra-core-1.3.2
collected 1 item                                                               [0m[1m

tests/test_model_evaluation.py::test_model_performance_parquet [32mPASSED[0m[32m    [100%][0m



### Add to Git and Commit

In [118]:
!git status

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.dvc/[m
	[31m.dvcignore[m
	[31m.gitconfig[m
	[31m.github/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31martifacts/[m
	[31mdata/[m
	[31mfeature_repo/[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657ac.json[m
	[31minference.ipynb[m
	[31mraw_data/[m
	[31mrequirements.txt[m
	[31msrc/[m
	[31mtests/[m

nothing added to commit but untracked files present (use "git add" to track)


In [119]:
!git checkout -b dev

Switched to a new branch 'dev'


In [120]:
!git status

On branch dev

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.dvc/[m
	[31m.dvcignore[m
	[31m.gitconfig[m
	[31m.github/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31martifacts/[m
	[31mdata/[m
	[31mfeature_repo/[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657ac.json[m
	[31minference.ipynb[m
	[31mraw_data/[m
	[31mrequirements.txt[m
	[31msrc/[m
	[31mtests/[m

nothing added to commit but untracked files present (use "git add" to track)


In [121]:
! git add artifacts/ data/stock_data.parquet data/stock_data.parquet.dvc feature_repo/ src/ tests/ .dvc/ .github/ requirements.txt .gitconfig .dvcignore

The following paths are ignored by one of your .gitignore files:
data/stock_data.parquet
[33mhint: Use -f if you really want to add them.[m
[33mhint: Turn this message off by running[m
[33mhint: "git config advice.addIgnoredFile false"[m


In [122]:
!git status

On branch dev

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	[32mnew file:   .dvc/.gitignore[m
	[32mnew file:   .dvc/config[m
	[32mnew file:   .dvcignore[m
	[32mnew file:   .gitconfig[m
	[32mnew file:   .github/workflows/ci-dev.yml[m
	[32mnew file:   .github/workflows/ci-main.yml[m
	[32mnew file:   artifacts/model.joblib[m
	[32mnew file:   data/stock_data.parquet.dvc[m
	[32mnew file:   feature_repo/.ipynb_checkpoints/feature_store-checkpoint.yaml[m
	[32mnew file:   feature_repo/feature_store.yaml[m
	[32mnew file:   feature_repo/online_store.db[m
	[32mnew file:   requirements.txt[m
	[32mnew file:   src/.ipynb_checkpoints/process_data_v1-checkpoint.py[m
	[32mnew file:   src/.ipynb_checkpoints/process_data_v2-checkpoint.py[m
	[32mnew file:   src/.ipynb_checkpoints/train-checkpoint.py[m
	[32mnew file:   src/process_data_v1.py[m
	[32mnew file:   src/process_data_v2.py[m
	[32mnew file:   src/train.py[m
	[32mnew f

In [123]:
! git commit -m "Commit to both dev and main branch. First iteration done with 150 rows of iris data"

[dev (root-commit) 95e9b39] Commit to both dev and main branch. First iteration done with 150 rows of iris data
 24 files changed, 1472 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitconfig
 create mode 100644 .github/workflows/ci-dev.yml
 create mode 100644 .github/workflows/ci-main.yml
 create mode 100644 artifacts/model.joblib
 create mode 100644 data/stock_data.parquet.dvc
 create mode 100644 feature_repo/.ipynb_checkpoints/feature_store-checkpoint.yaml
 create mode 100644 feature_repo/feature_store.yaml
 create mode 100644 feature_repo/online_store.db
 create mode 100644 requirements.txt
 create mode 100644 src/.ipynb_checkpoints/process_data_v1-checkpoint.py
 create mode 100644 src/.ipynb_checkpoints/process_data_v2-checkpoint.py
 create mode 100644 src/.ipynb_checkpoints/train-checkpoint.py
 create mode 100644 src/process_data_v1.py
 create mode 100644 src/process_data_v2.py
 create mode 10

In [124]:
!git log

[33mcommit 95e9b39fc8607322790cb5f32d0a3e8fbaad8230[m[33m ([m[1;36mHEAD -> [m[1;32mdev[m[33m)[m
Author: Satvik Chandrakar <chandrakarsatvik@gmail.com>
Date:   Fri Nov 7 12:46:39 2025 +0000

    Commit to both dev and main branch. First iteration done with 150 rows of iris data


In [125]:
!git remote add origin https://Satvik-ai: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@github.com/Satvik-ai/stock_test.git

In [126]:
!git push -u origin dev

Enumerating objects: 32, done.
Counting objects: 100% (32/32), done.
Delta compression using up to 4 threads
Compressing objects: 100% (29/29), done.
Writing objects: 100% (32/32), 358.65 KiB | 5.27 MiB/s, done.
Total 32 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), done.[K
To https://github.com/Satvik-ai/stock_test.git
 * [new branch]      dev -> dev
Branch 'dev' set up to track remote branch 'dev' from 'origin'.


In [127]:
!git checkout -b main

Switched to a new branch 'main'


In [128]:
!git status

On branch main
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31mdata/.gitignore[m
	[31mdata/stock_data_full.parquet[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657ac.json[m
	[31minference.ipynb[m
	[31mraw_data/[m

nothing added to commit but untracked files present (use "git add" to track)


In [129]:
!git push -u origin main

Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
remote: 
remote: Create a pull request for 'main' on GitHub by visiting:[K
remote:      https://github.com/Satvik-ai/stock_test/pull/new/main[K
remote: 
To https://github.com/Satvik-ai/stock_test.git
 * [new branch]      main -> main
Branch 'main' set up to track remote branch 'main' from 'origin'.


### Pytest Code Changes

#### Add the below given code to test_data_validation.py, push the pytest code changes to Dev branch and raise Pull Request to main branch

In [None]:
# def test_numeric_columns_are_numeric_parquet():
#     df = pd.read_parquet("data/stock_data.parquet")
#     numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14']

#     for col in numeric_cols:
#         assert pd.api.types.is_numeric_dtype(df[col]), f"Column {col} is not numeric"

In [131]:
# Local test
! pytest tests/test_data_validation.py -v

platform linux -- Python 3.10.18, pytest-8.4.2, pluggy-1.6.0 -- /opt/conda/bin/python3
cachedir: .pytest_cache
rootdir: /home/jupyter
plugins: anyio-4.11.0, typeguard-4.4.4, hydra-core-1.3.2
collected 2 items                                                              [0m[1m

tests/test_data_validation.py::test_parquet_data_integrity [32mPASSED[0m[32m        [ 50%][0m
tests/test_data_validation.py::test_numeric_columns_are_numeric_parquet [32mPASSED[0m[32m [100%][0m



### Create Version 2 of Iris Data

In [132]:
! python src/process_data_v2.py

Processing: ADANIGAS__EQ__NSE__NSE__MINUTE.csv
Processing: ABCAPITAL__EQ__NSE__NSE__MINUTE.csv

✅ Full dataset saved to: data/stock_data_full.parquet
Total rows: 197833
✅ Sampled dataset (1000 rows) saved to: data/stock_data.parquet


In [133]:
local_parquet_data = "data/stock_data.parquet"
df = pd.read_parquet(local_parquet_data)
df.shape

(1000, 11)

In [134]:
df

Unnamed: 0,event_timestamp,stock_v2_id,open,high,low,close,volume,ma_15_min,ma_60_min,rsi_14,target
0,2019-05-14 12:01:00+05:30,ADANIGAS-v2-47447,117.35,117.55,117.35,117.55,239.0,117.520000,117.258333,53.333333,1
1,2017-10-12 09:45:00+05:30,ABCAPITAL-v2-10501,179.85,180.20,179.85,180.05,4419.0,179.550000,179.205000,79.310345,1
2,2019-07-12 15:29:00+05:30,ADANIGAS-v2-63405,163.15,163.25,163.10,163.20,3146.0,163.220000,163.623333,46.153846,1
3,2017-09-18 10:55:00+05:30,ABCAPITAL-v2-4196,203.00,203.00,202.75,202.75,3306.0,202.843333,202.285000,43.902439,1
4,2019-04-05 12:50:00+05:30,ADANIGAS-v2-38871,130.65,130.65,130.65,130.65,44.0,130.663333,130.395833,51.351351,0
...,...,...,...,...,...,...,...,...,...,...,...
995,2020-03-06 12:51:00+05:30,ADANIGAS-v2-123307,126.15,126.25,126.15,126.25,700.0,126.380000,125.474167,52.777778,0
996,2019-07-23 11:15:00+05:30,ADANIGAS-v2-65776,165.35,165.35,165.35,165.35,100.0,165.423333,165.770000,37.500000,0
997,2020-08-11 11:07:00+05:30,ADANIGAS-v2-162578,161.00,161.55,161.00,161.40,3346.0,162.030000,162.783333,26.666667,1
998,2020-03-09 11:20:00+05:30,ADANIGAS-v2-123591,122.25,122.50,122.20,122.45,964.0,122.056667,122.256667,71.428571,0


### Track Data Version 2 with DVC

In [135]:
! dvc add {local_parquet_data}

 [?25l[32m⠋[0m Checking graph
Adding...                                                                       
![A
Collecting files and computing hashes in data/stock_data.parquet |0.00 [00:00,  [A
                                                                                [A
![A
  0% Checking cache in '/home/jupyter/.dvc/cache/files/md5'| |0/? [00:00<?,    ?[A
                                                                                [A
![A
  0%|          |Adding data/stock_data.parquet to cach0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/jupyter/data/stock_0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 12.55file/s][A

To track the changes with git, run:

	git add data/stock_data.parquet.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [136]:
! dvc push

Collecting                                            |1.00 [00:00,  178entry/s]
Pushing
![A
  0% Checking cache in 'iitmbs-mlops-21f1000344/files/md5'| |0/? [00:00<?,    ?f[A
                                                                                [A
![A
  0% Checking cache in '/home/jupyter/.dvc/cache/files/md5'| |0/? [00:00<?,    ?[A
                                                                                [A
![A
  0%|          |Pushing to gs                         0/1 [00:00<?,     ?file/s][A

![A[A

  0%|          |/home/jupyter/.dvc/cache/files/0.00/66.7k [00:00<?,        ?B/s][A[A

100%|██████████|/home/jupyter/.dvc/cache/f66.7k/66.7k [00:00<00:00,     551kB/s][A[A

                                                                                [A[A
100%|██████████|Pushing to gs                     1/1 [00:00<00:00,  6.04file/s][A
Pushing                                                                         [A
1 file pushed
[0m

In [137]:
# --------------------------
# Define Feast Feature Store config
# --------------------------
entity = Entity(name="stock_v2_id", join_keys=["stock_v2_id"])

stock_source = FileSource(
    path=os.path.abspath(local_parquet_data), # offline store data location
    timestamp_field="event_timestamp",
)

stock_fv = FeatureView(
    name="stock_features_v2",
    entities=[entity],
    ttl=timedelta(days=3650),
    schema=[
        Field(name="open", dtype=Float64),
        Field(name="high", dtype=Float64),
        Field(name="low", dtype=Float64),
        Field(name="close", dtype=Float64),
        Field(name="volume", dtype=Float64),
        Field(name="ma_15_min", dtype=Float64),
        Field(name="ma_60_min", dtype=Float64),
        Field(name="rsi_14", dtype=Float64),
        Field(name="target", dtype=Int64),
    ],
    online=True,
    source=stock_source,
    tags={
        "stock_data_version": "v2",
        "description": "Features derived from v2 of stock dataset",
    },
)

  entity = Entity(name="stock_v2_id", join_keys=["stock_v2_id"])


In [138]:
store.apply([entity, stock_fv])

In [139]:
# Materialize iris_features_v2 into online store
end_time = df['event_timestamp'].max()
print("Materializing features...")
store.materialize_incremental(
    end_date=end_time,
    feature_views=["stock_features_v2"]
)

Materializing features...
Materializing [1m[32m1[0m feature views to [1m[32m2021-01-01 13:33:00+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mstock_features_v2[0m from [1m[32m2015-11-10 13:01:14+00:00[0m to [1m[32m2021-01-01 13:33:00+05:30[0m:


### Train Model with Data Version 2

In [140]:
! python src/train.py --n_estimators 10 --max_depth 3 --random_state 42 --version "v2" --stratify YES
! gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=10, max_depth=3, version=v2

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v2_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.03 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v2
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp        stock_v2_id  ...   rsi_14__  target__
0 2017-09-07 04:14:00+00:00  ABCAPITAL-v2-1500  ...  85.714286         1
1 2017-09-07 05:12:00+00:00  ABCAPITAL-v2-1558  ...  78.125000         0
2 2017-09-08 06:53:00+00:00  ABCAPITAL-v2-2034  ...  54.929577         0
3 2017-09-08 07:17:00+00:00  ABCAPITAL-v

In [141]:
! python src/train.py --n_estimators 20 --max_depth 3 --random_state 7 --version "v2" --stratify YES
! gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=20, max_depth=3, version=v2

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v2_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.03 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v2
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp        stock_v2_id  ...   rsi_14__  target__
0 2017-09-07 04:14:00+00:00  ABCAPITAL-v2-1500  ...  85.714286         1
1 2017-09-07 05:12:00+00:00  ABCAPITAL-v2-1558  ...  78.125000         0
2 2017-09-08 06:53:00+00:00  ABCAPITAL-v2-2034  ...  54.929577         0
3 2017-09-08 07:17:00+00:00  ABCAPITAL-v

In [142]:
! python src/train.py --n_estimators 100 --max_depth 5 --random_state 7 --version "v2" --stratify NO
!gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=100, max_depth=5, version=v2

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v2_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.03 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v2
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp        stock_v2_id  ...   rsi_14__  target__
0 2017-09-07 04:14:00+00:00  ABCAPITAL-v2-1500  ...  85.714286         1
1 2017-09-07 05:12:00+00:00  ABCAPITAL-v2-1558  ...  78.125000         0
2 2017-09-08 06:53:00+00:00  ABCAPITAL-v2-2034  ...  54.929577         0
3 2017-09-08 07:17:00+00:00  ABCAPITAL-

In [143]:
! python src/train.py --n_estimators 250 --max_depth 3 --random_state 7 --version "v2" --stratify NO
!gsutil cp artifacts/model.joblib {BUCKET_URI}/models/ #Upload Model Artifacts to Cloud Storage

Starting training script...
Parameters: n_estimators=250, max_depth=3, version=v2

[1/6] Setting up MLflow...
MLflow tracking URI: http://127.0.0.1:8100
✅ MLflow setup successful

[2/6] Loading local data...
Loading from: data/stock_data.parquet
✅ Data loaded. Shape: (1000, 11)
Columns: ['event_timestamp', 'stock_v2_id', 'open', 'high', 'low', 'close', 'volume', 'ma_15_min', 'ma_60_min', 'rsi_14', 'target']

[3/6] Initializing Feast Feature Store...
✅ Feast store initialized
Memory before fetch: 14.03 GB

[4/6] Fetching features from Feast...
Requesting feature view: stock_features_v2
✅ Features fetched successfully
Training data shape after dropna: (1000, 20)
            event_timestamp        stock_v2_id  ...   rsi_14__  target__
0 2017-09-07 04:14:00+00:00  ABCAPITAL-v2-1500  ...  85.714286         1
1 2017-09-07 05:12:00+00:00  ABCAPITAL-v2-1558  ...  78.125000         0
2 2017-09-08 06:53:00+00:00  ABCAPITAL-v2-2034  ...  54.929577         0
3 2017-09-08 07:17:00+00:00  ABCAPITAL-

### Add to Git, Commit and Push to Dev Branch

In [144]:
!git checkout dev

M	artifacts/model.joblib
M	data/stock_data.parquet.dvc
M	feature_repo/online_store.db
M	tests/.ipynb_checkpoints/test_data_validation-checkpoint.py
M	tests/__pycache__/test_data_validation.cpython-310-pytest-8.4.2.pyc
M	tests/test_data_validation.py
Switched to branch 'dev'
Your branch is up to date with 'origin/dev'.


In [145]:
!git status

On branch dev
Your branch is up to date with 'origin/dev'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   artifacts/model.joblib[m
	[31mmodified:   data/stock_data.parquet.dvc[m
	[31mmodified:   feature_repo/online_store.db[m
	[31mmodified:   tests/.ipynb_checkpoints/test_data_validation-checkpoint.py[m
	[31mmodified:   tests/__pycache__/test_data_validation.cpython-310-pytest-8.4.2.pyc[m
	[31mmodified:   tests/test_data_validation.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31mdata/.gitignore[m
	[31mdata/stock_data_full.parquet[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657

In [146]:
!git add data/stock_data.parquet.dvc tests/ artifacts/ feature_repo/

In [147]:
!git status

On branch dev
Your branch is up to date with 'origin/dev'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   artifacts/model.joblib[m
	[32mmodified:   data/stock_data.parquet.dvc[m
	[32mmodified:   feature_repo/online_store.db[m
	[32mmodified:   tests/.ipynb_checkpoints/test_data_validation-checkpoint.py[m
	[32mmodified:   tests/__pycache__/test_data_validation.cpython-310-pytest-8.4.2.pyc[m
	[32mmodified:   tests/test_data_validation.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.bashrc[m
	[31m.cache/[m
	[31m.config/[m
	[31m.docker/[m
	[31m.gsutil/[m
	[31m.ipynb_checkpoints/[m
	[31m.ipython/[m
	[31m.jupyter/[m
	[31m.local/[m
	[31m.npm/[m
	[31mWorkbench.ipynb[m
	[31mdata/.gitignore[m
	[31mdata/stock_data_full.parquet[m
	[31mgcp-key.b64.txt[m
	[31miitmbs-mlops-a99d6ce657ac.json[m
	[31minference.ipynb[m
	[31mraw_data/[m



In [148]:
! git commit -m "Second commit to dev branch. Pytest code changes and second iteration with 300 rows of iris data"

[dev 8ee3c51] Second commit to dev branch. Pytest code changes and second iteration with 300 rows of iris data
 6 files changed, 18 insertions(+), 4 deletions(-)
 rewrite artifacts/model.joblib (80%)


In [149]:
!git log

[33mcommit 8ee3c516e1aae4166236354bd2ebed624fb0daaa[m[33m ([m[1;36mHEAD -> [m[1;32mdev[m[33m)[m
Author: Satvik Chandrakar <chandrakarsatvik@gmail.com>
Date:   Fri Nov 7 13:06:38 2025 +0000

    Second commit to dev branch. Pytest code changes and second iteration with 300 rows of iris data

[33mcommit 95e9b39fc8607322790cb5f32d0a3e8fbaad8230[m[33m ([m[1;31morigin/main[m[33m, [m[1;31morigin/dev[m[33m, [m[1;32mmain[m[33m)[m
Author: Satvik Chandrakar <chandrakarsatvik@gmail.com>
Date:   Fri Nov 7 12:46:39 2025 +0000

    Commit to both dev and main branch. First iteration done with 150 rows of iris data


In [150]:
!git push origin dev

Enumerating objects: 25, done.
Counting objects: 100% (25/25), done.
Delta compression using up to 4 threads
Compressing objects: 100% (12/12), done.
Writing objects: 100% (13/13), 586.63 KiB | 4.85 MiB/s, done.
Total 13 (delta 6), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (6/6), completed with 6 local objects.[K
To https://github.com/Satvik-ai/stock_test.git
   95e9b39..8ee3c51  dev -> dev
