# Deploying stock model using Vertex AI


## Get started

### Install Vertex AI SDK for Python and other required packages



In [1]:

# Vertex SDK for Python
! pip3 install --upgrade --quiet  google-cloud-aiplatform

### Set Google Cloud project information
Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [2]:
PROJECT_ID = "amiable-dreamer-461212-u2"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [5]:
BUCKET_URI = f"gs://21f1000629_mlops-oppe-1"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [6]:
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://21f1000629_mlops-oppe-1/...


### Initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

In [7]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### Import the required libraries

In [8]:
import os
import sys

### Configure resource names

Set a name for the following parameters:

`MODEL_ARTIFACT_DIR` - Folder directory path to your model artifacts within a Cloud Storage bucket, for example: "my-models/fraud-detection/trial-4"

`REPOSITORY` - Name of the Artifact Repository to create or use.

`IMAGE` - Name of the container image that is pushed to the repository.

`MODEL_DISPLAY_NAME` - Display name of Vertex AI model resource.

In [9]:
MODEL_ARTIFACT_DIR = "my-models/stock-prediction-classifier"  # @param {type:"string"}
REPOSITORY = "stock-classifier-repo"  # @param {type:"string"}
IMAGE = "stock-classifier-img"  # @param {type:"string"}
MODEL_DISPLAY_NAME = "stock-classifier"  # @param {type:"string"}

# Set the defaults if no names were specified
if MODEL_ARTIFACT_DIR == "[your-artifact-directory]":
    MODEL_ARTIFACT_DIR = "custom-container-prediction-model"

if REPOSITORY == "[your-repository-name]":
    REPOSITORY = "custom-container-prediction"

if IMAGE == "[your-image-name]":
    IMAGE = "sklearn-fastapi-server"

if MODEL_DISPLAY_NAME == "[your-model-display-name]":
    MODEL_DISPLAY_NAME = "sklearn-custom-container"

## Simple clasifier model
Build a classifier model on stock data

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas.plotting import parallel_coordinates
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

data = pd.read_csv('Stock_prediction_pipeline/data/v0/AARTIIND__EQ__NSE__NSE__MINUTE.csv')
data.head(5)

Unnamed: 0,timestamp,open,high,low,close,volume
0,2017-01-02 09:15:00+05:30,340.0,340.0,340.0,340.0,11.0
1,2017-01-02 09:16:00+05:30,340.0,340.0,340.0,340.0,0.0
2,2017-01-02 09:17:00+05:30,340.0,340.0,340.0,340.0,0.0
3,2017-01-02 09:18:00+05:30,340.0,343.7,340.0,343.7,1.0
4,2017-01-02 09:19:00+05:30,343.7,343.7,343.7,343.7,1.0


In [13]:


# === Set up paths ===
DATA_DIR = "Stock_prediction_pipeline/data/v0"
FILES = ["AARTIIND__EQ__NSE__NSE__MINUTE.csv", "ABCAPITAL__EQ__NSE__NSE__MINUTE.csv"]
OUTPUT_DIR = "Stock_prediction_pipeline/data/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_and_prepare_stock(file_name):
    stock_name = file_name.split("__")[0]
    df = pd.read_csv(os.path.join(DATA_DIR, file_name))

    # Convert timestamp and sort
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    df['stock'] = stock_name

    # Fill missing minutes
    full_range = pd.date_range(start=df['timestamp'].min(), end=df['timestamp'].max(), freq='min')

    full_df = pd.DataFrame({'timestamp': full_range})
    df = pd.merge(full_df, df, on='timestamp', how='left')
    df['stock'] = df['stock'].fillna(stock_name)
    df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].ffill()
    df = df.dropna()

    # Feature engineering
    df['rolling_avg_10'] = df['close'].rolling(window=10, min_periods=10).mean()
    df['volume_sum_10'] = df['volume'].rolling(window=10, min_periods=10).sum()

    # Target generation
    df['future_close'] = df['close'].shift(-5)
    df['target'] = (df['future_close'] > df['close']).astype(int)
    df = df.drop(columns=['future_close'])

    # Drop incomplete rows
    df = df.dropna(subset=['rolling_avg_10', 'volume_sum_10'])

    return df

# === Process both stocks ===
combined_df = pd.concat([load_and_prepare_stock(f) for f in FILES], ignore_index=True)

# Shuffle for modeling
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save result
output_path = os.path.join(OUTPUT_DIR, "v0_processed.csv")
combined_df.to_csv(output_path, index=False)

print("✅ Done. Cleaned data saved to:", output_path)


✅ Done. Cleaned data saved to: Stock_prediction_pipeline/data/processed/v0_processed.csv


In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Load the processed data
processed_path = "Stock_prediction_pipeline/data/processed/v0_processed.csv"
df = pd.read_csv(processed_path)

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Save the splits
train_path = "Stock_prediction_pipeline/data/processed/train.csv"
test_path = "Stock_prediction_pipeline/data/processed/test.csv"

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("✅ Train-test split done.")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")


✅ Train-test split done.
Train shape: (3085618, 10)
Test shape: (771405, 10)


In [15]:
import os
import pandas as pd
import glob

# === Set up paths ===
DATA_DIRS = ["Stock_prediction_pipeline/data/v0", "Stock_prediction_pipeline/data/v1"]
OUTPUT_DIR = "Stock_prediction_pipeline/data/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Get all CSV files from both folders ===
FILES = []
for directory in DATA_DIRS:
    FILES.extend(glob.glob(f"{directory}/*.csv"))

def load_and_prepare_stock(file_path):
    file_name = os.path.basename(file_path)
    stock_name = file_name.split("__")[0]
    df = pd.read_csv(file_path)

    # Convert timestamp and sort
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    df['stock'] = stock_name

    # Fill missing minutes
    full_range = pd.date_range(start=df['timestamp'].min(), end=df['timestamp'].max(), freq='min')
    full_df = pd.DataFrame({'timestamp': full_range})
    df = pd.merge(full_df, df, on='timestamp', how='left')
    df['stock'] = df['stock'].fillna(stock_name)
    df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].ffill()
    df = df.dropna()

    # Feature engineering
    df['rolling_avg_10'] = df['close'].rolling(window=10, min_periods=10).mean()
    df['volume_sum_10'] = df['volume'].rolling(window=10, min_periods=10).sum()

    # Target generation
    df['future_close'] = df['close'].shift(-5)
    df['target'] = (df['future_close'] > df['close']).astype(int)
    df = df.drop(columns=['future_close'])

    # Drop incomplete rows
    df = df.dropna(subset=['rolling_avg_10', 'volume_sum_10'])

    return df

# === Process all stocks ===
combined_df = pd.concat([load_and_prepare_stock(f) for f in FILES], ignore_index=True)

# Shuffle for modeling
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save result for v1
output_path = os.path.join(OUTPUT_DIR, "v1_processed.csv")
combined_df.to_csv(output_path, index=False)

print("✅ Done. Cleaned data saved to:", output_path)


✅ Done. Cleaned data saved to: Stock_prediction_pipeline/data/processed/v1_processed.csv


In [16]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Load the v1 processed data
processed_path = "Stock_prediction_pipeline/data/processed/v1_processed.csv"
df = pd.read_csv(processed_path)

# Train-test split for v1
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Save the splits for v1
train_path = "Stock_prediction_pipeline/data/processed/train_v1.csv"
test_path = "Stock_prediction_pipeline/data/processed/test_v1.csv"

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("✅ v1 Train-test split done.")
print(f"v1 Train shape: {train_df.shape}")
print(f"v1 Test shape: {test_df.shape}")


✅ v1 Train-test split done.
v1 Train shape: (7358089, 10)
v1 Test shape: (1839523, 10)


In [None]:
# import pickle
# import joblib

# joblib.dump(mod_dt, "artifacts/model.joblib")  #do this later

['artifacts/model.joblib']

### Upload model artifacts and custom code to Cloud Storage

Before you can deploy your model for serving, Vertex AI needs access to the following files in Cloud Storage:

* `model.joblib` (model artifact)
* `preprocessor.pkl` (model artifact)

Run the following commands to upload your files:

In [None]:
# !gsutil cp artifacts/model.joblib {BUCKET_URI}/{MODEL_ARTIFACT_DIR}/ # do this later

Copying file://artifacts/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][  2.6 KiB/  2.6 KiB]                                                
Operation completed over 1 objects/2.6 KiB.                                      


In [2]:
!pwd

/home/jupyter/Stock_prediction_pipeline


In [4]:
# import pandas as pd

# # Load CSV

# df = pd.read_csv("/home/jupyter/Stock_prediction_pipeline/data/processed/train.csv")

# # Save as Parquet
# df.to_parquet("/home/jupyter/Stock_prediction_pipeline/data/processed/train.parquet")


In [3]:
import pandas as pd

df = pd.read_csv("/home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.csv")
df.to_parquet("/home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.parquet", index=False)

In [4]:
import pandas as pd
df = pd.read_parquet("/home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.parquet")
print(df['timestamp'].min(), df['timestamp'].max())


2017-01-02 09:24:00+05:30 2021-01-01 15:29:00+05:30


In [5]:
import pandas as pd

file_path = "/home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.csv"

df = pd.read_csv(file_path)

# Step 1: Convert to datetime and force UTC awareness if needed
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")

# Step 2: Convert from UTC to IST (Asia/Kolkata)
df["timestamp"] = df["timestamp"].dt.tz_convert("Asia/Kolkata")

# Optional: If Feast prefers UTC
# df["timestamp"] = df["timestamp"].dt.tz_convert("UTC")

# Save to Parquet
df.to_parquet("/home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.parquet", index=False)


In [6]:
mv /home/jupyter/Stock_prediction_pipeline/data/processed/v1_processed.parquet /home/jupyter/Stock_prediction_pipeline/feature_repo/data/v1_processed.parquet