In [1]:
pip install google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Using cached google_cloud_aiplatform-1.79.0-py2.py3-none-any.whl (7.1 MB)
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3
  Using cached google_cloud_resource_manager-1.14.0-py2.py3-none-any.whl (384 kB)
Collecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1
  Using cached google_api_core-2.24.1-py3-none-any.whl (160 kB)
Collecting docstring-parser<1
  Using cached docstring_parser-0.16-py3-none-any.whl (36 kB)
Collecting google-cloud-storage<3.0.0dev,>=1.32.0
  Using cached google_cloud_storage-2.19.0-py2.py3-none-any.whl (131 kB)
Collecting google-cloud-bigquery!=3.20.0,<4.0.0dev,>=1.15.0
  Using cached google_cloud_bigquery-3.29.0-py2.py3-none-any.whl (244 kB)
Collecting google-auth<3.0.0dev,>=2.14.1
  Using cached google_auth-2.38.0-py2.py3-none-any.whl (210 kB)
Collecting pydantic<3
  Using cached pydantic-2.10.6-py3-none-any.whl (431 kB)
Collecting proto-plus<2.0.0dev,>=1.22

In [3]:
import google.cloud.aiplatform
print("Google Cloud AI Platform module is available.")

Google Cloud AI Platform module is available.


In [8]:
import gcsfs
print("gcsfs is installed and working!")

gcsfs is installed and working!


In [11]:
import pandas as pd

# Load dataset from Google Cloud Storage (GCS)
df = pd.read_csv("gs://mlb-prospect-data/cleaned_mlb_homeruns.csv")

# Print column names
print("Columns in dataset:", df.columns.tolist())

Columns in dataset: ['play_id', 'ExitVelocity', 'HitDistance', 'LaunchAngle', 'Year', 'WAR', 'PlayerName']


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14064 entries, 0 to 14063
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   play_id       14064 non-null  object 
 1   ExitVelocity  14064 non-null  float64
 2   HitDistance   14064 non-null  float64
 3   LaunchAngle   14064 non-null  float64
 4   Year          14064 non-null  int64  
 5   WAR           14064 non-null  float64
 6   PlayerName    14064 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 769.2+ KB


In [None]:
from google.cloud import aiplatform
import pandas as pd

# Initialize Vertex AI
aiplatform.init(project="mlb-project-449501", location="us-central1")

# ✅ Load dataset from Google Cloud Storage into Pandas DataFrame
df = pd.read_csv("gs://mlb-prospect-data/cleaned_mlb_homeruns.csv")

# ✅ Print column names to verify target variable exists
print("Columns in dataset:", df.columns.tolist())

# ✅ Register dataset in Vertex AI if not already registered
dataset = aiplatform.TabularDataset.create(
    display_name="mlb_prospect_prediction",
    gcs_source="gs://mlb-prospect-data/cleaned_mlb_homeruns.csv"
)

# ✅ Train AutoML Model
model = aiplatform.AutoMLTabularTrainingJob(
    display_name="prospect_model",
    optimization_prediction_type="regression"  # Change to 'classification' if predicting player tiers
)

# ✅ Run training job
model = model.run(
    dataset=dataset,
    target_column="WAR",  # Ensure "WAR" exists in dataset
    model_display_name="mlb_prospect_model",
    budget_milli_node_hours=5000,  # Increase for better results
    disable_early_stopping=False  # Enables early stopping to prevent overfitting
)

print("🚀 Model training started successfully!")

Columns in dataset: ['play_id', 'ExitVelocity', 'HitDistance', 'LaunchAngle', 'Year', 'WAR', 'PlayerName']
Creating TabularDataset
Create TabularDataset backing LRO: projects/811481050675/locations/us-central1/datasets/2566667508287275008/operations/3418638833397268480
TabularDataset created. Resource name: projects/811481050675/locations/us-central1/datasets/2566667508287275008
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/811481050675/locations/us-central1/datasets/2566667508287275008')
No column transformations provided, so now retrieving columns from dataset in order to set default column transformations.
The column transformation of type 'auto' was set for the following columns: ['LaunchAngle', 'HitDistance', 'play_id', 'PlayerName', 'Year', 'ExitVelocity'].
No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3050350335567593472?project=

In [5]:
datasets = aiplatform.TabularDataset.list()
print([d.resource_name for d in datasets])

['projects/811481050675/locations/us-central1/datasets/4336159949378813952']
