In [None]:
!gsutil cp -r gs://cloud-ai-platform-c541b3e3-934f-414e-9196-8e2bf7a7fb59 .

# cloud-ai-platform-c541b3e3-934f-414e-9196-8e2bf7a7fb59

'gsutil' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
from google.cloud import aiplatform


def export_model_sample(
    project: str,
    model_id: str,
    gcs_destination_output_uri_prefix: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
    timeout: int = 300,
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.ModelServiceClient(client_options=client_options)
    output_config = {
        "artifact_destination": {
            "output_uri_prefix": gcs_destination_output_uri_prefix
        },
        # For information about export formats: https://cloud.google.com/ai-platform-unified/docs/export/export-edge-model#aiplatform_export_model_sample-drest
        "export_format_id": "tf-saved-model",
    }
    name = client.model_path(project=project, location=location, model=model_id)
    response = client.export_model(name=name, output_config=output_config)
    print("Long running operation:", response.operation.name)
    print("output_info:", response.metadata.output_info)
    export_model_response = response.result(timeout=timeout)
    print("export_model_response:", export_model_response)

export_model_sample(
    project="llm-db",
    model_id="hf_11_24_smote",
    gcs_destination_output_uri_prefix="gs://cloud-ai-platform-c541b3e3-934f-414e-9196-8e2bf7a7fb59/exported_model",
)


ModuleNotFoundError: No module named 'google'

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio   # must come before loading the model

# path to the directory containing saved_model.pb
MODEL_DIR = "./download_dir/.../predict/001"

# load the model
loaded = tf.saved_model.load(MODEL_DIR)
infer = loaded.signatures["serving_default"]

# read your test set
df_test = pd.read_csv("path/to/your/test.csv")
features = ["task_group","author_category","language_category","location"]  # whatever your model signature expects

# build inputs dict
inputs = {}
for col in features:
    # if your model expects string tensors:
    inputs[col] = tf.constant(df_test[col].astype(str).values)
    # or float: tf.constant(df_test[col].values, dtype=tf.float32)

# call the model
outputs = infer(**inputs)

# inspect output keys
print("Available outputs:", list(outputs.keys()))
# typically something like 'output_0' or 'predictions'
pred_tensor = outputs["output_0"]    # replace with the right key

# if it’s logits or probabilities, you might need argmax
if pred_tensor.shape[-1] > 1:
    pred_indices = tf.argmax(pred_tensor, axis=-1).numpy()
else:
    # if binary, maybe threshold
    pred_indices = (pred_tensor.numpy() > 0.5).astype(int).flatten()

# decode indices back to category names if you kept a LabelEncoder
# (assuming you saved it when exporting)
#    from sklearn.preprocessing import LabelEncoder
#    le = load_my_label_encoder()
#    preds = le.inverse_transform(pred_indices)

# attach predictions
df_test["pred_idx"] = pred_indices
print(df_test.head())


# Get predictions from server

In [2]:
import pandas as pd

df = pd.read_csv("../data/hf_models_withmodelcard_nov2024.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098260 entries, 0 to 1098259
Data columns (total 26 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   model_id             1098260 non-null  object
 1   num_downloads        1098260 non-null  int64 
 2   num_likes            1098260 non-null  int64 
 3   is_private           1098260 non-null  bool  
 4   task                 1098260 non-null  object
 5   tags                 1098260 non-null  object
 6   author               1098260 non-null  object
 7   author_category      1098260 non-null  object
 8   base_model_relation  322 non-null      object
 9   base_model           269044 non-null   object
 10  language             1098260 non-null  object
 11  model_creator        6528 non-null     object
 12  model_type           4577 non-null     object
 13  model_name           6433 non-null     object
 14  model_card_tags      376584 non-null   object
 15  datasets       

In [8]:
df.base_model_relation.value_counts()

base_model_relation
"quantized"    180
"adapter"       53
"finetune"      49
"merge"         40
Name: count, dtype: int64

In [10]:
df.base_model.nunique()

24070

In [None]:
base_model_relation , base_model , model_card_tags , datasets, library_name

In [None]:
features = [
    "task_group",
    "author_category",
    "language_category",
    "location",
    "downloads_category"
]
df = df[features]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098260 entries, 0 to 1098259
Data columns (total 5 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   task_group          1098260 non-null  object
 1   author_category     1098260 non-null  object
 2   language_category   1098260 non-null  object
 3   location            1098260 non-null  object
 4   downloads_category  1098260 non-null  object
dtypes: object(5)
memory usage: 41.9+ MB


In [3]:
df.head()

Unnamed: 0,task_group,author_category,language_category,location,downloads_category
0,Text Processing,Bronze,High,us,Very Low
1,Unknown,Bronze,High,us,Very Low
2,Unknown,Bronze,High,us,Very Low
3,Unknown,Silver,High,us,Very Low
4,Unknown,Silver,High,us,Very Low


In [4]:
df.downloads_category.value_counts()

downloads_category
Very Low    807979
Low         249325
Mid          26888
High         14068
Name: count, dtype: int64

In [5]:
import pandas as pd
import json


sample = df

# 2) For each row, build a dict of your feature columns
instances = []
for _, row in sample.iterrows():
    instances.append({
        "task_group": row["task_group"],
        "author_category": row["author_category"],
        "language_category": row["language_category"],
        "location": row["location"]
    })

payload = {"instances": instances}

# 3) Write that to a JSON file
with open("request.json", "w") as f:
    json.dump(payload, f, indent=2)

print("Wrote", len(instances), "instances to request.json")


Wrote 1098260 instances to request.json


In [7]:
import math

# Split the instances list into 10 roughly equal parts
num_parts = 10
part_size = math.ceil(len(instances) / num_parts)
for i in range(num_parts):
    start = i * part_size
    end = min((i + 1) * part_size, len(instances))
    part = instances[start:end]
    filename = f"request_part{i+1}.json"
    with open(filename, "w") as f:
        json.dump({"instances": part}, f, indent=2)
    print(f"Wrote {len(part)} instances to {filename}")


Wrote 109826 instances to request_part1.json
Wrote 109826 instances to request_part2.json
Wrote 109826 instances to request_part3.json
Wrote 109826 instances to request_part4.json
Wrote 109826 instances to request_part5.json
Wrote 109826 instances to request_part6.json
Wrote 109826 instances to request_part7.json
Wrote 109826 instances to request_part8.json
Wrote 109826 instances to request_part9.json
Wrote 109826 instances to request_part10.json


In [15]:
!curl -X POST -H "Content-Type: application/json" --data @request.json http://localhost:8080/predict

{"predictions": [{"scores": [0.3295256793498993, 0.36474868655204773, 0.14926689863204956, 0.15645872056484222], "classes": ["High", "Mid", "Very Low", "Low"]}, {"scores": [0.3295256793498993, 0.36474868655204773, 0.14926689863204956, 0.15645872056484222], "classes": ["High", "Mid", "Very Low", "Low"]}, {"scores": [0.3295256793498993, 0.36474868655204773, 0.14926689863204956, 0.15645872056484222], "classes": ["High", "Mid", "Very Low", "Low"]}, {"scores": [0.3295256793498993, 0.36474868655204773, 0.14926689863204956, 0.15645872056484222], "classes": ["High", "Mid", "Very Low", "Low"]}, {"scores": [0.3295256793498993, 0.36474868655204773, 0.14926689863204956, 0.15645872056484222], "classes": ["High", "Mid", "Very Low", "Low"]}]}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1420  100   737  100   683   3011   2790 --:--:-- --:--:-- --:--:--  5795


In [17]:
import requests
from sklearn.metrics import accuracy_score

# load your full test set
df_test = pd.read_csv("../data/hf_models_withmodelcard_nov2024.csv")
X_test = df_test[["task_group","author_category","language_category","location"]]
y_true = df_test["downloads_category"]

In [19]:
df.downloads_category.value_counts()

downloads_category
Very Low    807979
Low         249325
Mid          26888
High         14068
Name: count, dtype: int64

In [20]:
instances = X_test.to_dict(orient="records")
print(instances[:5])

[{'task_group': 'Text Processing', 'author_category': 'Bronze', 'language_category': 'High', 'location': 'us'}, {'task_group': 'Unknown', 'author_category': 'Bronze', 'language_category': 'High', 'location': 'us'}, {'task_group': 'Unknown', 'author_category': 'Bronze', 'language_category': 'High', 'location': 'us'}, {'task_group': 'Unknown', 'author_category': 'Silver', 'language_category': 'High', 'location': 'us'}, {'task_group': 'Unknown', 'author_category': 'Silver', 'language_category': 'High', 'location': 'us'}]


In [21]:


r = requests.post("http://localhost:8080/predict",
                  json={"instances": instances})
r.raise_for_status()
resp = r.json()

# pick the top‑scoring class for each
y_pred = []
for pred in resp["predictions"]:
    # assume classes list is the same for every row:
    classes = pred["classes"]
    best = pred["scores"].index(max(pred["scores"]))
    y_pred.append(classes[best])

print("Accuracy:", accuracy_score(y_true, y_pred))


HTTPError: 400 Client Error: Bad Request for url: http://localhost:8080/predict