In [1]:
from google import genai
from google.genai import types

# Display the version string of the imported `genai` package as the cell output.
genai.__version__

'1.8.0'

In [2]:
import getpass
import os

# Never hardcode the key in the notebook: any saved or shared copy would leak it.
# Reuse GOOGLE_API_KEY when the environment already provides it; otherwise
# prompt for it (the typed value is not echoed) and store it for this session.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [3]:
# Shared API client used by every model and tuning call in this notebook.
client = genai.Client(api_key=GOOGLE_API_KEY)

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Load the "train" and "test" subsets of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

# Show the list of class (newsgroup) names.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Print the first raw training post, headers included, to see the input format.
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [6]:
import email
import re

import pandas as pd

def preprocess_newsgroup_row(data):
    """Reduce one raw newsgroup post to "<subject>\\n\\n<body>".

    Args:
        data: The raw post as a string (headers followed by the body).

    Returns:
        The subject and payload joined by a blank line, with anything that
        looks like an e-mail address removed, cut to at most 40,000 characters.
    """
    parsed = email.message_from_string(data)
    subject = parsed["Subject"]
    body = parsed.get_payload()

    combined = f"{subject}\n\n{body}"
    # Remove e-mail-address-like tokens.
    scrubbed = re.sub(r"[\w\.-]+@[\w\.-]+", "", combined)

    # Cap the length of the returned text.
    return scrubbed[:40000]

def preprocess_newsgroup_data(newsgroup_dataset):
    """Turn a loaded newsgroup dataset into a DataFrame of cleaned posts.

    Args:
        newsgroup_dataset: Object exposing `data` (raw posts), `target`
            (integer labels) and `target_names` (indexable by label).

    Returns:
        A DataFrame with the columns "Text" (each post passed through
        `preprocess_newsgroup_row`), "Label" and "Class Name".
    """
    frame = pd.DataFrame(
        {"Text": newsgroup_dataset.data, "Label": newsgroup_dataset.target}
    )

    # `assign` replaces "Text" in place and appends "Class Name" at the end.
    return frame.assign(
        **{
            "Text": frame["Text"].apply(preprocess_newsgroup_row),
            "Class Name": frame["Label"].map(
                lambda idx: newsgroup_dataset.target_names[idx]
            ),
        }
    )

In [7]:
# Clean both splits into DataFrames with "Text", "Label" and "Class Name" columns.
df_train = preprocess_newsgroup_data(newsgroups_train)
df_test = preprocess_newsgroup_data(newsgroups_test)

# Preview the first training rows.
df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\n\n I was wondering if anyo...,7,rec.autos
1,SI Clock Poll - Final Call\n\nA fair number of...,4,comp.sys.mac.hardware
2,"PB questions...\n\nwell folks, my mac plus fin...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\n\nRobert J.C. Kyanko () wr...,1,comp.graphics
4,Re: Shuttle Launch Question\n\nFrom article <>...,14,sci.space


In [8]:
def sample_data(df, num_samples, classes_to_keep, random_state=None):
    """Draw a fixed number of rows per label, then keep only selected classes.

    Args:
        df: DataFrame with at least the columns "Label" and "Class Name".
        num_samples: Number of rows to draw (without replacement) from each
            "Label" group. Sampling fails if a group has fewer rows.
        classes_to_keep: Regular expression; rows whose "Class Name" does not
            contain a match are dropped.
        random_state: Optional seed forwarded to the sampler so the selection
            can be reproduced. The default (None) keeps sampling unseeded.

    Returns:
        A new DataFrame (the input is not modified) ordered by label group,
        with a fresh 0..n-1 index and "Class Name" cast to a categorical dtype.
    """
    sampled = (
        df.groupby("Label")
        .sample(n=num_samples, random_state=random_state)
        .reset_index(drop=True)
    )

    keep_mask = sampled["Class Name"].str.contains(classes_to_keep)
    # Copy before assigning: writing into a filtered slice of another frame
    # is what triggers pandas' SettingWithCopyWarning.
    kept = sampled.loc[keep_mask].copy()
    kept["Class Name"] = kept["Class Name"].astype("category")

    return kept

# Number of rows drawn per class for the training and test sets.
TRAIN_NUM_SAMPLES = 50
TEST_NUM_SAMPLES = 10
# Keep rec.* and sci.*
CLASSES_TO_KEEP = "^rec|^sci"

# NOTE: these calls overwrite df_train / df_test with their sampled subsets,
# so the full frames are no longer available after this cell runs.
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

In [9]:
# Evaluate the baseline (the model's initial state, before tuning)

In [10]:
# Take one raw test post, clean it with the same preprocessing used for the
# DataFrames, and look up the name of its true newsgroup.
sample_idx = 0
sample_row = preprocess_newsgroup_row(newsgroups_test.data[sample_idx])
sample_label = newsgroups_test.target_names[newsgroups_test.target[sample_idx]]

print(sample_row)
print('---')
print('Label:', sample_label)

Need info on 88-89 Bonneville


 I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy.

			Neil Gandler

---
Label: rec.autos


In [11]:
# Send the cleaned post to the base model as-is: no system instruction and no
# classification prompt, to see what the model does with it by default.
response = client.models.generate_content(
    model="gemini-1.5-flash-001", contents=sample_row)
print(response.text)

You're right, there were a lot of Bonneville trims in 1988-89! Here's a breakdown to help clarify:

**1988-1989 Bonneville Trim Levels:**

* **LE:**  The base model, featuring standard features for the time like a V6 engine, cloth upholstery, and basic power options.
* **SE:**  (Sport Edition) Added features like a slightly more powerful engine, sport suspension, and some cosmetic upgrades.
* **LSE:** (Luxury Sport Edition) Combined the comfort of the LE with the performance of the SE, offering leather seats, a more powerful V6, and additional comfort features.
* **SSE:**  (Special Sport Edition)  Offered the most power with its V8 engine, along with a sporty exterior and interior, and enhanced handling.
* **SSEi:** (Special Sport Edition Injection)  This was the top-of-the-line Bonneville model. It included the same powerful V8 as the SSE but with fuel injection for improved performance and fuel economy. 

**Value & Demand:**

* **Book Value:**  The Kelley Blue Book website ([https://

In [14]:
from google.api_core import retry

# System instruction that `predict_label` sends with every request, asking the
# model to answer with the newsgroup a post comes from.
system_instruct = """
You are a classification service. You will be passed input that represents
a newsgroup post and you must respond with the newsgroup from which the post
originates.
"""

def is_retriable(e):
    """Return True when `e` is a `genai.errors.APIError` whose code is 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}

@retry.Retry(predicate=is_retriable)
def predict_label(post: str) -> str:
    """Ask the base model which newsgroup a post comes from.

    The request carries `system_instruct` as the system instruction. The call
    is retried by the decorator whenever `is_retriable` accepts the raised
    exception.

    Args:
        post: The (preprocessed) newsgroup post text.

    Returns:
        The model's answer with surrounding whitespace removed, or the
        sentinel "(error)" when the response has no candidate, did not finish
        with reason "STOP", or carries no text.
    """
    response = client.models.generate_content(
        model="gemini-1.5-flash-001",
        config=types.GenerateContentConfig(
            system_instruction=system_instruct),
        contents=post)

    # The candidate list may be missing or empty; indexing it blindly would
    # abort a whole evaluation run on a single bad response.
    if not response.candidates:
        return "(error)"

    rc = response.candidates[0]

    # The finish reason may also be unset.
    if rc.finish_reason is None or rc.finish_reason.name != "STOP":
        return "(error)"

    answer = response.text
    if answer is None:
        return "(error)"

    return answer.strip()

# Classify the sample post and compare the answer with its true label.
# The check is an exact string match.
prediction = predict_label(sample_row)

print(prediction)
print()
print("Correct!" if prediction == sample_label else "Incorrect.")

rec.autos.misc

Incorrect.


In [15]:
import tqdm
from tqdm.rich import tqdm as tqdmr
import warnings

# Register `progress_apply` on pandas objects, backed by the tqdm.rich bar.
tqdmr.pandas()

# Suppress tqdm's TqdmExperimentalWarning.
warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)

# Draw 2 posts per class from the already-sampled test set; '.*' keeps every class.
df_baseline_eval = sample_data(df_test, 2, '.*')

# One model request per row.
df_baseline_eval['Prediction'] = df_baseline_eval['Text'].progress_apply(predict_label)

# Accuracy = share of rows whose prediction equals the class name exactly.
accuracy = (df_baseline_eval["Class Name"] == df_baseline_eval["Prediction"]).sum() / len(df_baseline_eval)
print(f"Accuracy: {accuracy:.2%}")

Output()

Accuracy: 18.75%


In [16]:
# Show the baseline predictions next to the true class names.
df_baseline_eval

Unnamed: 0,Text,Label,Class Name,Prediction
0,Re: Opinions on 88-89 Pontiac Bonneville\n\nIn...,7,rec.autos,rec.autos.cars
1,Diatribe on US-sold Opels (was Re: Opel owners...,7,rec.autos,rec.autos.opel
2,"Re: story\n\nIn article <>, (Charles Sundheim...",8,rec.motorcycles,rec.motorcycles
3,Re: Zipper repairs (Was: ...Tankbag...)\n\nIn ...,8,rec.motorcycles,rec.motorcycles
4,Red Sox Choke Contest\n\nTo encourage the grea...,9,rec.sport.baseball,rec.sports.baseball.misc
5,Re: Rule Book Trivia 2\n\nIn article <> Ryan R...,9,rec.sport.baseball,rec.sports.baseball
6,Re: Pens fans reactions\n\nRichard J Coyle (go...,10,rec.sport.hockey,rec.sports.hockey
7,Islanders win Third in OT\n\nDon't know who sc...,10,rec.sport.hockey,rec.sports.hockey
8,Re: The [secret] source of that announcement\n...,11,sci.crypt,comp.sys.ibm.pc.hardware
9,Re: Do we need the clipper for cheap security?...,11,sci.crypt,comp.dcom.modems


In [17]:
# Next, the model is fine-tuned on the data. The goal is to make prompting and system instructions unnecessary.

In [18]:
from collections.abc import Iterable
import random

# Training examples for tuning: one record per training row, with the post
# text under 'textInput' and its class name under 'output'.
input_data = {'examples': 
    df_train[['Text', 'Class Name']]
      .rename(columns={'Text': 'textInput', 'Class Name': 'output'})
      .to_dict(orient='records')
 }

# Leave as None to search the existing tuning jobs below; set it to a tuned
# model name to skip both the search and the creation of a new job.
model_id = None

if not model_id:
  queued_model = None
  # Walk the listed tuning jobs in reverse order, looking for one created by
  # this notebook (matched by name prefix).
  for m in reversed(client.tunings.list()):
    if m.name.startswith('tunedModels/newsgroup-classification-model'):
      if m.state.name == 'JOB_STATE_SUCCEEDED':
        # A finished job wins immediately.
        model_id = m.name
        print('Found existing tuned model to reuse.')
        break

      elif m.state.name == 'JOB_STATE_RUNNING' and not queued_model:
        # Remember only the first running job encountered.
        queued_model = m.name
  else:
    # for/else: reached only when the loop ended without `break`, i.e. no
    # succeeded job was found. Fall back to a running job if there is one.
    if queued_model:
      model_id = queued_model
      print('Found queued model, still waiting.')

# Nothing reusable was found: start a new tuning job.
if not model_id:
    tuning_op = client.tunings.tune(
        base_model="models/gemini-1.5-flash-001-tuning",
        training_dataset=input_data,
        config=types.CreateTuningJobConfig(
            tuned_model_display_name="Newsgroup classification model",
            batch_size=16,
            epoch_count=2,
        ),
    )

    print(tuning_op.state)
    model_id = tuning_op.name

print(model_id)

  tuning_op = client.tunings.tune(


JobState.JOB_STATE_QUEUED
tunedModels/newsgroup-classification-model-hctmn1wig


In [20]:
# Model training started this way runs in the background.
# I used an already-prepared model so as not to wait for hours.

In [19]:
import datetime
import time

# Upper bound on the tuning job's age before switching to the fallback model.
MAX_WAIT = datetime.timedelta(minutes=10)

# Re-fetch the tuning job on every iteration until it reports it has ended.
while not (tuned_model := client.tunings.get(name=model_id)).has_ended:

    print(tuned_model.state)
    time.sleep(60)

    # NOTE: the elapsed time is measured from the job's `create_time`, not
    # from when this loop started, so a job created more than MAX_WAIT ago
    # takes the shortcut after the first sleep.
    if datetime.datetime.now(datetime.timezone.utc) - tuned_model.create_time > MAX_WAIT:
        print("Taking a shortcut, using a previously prepared model.")
        # From here on `model_id` points at the prepared model, not at the
        # job that was being waited for.
        model_id = "tunedModels/newsgroup-classification-model-ltenbi1b"
        tuned_model = client.tunings.get(name=model_id)
        break


print(f"Done! The model state is: {tuned_model.state.name}")

# Report the error only when the job did not succeed and carries one.
if not tuned_model.has_succeeded and tuned_model.error:
    print("Error:", tuned_model.error)



JobState.JOB_STATE_RUNNING
Taking a shortcut, using a previously prepared model.
Done! The model state is: JOB_STATE_SUCCEEDED


In [22]:
@retry.Retry(predicate=is_retriable)
def classify_text(text: str) -> str:
    """Classify a post with the tuned model named by the global `model_id`.

    No system instruction or prompt is sent, only the post text. The call is
    retried by the decorator whenever `is_retriable` accepts the raised
    exception.

    Args:
        text: The (preprocessed) newsgroup post text.

    Returns:
        The text of the first part of the first candidate, with surrounding
        whitespace removed, or the sentinel "(error)" when the response has no
        candidate, did not finish with reason "STOP", or carries no text.
    """
    response = client.models.generate_content(
        model=model_id, contents=text)

    # The candidate list may be missing or empty; indexing it blindly would
    # abort a whole evaluation run on a single bad response.
    if not response.candidates:
        return "(error)"

    rc = response.candidates[0]

    # The finish reason may also be unset.
    if rc.finish_reason is None or rc.finish_reason.name != "STOP":
        return "(error)"

    if rc.content is None or not rc.content.parts:
        return "(error)"

    label = rc.content.parts[0].text
    if label is None:
        return "(error)"

    # Strip like `predict_label` does: callers compare the result to the class
    # name by exact equality, so stray whitespace would count as a miss.
    return label.strip()


# Draw 4 posts per class from the already-sampled test set; '.*' keeps every class.
df_model_eval = sample_data(df_test, 4, '.*')

# One request to the tuned model per row.
df_model_eval["Prediction"] = df_model_eval["Text"].progress_apply(classify_text)

# Accuracy = share of rows whose prediction equals the class name exactly.
accuracy = (df_model_eval["Class Name"] == df_model_eval["Prediction"]).sum() / len(df_model_eval)
print(f"Accuracy: {accuracy:.2%}")

Output()

Accuracy: 90.62%
