In [1]:
!pip uninstall -qqy jupyterlab kfp 2>/dev/null  
!pip install -U -q "google-genai==1.7.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab-lsp 3.10.2 requires jupyterlab<4.0.0a0,>=3.1.0, which is not installed.[0m[31m
[0m

In [2]:
from google import genai
from google.genai import types

In [3]:
from kaggle_secrets import UserSecretsClient

# Read the Google API key from Kaggle's secret store rather than hardcoding it
# in the notebook.
secrets_client = UserSecretsClient()
GOOGLE_API_KEY = secrets_client.get_secret("GOOGLE_API_KEY")

In [4]:
# Shared google-genai client; the embedding helper defined later calls it.
client = genai.Client(api_key = GOOGLE_API_KEY)

In [5]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the "train" and "test" splits of the 20 Newsgroups dataset, in that order.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset = split) for split in ("train", "test")
)

# Display the class names; later cells index this list with the integer labels.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Print one raw post to show the header block and body that preprocessing must handle.
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [7]:
import email
import re

import pandas as pd

def preprocess_row(data, max_chars = 5000):
    """Reduce one raw newsgroup post to its subject line plus body text.

    Args:
        data: The raw message text, headers included.
        max_chars: Maximum length of the returned text; anything longer is
            cut off. Defaults to 5000, the limit previously hard-coded here.

    Returns:
        The subject, a blank line, then the body, with e-mail addresses
        removed and the result truncated to ``max_chars`` characters. When the
        post has no Subject header, only the body is returned (instead of the
        literal text "None" followed by the body).
    """
    msg = email.message_from_string(data)

    if msg.is_multipart():
        # A multipart payload is a list of sub-messages, not a string, so
        # formatting it directly would embed a Python list repr. Join the
        # string payloads of the parts instead.
        body = "\n".join(
            part.get_payload()
            for part in msg.walk()
            if isinstance(part.get_payload(), str)
        )
    else:
        body = msg.get_payload()

    subject = msg["Subject"]
    text = body if subject is None else f"{subject}\n\n{body}"

    # Strip anything that looks like an e-mail address.
    text = re.sub(r"[\w\.-]+@[\w\.-]+" , "" , text)

    return text[:max_chars]

def preprocess_data(dataset):
    """Build a DataFrame of cleaned text and labels from a dataset object.

    Args:
        dataset: Object exposing ``data`` (raw posts), ``target`` (integer
            labels) and ``target_names`` (label index -> class name).

    Returns:
        A DataFrame with columns ``Text`` (each post passed through
        ``preprocess_row``), ``Label`` and ``Class Name``.
    """
    def class_name_for(label):
        # Look the name up by label index.
        return dataset.target_names[label]

    frame = pd.DataFrame({"Text": dataset.data , "Label": dataset.target})
    frame["Text"] = frame["Text"].apply(preprocess_row)
    frame["Class Name"] = frame["Label"].map(class_name_for)

    return frame

In [8]:
# Clean both splits with the same preprocessing so train and test text share one format.
df_train = preprocess_data(newsgroups_train)
df_test = preprocess_data(newsgroups_test)

# Preview the first rows of the cleaned training frame.
df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\n\n I was wondering if anyo...,7,rec.autos
1,SI Clock Poll - Final Call\n\nA fair number of...,4,comp.sys.mac.hardware
2,"PB questions...\n\nwell folks, my mac plus fin...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\n\nRobert J.C. Kyanko () wr...,1,comp.graphics
4,Re: Shuttle Launch Question\n\nFrom article <>...,14,sci.space


In [9]:
def sample_data(df , num_samples , classes_to_keep , random_state = None):
    """Draw a fixed-size sample per label, then keep only the requested classes.

    Args:
        df: Frame with at least ``Label`` and ``Class Name`` columns. It is
            not modified.
        num_samples: Number of rows to draw from every ``Label`` group.
            Sampling is without replacement, so pandas raises ``ValueError``
            when a group holds fewer rows than this.
        classes_to_keep: Pattern handed to ``Series.str.contains``; rows whose
            ``Class Name`` matches are kept.
        random_state: Optional seed forwarded to the sampler so the draw can
            be reproduced. The default ``None`` keeps the draw unseeded.

    Returns:
        A new frame containing the sampled rows of the matching classes, with
        ``Class Name`` converted to a categorical column and an
        ``Encoded Label`` column holding its category codes.
    """
    sampled = (
        df.groupby("Label")
        .sample(n = num_samples , random_state = random_state)
        .reset_index(drop = True)
    )

    # Copy the filtered rows: assigning columns on a bare boolean-mask slice
    # triggers pandas' SettingWithCopyWarning.
    keep_mask = sampled["Class Name"].str.contains(classes_to_keep)
    kept = sampled[keep_mask].copy()

    # Codes are derived from the categories remaining after filtering.
    kept["Class Name"] = kept["Class Name"].astype("category")
    kept["Encoded Label"] = kept["Class Name"].cat.codes

    return kept

In [10]:
# Rows drawn per class for each split.
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
# Pattern matched against "Class Name" to choose which classes are kept.
CLASSES_TO_KEEP = "sci"

# NOTE: these rebind df_train / df_test to the sampled subsets, so the full
# preprocessed frames are no longer reachable under these names afterwards.
df_train = sample_data(df_train , TRAIN_NUM_SAMPLES , CLASSES_TO_KEEP)
df_test = sample_data(df_test , TEST_NUM_SAMPLES , CLASSES_TO_KEEP)

In [11]:
# Sanity check: count the rows kept per class in the training sample.
df_train.value_counts("Class Name")

Class Name
sci.crypt          100
sci.electronics    100
sci.med            100
sci.space          100
Name: count, dtype: int64

In [12]:
# Same check for the test sample.
df_test.value_counts("Class Name")

Class Name
sci.crypt          25
sci.electronics    25
sci.med            25
sci.space          25
Name: count, dtype: int64

In [13]:
from google.api_core import retry
import tqdm
from tqdm.rich import tqdm as tqdmr
import warnings

# Register the rich tqdm progress bar with pandas so `progress_apply` is available.
tqdmr.pandas()

# Silence the TqdmExperimentalWarning category.
warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)


def retriable(e):
    """Retry predicate: true only for API errors whose code is 429 or 503."""
    if not isinstance(e , genai.errors.APIError):
        return False
    return e.code in {429 , 503}

@retry.Retry(predicate = retriable , timeout = 300)
def embed_fn(text: str) -> list[float]:
    """Embed one piece of text using the "classification" task type.

    The call is wrapped in ``retry.Retry``: errors accepted by ``retriable``
    are retried, with the retry timeout set to 300.

    Args:
        text: The text to embed.

    Returns:
        The ``values`` of the first embedding in the response.
    """
    embed_config = types.EmbedContentConfig(task_type = "classification")

    response = client.models.embed_content(
        model = "models/text-embedding-004",
        contents = text,
        config = embed_config
    )

    # Only the first embedding of the response is used.
    first_embedding = response.embeddings[0]
    return first_embedding.values

def create_embeddings(df):
    """Add an ``Embeddings`` column computed from ``Text`` and return the frame.

    Each ``Text`` value is passed to ``embed_fn`` via ``progress_apply``. The
    column is written onto the frame that was passed in, and that same frame
    is returned.
    """
    embeddings = df["Text"].progress_apply(embed_fn)
    df["Embeddings"] = embeddings
    return df

In [14]:
# Embed every sampled post; create_embeddings calls embed_fn once per row, so
# this cell issues one embedding request per post in both splits.
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

Output()

Output()

In [15]:
# Preview the training frame, now including the "Embeddings" column.
df_train.head()

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,Re: Another data hiding scheme... \n\nIn artic...,11,sci.crypt,0,"[-0.013182628, 0.025199896, -0.037880335, 0.04..."
1101,Re: Clipper and Ranting Libertarians\n\nJust a...,11,sci.crypt,0,"[0.0043288055, 0.034050606, -0.04078017, 0.039..."
1102,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[0.005113458, 0.020383235, -0.037115525, 0.047..."
1103,"Re: Would ""clipper"" make a good cover for othe...",11,sci.crypt,0,"[-0.018435817, 0.026027948, -0.04701217, 0.038..."
1104,Need source of FEAL encrytion algorithm\n\nHi ...,11,sci.crypt,0,"[0.0039009298, -0.019534506, -0.007142377, 0.0..."


In [16]:
import keras
from keras import layers

def build_classification_model(input_size: int , num_classes: int) -> keras.Model:
    """Assemble a small feed-forward classifier.

    Args:
        input_size: Length of each input vector; also used as the width of
            the hidden layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.Sequential`` model: input -> Dense(relu) ->
        Dense(softmax).
    """
    embedding_input = layers.Input(shape = [input_size] , name = "embedding_input")
    hidden = layers.Dense(units = input_size , activation = "relu" , name = "hidden")
    output_probs = layers.Dense(units = num_classes , activation = "softmax" , name = "output_probs")

    return keras.Sequential([embedding_input , hidden , output_probs])

2025-04-11 23:52:03.105530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744415523.443080      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744415523.522533      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [17]:
# Input width is taken from the first row's embedding
# (assumes every embedding has the same length).
embedding_size = len(df_train["Embeddings"].iloc[0])
# One output unit per distinct class present in the training sample.
num_classes = len(df_train["Class Name"].unique())

classifier = build_classification_model(embedding_size , num_classes)
classifier.summary()

# Sparse categorical cross-entropy is compiled in here; the training cell
# feeds the integer "Encoded Label" column as targets.
classifier.compile(
    loss = keras.losses.SparseCategoricalCrossentropy(),
    optimizer = keras.optimizers.Adam(learning_rate = 0.001),
    metrics = ["accuracy"]
)

I0000 00:00:1744415536.898698      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1744415536.899401      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [18]:
import numpy as np

# Upper bound on training epochs; the early-stopping callback below may end training sooner.
NUM_EPOCHS = 20
BATCH_SIZE = 32

# Targets are the integer category codes; inputs are the per-row embedding
# lists stacked into one array per split.
y_train = df_train["Encoded Label"]
x_train = np.stack(df_train["Embeddings"])
y_val = df_test["Encoded Label"]
x_val = np.stack(df_test["Embeddings"])

# NOTE(review): this monitors the *training* "accuracy", so the validation data
# passed to fit() does not influence when training stops. Confirm whether a
# validation metric such as "val_loss" was intended.
early_stopping = keras.callbacks.EarlyStopping(monitor = "accuracy" , patience = 3)

# The sampled test split doubles as the validation set here.
history = classifier.fit(
    x = x_train,
    y = y_train,
    validation_data = (x_val , y_val),
    callbacks = [early_stopping],
    batch_size = BATCH_SIZE,
    epochs = NUM_EPOCHS
)

Epoch 1/20


I0000 00:00:1744415539.395189      78 service.cc:148] XLA service 0x7e079800e1d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744415539.395808      78 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1744415539.395830      78 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1744415539.583806      78 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/13[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 2s/step - accuracy: 0.2812 - loss: 1.3866

I0000 00:00:1744415540.365410      78 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - accuracy: 0.3341 - loss: 1.3699 - val_accuracy: 0.8000 - val_loss: 1.2685
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6661 - loss: 1.2097 - val_accuracy: 0.8100 - val_loss: 1.1334
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8257 - loss: 1.0421 - val_accuracy: 0.8500 - val_loss: 0.9817
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9314 - loss: 0.8609 - val_accuracy: 0.8600 - val_loss: 0.8295
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9480 - loss: 0.7036 - val_accuracy: 0.9100 - val_loss: 0.6852
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9634 - loss: 0.5194 - val_accuracy: 0.8700 - val_loss: 0.5747
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━

In [19]:
# Evaluate on the same arrays used as validation data during fit();
# return_dict=True yields the metrics as a dict.
classifier.evaluate(x = x_val , y = y_val , return_dict = True)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9262 - loss: 0.2211 


{'accuracy': 0.9300000071525574, 'loss': 0.21376001834869385}

In [20]:
def make_prediction(text: str) -> list[float]:
    """Embed ``text`` and run it through the trained classifier.

    Args:
        text: The text to classify.

    Returns:
        The single row of ``classifier.predict`` output for this text, one
        value per output unit of the model.
    """
    # predict() takes a batch, so wrap the one embedding in a batch of size one.
    batch = np.array([embed_fn(text)])

    predictions = classifier.predict(batch)

    # Unpack the only row; this raises if the output does not hold exactly one.
    [scores] = predictions
    return scores

In [21]:
new_text = """
First-timer looking to get out of here.

Hi, I'm writing about my interest in travelling to the outer limits!

What kind of craft can I buy? What is easiest to access from this 3rd rock?

Let me know how to do that please.
"""

result = make_prediction(new_text)

# The classifier was fitted on df_train's "Encoded Label" codes, so its output
# positions follow the *training* frame's category order. Take the names from
# df_train rather than df_test, whose codes only line up when both samples
# contain the same set of classes.
for category , probability in zip(df_train["Class Name"].cat.categories , result):
    print(f"{category} : {probability * 100:0.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step
sci.crypt : 0.04%
sci.electronics : 0.27%
sci.med : 0.08%
sci.space : 99.62%
