# FewShotLearner class walkthrough

In [1]:
import pandas as pd

# Toy support set: three short example sentences for each of the two classes
support_data = dict(
    text=[
        'Cats are cute', 'Dogs are loyal', 'Birds are awesome',        # Pets
        'I love programming', 'I like coding', 'I am data scientist',  # Code
    ],
    label=['Pets'] * 3 + ['Code'] * 3,
)
support_set = pd.DataFrame(data=support_data)
support_set

Unnamed: 0,text,label
0,Cats are cute,Pets
1,Dogs are loyal,Pets
2,Birds are awesome,Pets
3,I love programming,Code
4,I like coding,Code
5,I am data scientist,Code


In [2]:
from FewShotX.embeddings.embed import Embeddings
from FewShotX.scoring.fewshot import FewShotLearner

# Sentence-embedding backend used to encode the texts
embedding_model = Embeddings(
    model_name='all-MiniLM-L6-v2',
)

# Few-shot learner built on the toy support set defined above
learner = FewShotLearner(
    support_set,
    text_col='text',
    label_col='label',
    embedding_model=embedding_model,
)

In [3]:
# Call the learner's internal _prepare_training_data helper directly (with
# val_split=0.2) to inspect the tensors that training would receive
(X_train, y_train), (X_val, y_val), input_dim, output_dim = (
    learner._prepare_training_data(val_split=0.2)
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

X_train shape: torch.Size([4, 384])
y_train shape: torch.Size([4, 384])


+ The shape of `X_train` is: 4 samples × 384-dimensional embeddings → (4, 384).
+ The labels "Pets" and "Code" are also embedded using the same model.
+ These 2 unique label embeddings are then mapped to their respective examples, resulting in a `y_train` of shape (4, 384).

In [4]:
# Run the internal training loop on the tensors prepared in the previous cell;
# verbose=True prints the per-epoch training and validation loss
learner._train_model(
    (X_train, y_train),
    (X_val, y_val),
    input_dim,
    output_dim,
    lam=0.1,
    lr=0.1,
    epochs=20,
    early_stop=5,
    verbose=True,
)

Epoch 1/20 - Training Loss: 0.0276 - Validation Loss: 0.4302
Epoch 2/20 - Training Loss: 1.1491 - Validation Loss: 0.0895
Epoch 3/20 - Training Loss: 0.1122 - Validation Loss: 0.1326
Epoch 4/20 - Training Loss: 0.2852 - Validation Loss: 0.2582
Epoch 5/20 - Training Loss: 0.6347 - Validation Loss: 0.2057
Epoch 6/20 - Training Loss: 0.4643 - Validation Loss: 0.0893
Epoch 7/20 - Training Loss: 0.1510 - Validation Loss: 0.0361
Epoch 8/20 - Training Loss: 0.0440 - Validation Loss: 0.0645
Epoch 9/20 - Training Loss: 0.1586 - Validation Loss: 0.1089
Epoch 10/20 - Training Loss: 0.2799 - Validation Loss: 0.1115
Epoch 11/20 - Training Loss: 0.2609 - Validation Loss: 0.0754
Epoch 12/20 - Training Loss: 0.1477 - Validation Loss: 0.0366
Early stopping at epoch 12


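If the validation loss does **not improve** for 5 consecutive epochs (`early_stop=5`), training is stopped early. In the log above, the best validation loss (0.0361) is reached at epoch 7; none of the next five epochs improves on it, so training stops at epoch 12.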

In [5]:
# Toy query set: one unseen sentence per class, with its true label
query_data = dict(
    text=[
        'Parrots can talk and mimic sounds',
        'Developing machine learning models is fascinating',
    ],
    label=['Pets', 'Code'],
)
query_set = pd.DataFrame(data=query_data)
query_set

Unnamed: 0,text,label
0,Parrots can talk and mimic sounds,Pets
1,Developing machine learning models is fascinating,Code


In [6]:
# Predict labels for the toy query set; with return_accuracy=True the call
# returns the predictions frame together with an accuracy value
predictions, acc = learner.predict(
    query_set,
    k=3,
    return_accuracy=True,
)
print("Accuracy: ", acc)
predictions

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy:  1.0


Unnamed: 0,text,label,pred,pred_label,true_label_idx
0,Parrots can talk and mimic sounds,Pets,0,Pets,0
1,Developing machine learning models is fascinating,Code,1,Code,1


# Application to AG News

In [7]:
# Read the AG News sample from Parquet and show how many rows each label has
df = pd.read_parquet(path='datasets/agnews2k.parquet')
print(df.loc[:, "label"].value_counts())
df

label
Sports      500
World       500
Sci/Tech    500
Business    500
Name: count, dtype: int64


Unnamed: 0,text,label
0,"Latest BCS a feel-good for Cal, upset stomach ...",Sports
1,Sindelar snaps slump with 66 at rainy Open Joe...,Sports
2,Seven killed in Karbala mosque explosion At le...,World
3,Building Dedicated to Space Shuttle Columbia A...,Sci/Tech
4,Guatemala to pay paramilitaries Guatemala's go...,World
...,...,...
1995,Dodgers Clobber Padres 9-6 (AP) AP - Cesar Izt...,Sports
1996,Digital photo album bypasses PCs SanDisk's med...,Sci/Tech
1997,Greek hero Charisteas the man for the big occa...,Sports
1998,FilePlanet Daily Download Marvel Sues NCSoft a...,Sci/Tech


In [8]:
def split_support_query(df, support_pct, random_state=42):
    """
    Split the dataset into support and query sets based on a percentage.

    The split is done by row position, not by index label, so it stays a true
    partition even when ``df`` has a non-unique index (for example after a
    ``pd.concat``). For a frame with a unique index the rows returned are the
    same as ``df.sample(frac=support_pct, random_state=random_state)`` and
    ``df.drop(<sampled index>)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to split.
    support_pct : float
        Fraction of rows to place in the support set; must satisfy
        ``0 < support_pct < 1``.
    random_state : int, default 42
        Seed forwarded to pandas' sampler so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(support_set, query_set)``. The support rows come back in sampled
        order, the query rows in their original order; together they contain
        every row of ``df`` exactly once.

    Raises
    ------
    ValueError
        If ``support_pct`` is not strictly between 0 and 1.
    """
    # Ensure percentage is within the valid range
    if not (0 < support_pct < 1):
        raise ValueError("support_pct must be a float between 0 and 1.")

    # Sample row *positions* rather than index labels. Dropping by label
    # (df.drop(support_set.index)) would also remove every other row that
    # happens to share a label with a sampled row when the index has
    # duplicates, silently shrinking the query set.
    all_positions = pd.RangeIndex(len(df))
    support_positions = (
        all_positions.to_series()
        .sample(frac=support_pct, random_state=random_state)
        .to_numpy()
    )
    in_support = all_positions.isin(support_positions)

    support_set = df.iloc[support_positions]
    query_set = df.iloc[~in_support]

    return support_set, query_set

In [9]:
# Put 25% of the AG News rows in the support set and the rest in the query set
support_set, query_set = split_support_query(
    df,
    support_pct=0.25,
    random_state=42,
)

# Report the size of each split and preview its first five rows
print(f"Support Set: ({len(support_set)} rows):")
print(support_set.head(n=5))

print(f"\nQuery Set: ({len(query_set)} rows):")
print(query_set.head(n=5))

Support Set: (500 rows):
                                                   text     label
1860  Falluja Rebels Had Enough Arms to Rule Iraq -U...     World
353   Notables With a 9:15 p.m. curfew imposed becau...    Sports
1333  Oil Prices Generate Winners and Losers With cr...     World
905   Offshore drilling rig missing after Ivan An of...  Business
1289  Top UN envoy details lack of progress in Darfu...     World

Query Set: (1500 rows):
                                                text     label
0  Latest BCS a feel-good for Cal, upset stomach ...    Sports
1  Sindelar snaps slump with 66 at rainy Open Joe...    Sports
3  Building Dedicated to Space Shuttle Columbia A...  Sci/Tech
4  Guatemala to pay paramilitaries Guatemala's go...     World
5  Manning to Get First Start Vs. Panthers (AP) A...    Sports


In [10]:
from FewShotX.embeddings.embed import Embeddings
from FewShotX.scoring.fewshot import FewShotLearner

# Embedding backend for the AG News experiment (verbose=False)
embed_model = Embeddings(
    model_name='all-MiniLM-L6-v2',
    verbose=False,
)

# Learner over the AG News support set; a copy of the frame is passed so the
# notebook's own `support_set` object is not the one handed to the learner
few_shot_learner = FewShotLearner(support_set=support_set.copy(), text_col="text",
                                  label_col="label", embedding_model=embed_model)

In [11]:
# Fit the learner on its support set, printing the per-epoch losses
few_shot_learner.fit(
    lam=0.1,
    lr=0.05,
    epochs=50,
    early_stop=10,
    verbose=True,
)

Epoch 1/50 - Training Loss: 0.1621 - Validation Loss: 0.0069
Epoch 2/50 - Training Loss: 0.1193 - Validation Loss: 0.0066
Epoch 3/50 - Training Loss: 0.0903 - Validation Loss: 0.0044
Epoch 4/50 - Training Loss: 0.0732 - Validation Loss: 0.0037
Epoch 5/50 - Training Loss: 0.0647 - Validation Loss: 0.0034
Epoch 6/50 - Training Loss: 0.0590 - Validation Loss: 0.0034
Epoch 7/50 - Training Loss: 0.0612 - Validation Loss: 0.0034
Epoch 8/50 - Training Loss: 0.0654 - Validation Loss: 0.0036
Epoch 9/50 - Training Loss: 0.0657 - Validation Loss: 0.0035
Epoch 10/50 - Training Loss: 0.0637 - Validation Loss: 0.0035
Epoch 11/50 - Training Loss: 0.0690 - Validation Loss: 0.0037
Epoch 12/50 - Training Loss: 0.0690 - Validation Loss: 0.0037
Epoch 13/50 - Training Loss: 0.0703 - Validation Loss: 0.0038
Epoch 14/50 - Training Loss: 0.0687 - Validation Loss: 0.0037
Epoch 15/50 - Training Loss: 0.0698 - Validation Loss: 0.0038
Epoch 16/50 - Training Loss: 0.0715 - Validation Loss: 0.0038
Early stopping at

In [12]:
# Score the fitted learner on a copy of the query set; return_accuracy=True
# returns the predictions frame together with an accuracy value
df_pred, acc = few_shot_learner.predict(
    query_set.copy(),
    return_accuracy=True,
)
print("Accuracy: ", acc)
df_pred

Accuracy:  0.7206666666666667


Unnamed: 0,text,label,pred,pred_label,true_label_idx
0,"Latest BCS a feel-good for Cal, upset stomach ...",Sports,1,Sports,1
1,Sindelar snaps slump with 66 at rainy Open Joe...,Sports,1,Sports,1
3,Building Dedicated to Space Shuttle Columbia A...,Sci/Tech,2,Business,3
4,Guatemala to pay paramilitaries Guatemala's go...,World,2,Business,0
5,Manning to Get First Start Vs. Panthers (AP) A...,Sports,1,Sports,1
...,...,...,...,...,...
1994,"Nuggets 93, Rockets 88 DerMarr Johnson scored ...",Sports,1,Sports,1
1995,Dodgers Clobber Padres 9-6 (AP) AP - Cesar Izt...,Sports,2,Business,1
1996,Digital photo album bypasses PCs SanDisk's med...,Sci/Tech,3,Sci/Tech,3
1997,Greek hero Charisteas the man for the big occa...,Sports,1,Sports,1


## Hyperparameter tuning

In [None]:
# Grid search over the learning rate (`lr`) and the `lam` hyperparameter.
# NOTE(review): every configuration is scored on `query_set`, the same data the
# reported accuracy comes from, so the selected score is optimistically biased.
# Consider selecting on a separate validation split instead.
learning_rates = [0.0001, 0.001, 0.01]
lambdas = [0.01, 0.1, 0.5]

# Start below any attainable score and pre-bind the "best" names. Previously
# they were only assigned inside the `if`, so the final print raised NameError
# whenever no configuration scored strictly above the initial 0.
best_score = float("-inf")
best_lr = None
best_lam = None
best_model = None

for lr in learning_rates:
    for lam in lambdas:

        # Initialize a fresh embedding model and FewShotLearner per configuration
        embed_model = Embeddings(model_name='all-MiniLM-L6-v2', verbose=False)
        few_shot_learner = FewShotLearner(
            support_set=support_set.copy(),
            text_col="text",
            label_col="label",
            embedding_model=embed_model
        )

        # Fit the model (verbose=False: no per-epoch loss output)
        few_shot_learner.fit(lam=lam, lr=lr, epochs=100, early_stop=10, verbose=False)

        # Evaluate the model
        _, new_score = few_shot_learner.predict(query_set.copy(), return_accuracy=True)
        print(f"LR: {lr}, Lambda: {lam}, Score: {new_score}")

        # Keep the first configuration reaching the highest score (ties keep the earlier one)
        if new_score > best_score:
            best_score = new_score
            best_lr = lr
            best_lam = lam
            best_model = few_shot_learner
            print(f"New best hps: LR: {best_lr}, Lambda: {best_lam}, with a score of {best_score}")

print(f"Best Hyperparameters: LR: {best_lr}, Lambda: {best_lam}, with Score = {best_score}")

LR: 0.0001, Lambda: 0.01, Score: 0.866
New best hps: LR: 0.0001, Lambda: 0.01, with a score of 0.866
LR: 0.0001, Lambda: 0.1, Score: 0.8666666666666667
New best hps: LR: 0.0001, Lambda: 0.1, with a score of 0.8666666666666667
LR: 0.0001, Lambda: 0.5, Score: 0.8606666666666667
LR: 0.001, Lambda: 0.01, Score: 0.86
LR: 0.001, Lambda: 0.1, Score: 0.8633333333333333
LR: 0.001, Lambda: 0.5, Score: 0.8666666666666667
LR: 0.01, Lambda: 0.01, Score: 0.822
LR: 0.01, Lambda: 0.1, Score: 0.8533333333333334
LR: 0.01, Lambda: 0.5, Score: 0.854
Best Hyperparameters: LR: 0.0001, Lambda: 0.1, with Score = 0.8666666666666667
