# Lab 2: AG News

- **Type:** Text Classification Dataset  
- **Content:** News articles categorized into four classes: World, Sports, Business, Sci/Tech.  
- **Size:** 120,000 training samples, 7,600 test samples.  
- **Source:** News articles from more than 2,000 news sources.  
- **Task:** Classify news articles based on topic/category. 

This notebook demonstrates few-shot text classification with the `FewShotX` package (version 0.1.2). The package is distributed as a wheel file, which can be downloaded [here](https://github.com/RenatoVassallo/BSE-ForecastNLP/releases/download/0.1.2/fewshotx-0.1.2-py3-none-any.whl).

In [None]:
import pandas as pd

# Read the AG News sample from its Parquet file into a DataFrame
parquet_path = 'datasets/agnews2k.parquet'
df = pd.read_parquet(parquet_path)

# Print the number of rows per label, then display the frame itself
label_counts = df["label"].value_counts()
print(label_counts)
df

In [None]:
def split_support_query(df, support_pct, random_state=42):
    """
    Split a DataFrame into disjoint support and query sets.

    A random fraction ``support_pct`` of the rows becomes the support set and
    every remaining row becomes the query set. Rows are selected by position,
    so the two sets always partition ``df`` even when its index contains
    duplicate labels.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to split.
    support_pct : float
        Fraction of rows placed in the support set; must lie strictly
        between 0 and 1.
    random_state : optional
        Passed through as the ``random_state`` argument of pandas' ``sample``
        to make the split reproducible. Defaults to 42.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(support_set, query_set)``. The support set is in sampled order;
        the query set keeps the original row order.

    Raises
    ------
    ValueError
        If ``support_pct`` is not strictly between 0 and 1.
    """
    # Ensure percentage is within the valid range
    if not (0 < support_pct < 1):
        raise ValueError("support_pct must be a float between 0 and 1.")

    # Sample row *positions* rather than index labels: dropping by label would
    # also discard unsampled rows that share a label with a sampled row.
    row_positions = pd.Series(range(len(df)))
    support_positions = row_positions.sample(
        frac=support_pct, random_state=random_state
    ).to_numpy()

    support_set = df.iloc[support_positions]

    # Everything that was not sampled goes to the query set
    query_mask = ~row_positions.isin(support_positions).to_numpy()
    query_set = df.iloc[query_mask]

    return support_set, query_set

In [None]:
# Put 25% of the rows in the support set; the remaining rows form the query set
support_set, query_set = split_support_query(
    df,
    support_pct=0.25,
    random_state=42,
)

# Take the first rows of each partition for a quick look
support_preview = support_set.head()
query_preview = query_set.head()

# Report each partition's row count followed by its preview
print(f"Support Set: ({len(support_set)} rows):")
print(support_preview)

print(f"\nQuery Set: ({len(query_set)} rows):")
print(query_preview)

In [None]:
from FewShotX import Embeddings, FewShotLearner

# Build the embedding model (all-MiniLM-L6-v2) with verbose=False
embed_model = Embeddings(
    model_name='all-MiniLM-L6-v2',
    verbose=False,
)

# Collect the learner's arguments: a copy of the support set, the names of the
# text and label columns, and the embedding model created above
learner_config = {
    "support_set": support_set.copy(),
    "text_col": "text",
    "label_col": "label",
    "embedding_model": embed_model,
}

# Create the FewShotLearner from that configuration
few_shot_learner = FewShotLearner(**learner_config)

In [None]:
# Fit the learner on its support set using the settings below
fit_settings = dict(lam=0.1, lr=0.05, epochs=50, early_stop=10, verbose=True)
few_shot_learner.fit(**fit_settings)

In [None]:
# Predict on a copy of the query set; with return_accuracy=True the call
# returns a pair that is unpacked into the predictions (df_pred) and the
# accuracy (acc)
prediction_result = few_shot_learner.predict(query_set.copy(), return_accuracy=True)
df_pred, acc = prediction_result

print("Accuracy: ", acc)
df_pred

## Hyperparameter tuning

In [None]:
from itertools import product

# Candidate values for the grid search
learning_rates = [0.0001, 0.001, 0.01]
lambdas = [0.01, 0.1, 0.5]

# Start below any attainable score and pre-define every "best" variable, so the
# final summary cannot raise NameError when no run beats the starting value.
best_score = float("-inf")
best_lr = None
best_lam = None
best_model = None

# NOTE(review): candidates are scored on query_set, the same data used to
# report accuracy earlier in the notebook — consider a separate validation
# split for model selection.

# product() visits every (lr, lam) pair in the same order as nested loops with
# lr as the outer loop and lam as the inner loop.
for lr, lam in product(learning_rates, lambdas):

    # Initialize a fresh embedding model and FewShotLearner for this pair.
    # NOTE(review): the embedder is rebuilt on every iteration; if fit() leaves
    # it unchanged it could be created once before the loop — confirm against
    # the FewShotX implementation.
    embed_model = Embeddings(model_name='all-MiniLM-L6-v2', verbose=False)
    few_shot_learner = FewShotLearner(
        support_set=support_set.copy(),
        text_col="text",
        label_col="label",
        embedding_model=embed_model
    )

    # Fit the model with verbose=False
    few_shot_learner.fit(lam=lam, lr=lr, epochs=100, early_stop=10, verbose=False)

    # Evaluate the model on a copy of the query set
    _, new_score = few_shot_learner.predict(query_set.copy(), return_accuracy=True)
    print(f"LR: {lr}, Lambda: {lam}, Score: {new_score}")

    # Keep this configuration (and its fitted learner) if it is the best so far
    if new_score > best_score:
        best_score = new_score
        best_lr = lr
        best_lam = lam
        best_model = few_shot_learner
        print(f"New best hps: LR: {best_lr}, Lambda: {best_lam}, with a score of {best_score}")

print(f"Best Hyperparameters: LR: {best_lr}, Lambda: {best_lam}, with Score = {best_score}")