In [6]:
from pathlib import Path
from typing import List, Tuple

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2
# Convenience: `some_path.ls()` returns the directory entries as a list.
Path.ls = lambda self: list(self.iterdir())

# Setting up Cohere
# The API key lives in a local file named "cohere_api_key"; surrounding
# whitespace (e.g. a trailing newline) is stripped before use.
cohere_api_key = Path("cohere_api_key").read_text().strip()

import cohere
from cohere.classify import Example

co = cohere.Client(cohere_api_key)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
from tqdm.notebook import tqdm
from collections import abc

In [18]:
# Load the tagged dataset and discard every row that contains a missing value.
df = pd.read_csv("../data/raw/Tagged_Data.csv").dropna()

In [37]:
class CohereLearnBase:
    """Shared helpers for few-shot classification experiments with Cohere.

    Subclasses are expected to set ``train_dfs``, ``test_df``, ``x_label``,
    ``y_label``, ``train_counts`` and ``test_count`` before :meth:`predict`
    or ``repr()`` is used. If the instance has a ``co`` attribute it is used
    as the Cohere client; otherwise the notebook-level ``co`` is used.
    """

    def make_examples(self, df: pd.DataFrame):
        """Turn every row of ``df`` into a Cohere ``Example(text, label)``.

        The text and label columns are ``self.x_label`` and ``self.y_label``
        when the instance defines them, and "Name" / "Key" otherwise.

        Args:
            df: Frame holding at least the text column and the label column.

        Returns:
            A list of ``Example`` objects, one per row, in row order.
        """
        text_col = getattr(self, "x_label", "Name")
        label_col = getattr(self, "y_label", "Key")
        # Iterating the two columns directly avoids building a Series per row,
        # which is what made the previous ``iterrows`` version slow.
        return [Example(text, lbl) for text, lbl in zip(df[text_col], df[label_col])]

    @staticmethod
    def parse_cohere_classification(classification):
        """Extract the predicted label and its best confidence.

        Args:
            classification: Object exposing ``prediction`` and an iterable
                ``confidence`` whose items each expose ``confidence``.

        Returns:
            ``(label, score)`` where ``score`` is the largest confidence
            value, or ``-1`` when ``classification.confidence`` is empty.
        """
        score = max((c.confidence for c in classification.confidence), default=-1)
        return classification.prediction, score

    @staticmethod
    def parse_cohere_response(response, return_scores: bool = False):
        """Collect the predictions of every classification in ``response``.

        Args:
            response: Object exposing an iterable ``classifications``.
            return_scores: When true, also return the per-item best
                confidence scores.

        Returns:
            The list of predicted labels, or ``(labels, scores)`` when
            ``return_scores`` is true.
        """
        lbls, scores = [], []
        for classification in response.classifications:
            # Referenced through the class: as a static method there is no
            # ``self``, and a bare name would not resolve inside the class.
            lbl, score = CohereLearnBase.parse_cohere_classification(classification)
            lbls.append(lbl)
            scores.append(score)
        if return_scores:
            return lbls, scores
        return lbls

    def predict(self):
        """Classify the test texts once per training frame.

        Returns:
            A list with one ``classify`` response per frame in
            ``self.train_dfs``, in the same order (empty if there are none).
        """
        client = getattr(self, "co", None)
        if client is None:
            # Fall back to the client created at notebook level.
            client = co
        # The inputs do not depend on the training frame, so build them once.
        inputs = self.test_df[self.x_label].tolist()
        responses = []
        for trn_df in tqdm(self.train_dfs):
            examples = self.make_examples(trn_df)
            # Append inside the loop so every training size keeps its response.
            responses.append(client.classify(inputs=inputs, examples=examples))
        return responses

    def __repr__(self):
        return f"Train Counts: {self.train_counts}, Test Count: {self.test_count}, Text Column(x_label): {self.x_label}, Target Column(y_label): {self.y_label}"


class CohereFewShotClassify(CohereLearnBase):
    """Few-shot experiment: one fixed test set and several training-set sizes.

    For every label a test sample of ``test_count`` rows is drawn, and for
    every entry ``n`` of ``train_counts`` a training frame with ``n`` rows
    per label is drawn from the rows that are not in the test sample.
    Sampling uses a fixed ``random_state`` (37).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        co: cohere.Client,
        train_counts: List[int] = [4, 8, 16, 32],
        test_count: int = 64,
        x_label="Name",
        y_label="Key",
    ):
        """Store the configuration and build the train/test frames.

        Args:
            df: Source data containing the ``x_label`` and ``y_label`` columns.
            co: Cohere client, kept on the instance as ``self.co``.
            train_counts: Number of training rows per label, one entry per
                training frame to build.
            test_count: Number of test rows per label.
            x_label: Name of the text column.
            y_label: Name of the label column.

        Raises:
            ValueError: If a label does not have enough rows (see
                :meth:`make_train_dataframes`).
        """
        self.df = df
        self.co = co
        # Copy so the instance never aliases the shared default list (or a
        # list the caller may mutate later).
        self.train_counts = list(train_counts)
        self.test_count = test_count
        self.x_label = x_label
        self.y_label = y_label
        self.labels = list(self.df[y_label].unique())
        self.random_state = 37
        self.train_dfs, self.test_df = self.make_train_dataframes(
            self.train_counts, self.test_count, self.labels
        )

    def __repr__(self):
        return f"CohereFewShotClassify({super().__repr__()})"

    def make_train_dataframes(
        self, train_counts: List[int], test_count: int, labels: List[str]
    ) -> Tuple[List[pd.DataFrame], pd.DataFrame]:
        """Build one training frame per count plus the shared test frame.

        Args:
            train_counts: Rows per label for each training frame.
            test_count: Rows per label in the test frame.
            labels: Label values (from the ``self.y_label`` column) to include.

        Returns:
            ``(train_dfs, test_df)``: ``train_dfs[i]`` holds
            ``train_counts[i]`` rows per label; ``test_df`` holds
            ``test_count`` rows per label. Both are concatenated in the
            order of ``labels``.

        Raises:
            ValueError: If a label has ``test_count`` rows or fewer, or if too
                few rows remain after the test split for the largest
                requested training size.
        """
        largest_n = max(train_counts, default=0)
        test_cuts = []
        leftovers = []
        # The test split depends only on the label (fixed random_state), so it
        # is computed once per label rather than once per training size.
        for lbl in labels:
            class_cut = self.df[self.df[self.y_label] == lbl]
            if len(class_cut) <= test_count:
                raise ValueError(f"For label {lbl} insufficient number of samples")
            test_cut = class_cut.sample(test_count, random_state=self.random_state)
            test_cuts.append(test_cut)
            # Compare whole rows by value so that exact duplicates of a test
            # row are excluded from the training pool as well.
            in_test = class_cut.apply(tuple, 1).isin(test_cut.apply(tuple, 1))
            left_over = class_cut[~in_test]
            if len(left_over) < largest_n:
                raise ValueError(
                    f"For label {lbl} only {len(left_over)} rows remain after the "
                    f"test split, but {largest_n} training samples were requested"
                )
            leftovers.append(left_over)

        train_dfs = [
            pd.concat(
                [lo.sample(n, random_state=self.random_state) for lo in leftovers]
            )
            for n in train_counts
        ]
        test_df = pd.concat(test_cuts)
        return train_dfs, test_df


# Build the experiment: 64 held-out test rows per label and four training-set
# sizes (4, 8, 16 and 32 rows per label).
cohere_clf = CohereFewShotClassify(
    df=df,
    co=co,
    train_counts=[4, 8, 16, 32],
    test_count=64,
    x_label="Name",
    y_label="Key",
)

# Sanity checks on the generated splits.
assert len(cohere_clf.test_df) == cohere_clf.test_count * len(cohere_clf.labels)
assert len(cohere_clf.train_dfs) == len(cohere_clf.train_counts)
assert len(cohere_clf.train_dfs[0]) == cohere_clf.train_counts[0] * len(cohere_clf.labels)

# Last expression of the cell: display the configuration via __repr__.
cohere_clf

CohereFewShotClassify(Train Counts: [4, 8, 16, 32], Test Count: 64, Text Column(x_label): Name, Target Column(y_label): Key)