Published on October 17, 2024. By Marília Prata, mpwolke.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

#Avoid Plotly issues
import plotly.io as pio
pio.renderers.default = 'iframe'


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT7aRNw5Rb02Ug26mJHLeggSp2-8UP0nAOzHg&s)NimbleBox Ai

## Competition Citation

```bibtex
@misc{llm-classification-finetuning,
    author = {Wei-lin Chiang, Lianmin Zheng, Lisa Dunlap, Joseph E. Gonzalez, Ion Stoica, Paul Mooney, Sohier Dane, Addison Howard, Nate Keating},
    title = {LLM Classification Finetuning},
    year = {2024},
    howpublished = {\url{https://kaggle.com/competitions/llm-classification-finetuning}},
    note = {Kaggle}
}
```

In [None]:
# Read the competition training data and preview its final rows.
train = pd.read_csv(
    '/kaggle/input/llm-classification-finetuning/train.csv'
)
train.tail()

## Battles Model A and Model B

In [None]:
#https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=B_PYA7oVyaHO

import plotly.io as pio
pio.renderers.default = 'iframe'

# Bar chart of the value counts of the `winner_model_a` column.
# `update_layout` returns the same figure, so the call is chained.
fig = px.bar(
    train["winner_model_a"].value_counts(),
    title="Counts of Battle Outcomes A",
    text_auto=True,
    height=400,
    color_discrete_sequence=['crimson'],
).update_layout(
    xaxis_title="Battle Outcome A",
    yaxis_title="Count",
    showlegend=False,
)
fig

In [None]:
#https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=B_PYA7oVyaHO

import plotly.io as pio
pio.renderers.default = 'iframe'

# Bar chart of the value counts of the `winner_model_b` column.
# `update_layout` returns the same figure, so the call is chained.
fig = px.bar(
    train["winner_model_b"].value_counts(),
    title="Counts of Battle Outcomes B",
    text_auto=True,
    height=400,
).update_layout(
    xaxis_title="Battle Outcome B",
    yaxis_title="Count",
    showlegend=False,
)
fig

In [None]:
#https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=B_PYA7oVyaHO

import plotly.io as pio
pio.renderers.default = 'iframe'

# Stack both sides of every battle into one series so each model is counted
# whether it appeared as `model_a` or as `model_b`.
fig = px.bar(
    pd.concat([train["model_a"], train["model_b"]]).value_counts(),
    title="Battle Count for Each Model",
    text_auto=True,
).update_layout(
    xaxis_title="model",
    yaxis_title="Battle Count",
    height=400,
    showlegend=False,
)
fig

In [None]:
#https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=B_PYA7oVyaHO

import plotly.io as pio
pio.renderers.default = 'iframe'

def _battle_count_matrix(train):
    """Return a square, symmetric table of battle counts between models.

    Rows and columns are both labelled by model name. Entry (i, j) is the
    number of rows where the two models met, regardless of which one was
    `model_a` and which was `model_b`. A model paired with itself is counted
    twice on the diagonal, because the table is added to its own transpose.
    """
    ptbl = pd.pivot_table(train, index="model_a", columns="model_b", aggfunc="size",
                          fill_value=0)
    # A model may appear on only one side of the battles. Without squaring the
    # table first, `ptbl + ptbl.T` aligns on mismatched labels and produces NaN.
    models = ptbl.index.union(ptbl.columns).tolist()
    ptbl = ptbl.reindex(index=models, columns=models, fill_value=0)
    return ptbl + ptbl.T


def visualize_battle_count(train, title, show_num_models=30):
    """Build a heatmap of pairwise battle counts between models.

    Parameters
    ----------
    train : pandas.DataFrame
        Battle records with `model_a` and `model_b` columns.
    title : str
        Title shown on the figure.
    show_num_models : int, default 30
        Keep only this many models, ranked by total battle count (descending).

    Returns
    -------
    The figure produced by `px.imshow`, with layout and hover text customised.
    """
    battle_counts = _battle_count_matrix(train)
    # Rank models by their total number of battles and keep the busiest ones.
    ordering = battle_counts.sum().sort_values(ascending=False).index
    ordering = ordering[:show_num_models]
    fig = px.imshow(battle_counts.loc[ordering, ordering],
                    title=title, text_auto=True)
    fig.update_layout(xaxis_title="Model B",
                      yaxis_title="Model A",
                      xaxis_side="top", height=800, width=800,
                      title_y=0.07, title_x=0.5,
                      font=dict(size=10))
    fig.update_traces(hovertemplate=
                      "Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig

# Heatmap of pairwise battle counts, limited to the 30 models with the most battles.
fig = visualize_battle_count(
    train,
    "Battle Count of Each Combination of Models",
    show_num_models=30,
)
fig

In [None]:
# Keep only battles with a decisive outcome.
# Ties are flagged in the `winner_tie` column, which is not a string column
# (hence the earlier "Can only use .str accessor with string values!" error).
# The previous filter searched the model names in `model_a` for the text
# "winner_tie", which never matches, so no rows were removed.
battles_no_ties = train[train["winner_tie"] == 0]

In [None]:
import plotly.io as pio
pio.renderers.default = 'iframe'

# Same pairwise heatmap, drawn from the `battles_no_ties` subset.
visualize_battle_count(
    battles_no_ties,
    title="Battle Count for Each Combination of Models (without Ties)",
)

In [None]:
!pip3 install umap-learn

## Install Cluestar

In [None]:
! pip install -q -U sentence-transformers
! pip install -q -U watermark
! pip install -q -U cluestar

from pandas import DataFrame
from plotly.express import scatter
#from umap import UMAP
from umap import umap_ as UMAP

import umap
from sentence_transformers import SentenceTransformer
from cluestar import plot_text

## Turn on the GPU accelerator, otherwise: `AssertionError: Torch not compiled with CUDA enabled`

In [None]:
#Binga https://www.kaggle.com/code/phanisrikanth/daigt-cluster-explore-7-prompts-dataset

import torch

# Use the GPU when one is available and fall back to CPU otherwise, so the cell
# no longer fails with "Torch not compiled with CUDA enabled" on CPU sessions.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sentence-transformer model used to turn text into embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
# model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
# model = SentenceTransformer('/kaggle/input/thenlper-gte-large/').to("cuda:0") # use this if needed.

# Encode the `model_a` column of the training data, passed as a plain list of
# strings rather than a pandas Series.
train_embeddings = model.encode(train['model_a'].tolist())

In [None]:
#Binga https://www.kaggle.com/code/phanisrikanth/daigt-cluster-explore-7-prompts-dataset

# Display the dimensions of the embedding array returned by `model.encode`.
train_embeddings.shape

In [None]:
#Binga https://www.kaggle.com/code/phanisrikanth/daigt-cluster-explore-7-prompts-dataset

# Fit UMAP (fixed random_state=42) on the `model_a` embeddings and keep the transformed coordinates.
# NOTE(review): this rebinds `model`, which previously held the SentenceTransformer.
model = umap.UMAP(random_state=42)
train_umap_embeddings = model.fit_transform(train_embeddings)

In [None]:
#Binga https://www.kaggle.com/code/phanisrikanth/daigt-cluster-explore-7-prompts-dataset

# Plot the UMAP coordinates with cluestar, using the `model_a` values as the text for each point.
# NOTE(review): points are coloured by the stringified `response_a` text. The earlier remark about
# student vs. AI-generated essays described the source notebook's data and does not apply here.
# Presumably a low-cardinality column (e.g. an outcome flag) was intended for colouring; confirm.
plot_text(train_umap_embeddings, train['model_a'], color_array=list(train['response_a'].astype(str)))

## Acknowledgements

https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=B_PYA7oVyaHO


Binga https://www.kaggle.com/code/phanisrikanth/daigt-cluster-explore-7-prompts-dataset


Mike Delong https://www.kaggle.com/code/mikedelong/visualize-c-instructionst

mpwolke https://www.kaggle.com/code/mpwolke/formulas-maths-json-tsne-umap/notebook

mpwolke https://www.kaggle.com/code/mpwolke/arc-lucky-cluestar