In [None]:
# 
!pip install watermark
!pip install transformers datasets torch scikit-learn pandas numpy

In [None]:
# Import of libraries

# System libraries
import random
import re
import unicodedata
import itertools

# Library for file manipulation
import pandas as pd
import numpy as np
import pandas

# Data visualization
import seaborn as sns
import matplotlib.pylab as pl
import matplotlib as m
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib import pyplot as plt

# Model transformers
import torch
import pandas as pd
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from keras.preprocessing.sequence import pad_sequences

# Import display to render rich objects (e.g. images) in Jupyter
from IPython.display import display  

# Global seaborn theme applied to the plots drawn after this cell
sns.set_theme(style='whitegrid')
# Name of a colormap; NOTE(review): not referenced by any cell visible here
palette='viridis'

# Silence every Python warning for cleaner cell output
# (this also hides deprecation notices raised by the libraries used below)
import warnings
warnings.filterwarnings("ignore")

# Print the Python version of the running kernel
from platform import python_version
print('Python version in this Jupyter Notebook:', python_version())

# Import the watermark package used by the magics below
import watermark

# (Re)load the watermark IPython extension and print the versions of the imported libraries
%reload_ext watermark
%watermark -a "Library versions" --iversions

In [None]:
# Load the train and test CSV files.
# The input directory is kept in one place so it can be changed when the
# notebook is run outside of Kaggle.
DATA_DIR = '/kaggle/input/llm-classification-finetuning'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the last rows of the training data
train_df.tail()

In [None]:
# (rows, columns) of the training data
train_df.shape

In [None]:
# Column names, non-null counts and dtypes of the training data
train_df.info()

In [None]:
# dtype of every training column
train_df.dtypes

# Pré-processamento

In [None]:
# Separator placed between the prompt and the two responses.
# Train and test must use exactly the same token (the original code used
# " [SEP] " for train but "[SEP]" for test, producing different text).
SEP_TOKEN = " [SEP] "

# Concatenate prompt, response A and response B into a single text column
train_df['combined_text'] = (
    train_df['prompt'] + SEP_TOKEN + train_df['response_a'] + SEP_TOKEN + train_df['response_b']
)

# Build the test text the same way as the training text
test_df['combined_text'] = (
    test_df['prompt'] + SEP_TOKEN + test_df['response_a'] + SEP_TOKEN + test_df['response_b']
)

# Collapse the three winner indicator columns into one column holding the
# name of the column with the largest value in each row
train_df['label'] = train_df[['winner_model_a',
                              'winner_model_b',
                              'winner_tie']].idxmax(axis=1)

# Preview the result
train_df.head(n=10)

# Feature Engineering

In [None]:
# Map each winner column name to an integer class id
label_mapping = {'winner_model_a': 0,
                 'winner_model_b': 1,
                 'winner_tie': 2}

# Target variable.
# Derive it from the winner columns instead of re-mapping train_df['label']:
# mapping the existing column a second time would turn the integer ids into
# NaN, so this form makes the cell safe to re-run.
train_df['label'] = (
    train_df[list(label_mapping)]
    .idxmax(axis=1)
    .map(label_mapping)
)

In [None]:
# Preview the training data after the label column was mapped to integers
train_df.head()

# Limpeza de dados

In [None]:
# Heatmap of the null mask: one row per sample, one column per feature

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(train_df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Viewing Missing Values in the Training Set")
plt.show()

In [None]:
# Count of null entries for every column of the training data.
# (The message previously contained invisible zero-width characters.)
print("Number of missing values per column:")
print(train_df.isnull().sum())

# Análise exploratória de dados

In [None]:
# Frequency of each value in the winner_model_a column
train_df['winner_model_a'].value_counts()

In [None]:
# Distribution of the target variable "label".
# The previous title ("Binding Free Energy (BFE)") did not describe this data.
sns.countplot(x='label', data=train_df)
plt.title("Class Distribution of the Target Variable (label)")
plt.grid(False)
plt.show()

# Same distribution as raw counts
print("\nClass Distribution:")
print(train_df['label'].value_counts())

In [None]:
# Structural overview of the training data, followed by descriptive statistics
print("Dataset Information:")
train_df.info()
print()
print("\nSummary Statistics:")
# One row per numeric column after transposing
summary_stats = train_df.describe()
summary_stats.transpose()

In [None]:
# Occurrences of each model_a value, most frequent first
value_counts2 = train_df['model_a'].value_counts(ascending=False)

# Bar chart drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=value_counts2.index, y=value_counts2.values, ax=ax)
ax.set_xlabel('Category')
ax.set_ylabel('Counts')
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Value Counts in Descending Order')
ax.grid(False)
plt.show()

In [None]:
# Count plot of the model_b column with vertical tick labels
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x='model_b', data=train_df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.grid(False)
plt.show()

# The same counts as text
model_b_counts = train_df['model_b'].value_counts()
print(model_b_counts)

In [None]:
from wordcloud import WordCloud

# Distinct classes in the 'label' column, sorted so the figures always appear
# in the same order; missing labels are skipped
labels = np.sort(train_df['label'].dropna().unique())

# Draw one word cloud per class from the prompts of that class
for label in labels:
    # .loc avoids chained indexing; dropna/astype(str) keep str.join from
    # failing on missing or non-string prompts
    class_prompts = train_df.loc[train_df['label'] == label, 'prompt'].dropna().astype(str)
    textos_prompt = ' '.join(class_prompts)

    # WordCloud.generate raises when there is no text to draw
    if not textos_prompt.strip():
        print(f'No prompt text available for class {label}; skipping word cloud.')
        continue

    # Build the word cloud for this class
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate(textos_prompt)

    # Show the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Nuvem de palavras para a classe {label}')
    plt.show()

# Vetorização

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# One TF-IDF vectorizer per text column, each limited to 500 terms
vectorizer_prompt = TfidfVectorizer(max_features=500)
vectorizer_resp_a = TfidfVectorizer(max_features=500)
vectorizer_resp_b = TfidfVectorizer(max_features=500)

# Fit each vectorizer on the training column and transform it.
# fillna('') keeps a missing text from raising inside the vectorizer.
train_prompt_features = vectorizer_prompt.fit_transform(train_df['prompt'].fillna(''))
train_resp_a_features = vectorizer_resp_a.fit_transform(train_df['response_a'].fillna(''))
train_resp_b_features = vectorizer_resp_b.fit_transform(train_df['response_b'].fillna(''))

# Stack the three training matrices side by side.
# format='csr' gives a row-sliceable matrix (the default result is COO,
# which does not support indexing).
train_combined_features = hstack(
    [train_prompt_features, train_resp_a_features, train_resp_b_features],
    format='csr',
)

# Only TRANSFORM the test columns, using the vocabularies learned on train
test_prompt_features = vectorizer_prompt.transform(test_df['prompt'].fillna(''))
test_resp_a_features = vectorizer_resp_a.transform(test_df['response_a'].fillna(''))
test_resp_b_features = vectorizer_resp_b.transform(test_df['response_b'].fillna(''))

# Stack the three test matrices in the same column order as for training
test_combined_features = hstack(
    [test_prompt_features, test_resp_a_features, test_resp_b_features],
    format='csr',
)

# Final feature matrix for the test set
final_test_features = test_combined_features

# Final feature matrix for the training set
final_train_features = train_combined_features

# Target column division

In [None]:
# Feature matrix for model training: it must come from the TRAINING rows so
# that it has one row per training label (using the test features here made
# the train/validation split fail on inconsistent sample counts).
X = train_combined_features
# The three winner indicator columns, kept as a DataFrame
y = train_df[['winner_model_a','winner_model_b','winner_tie']]

# Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the label column
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': train_df['label'],
}
X_train, X_val, y_train, y_val = train_test_split(X, train_df['label'], **split_options)

# Shape of the training feature matrix
print("Viewing rows and columns given by X train", X_train.shape)

# Shape of the training labels
print("Viewing rows and columns given y train", y_train.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels as consecutive integers.
# The encoder is fitted on the training labels and then applied to the
# VALIDATION labels (the original code referenced an undefined `y_test`).
# The results are wrapped back into pandas Series so the original row index
# of each sample is preserved.
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train), index=y_train.index, name=y_train.name)
y_val = pd.Series(le.transform(y_val), index=y_val.index, name=y_val.name)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# LightGBM hyper-parameters
lgbm_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': -1,
    'num_leaves': 31,
    'min_child_samples': 20,
    'subsample': 0.8,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # without this entry 'subsample' is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Number of classes; np.unique works for both a pandas Series and a
    # NumPy array (ndarray has no .nunique()).
    'num_class': len(np.unique(y_train)),
    # NOTE(review): requires a GPU-enabled LightGBM build; switch to 'cpu' otherwise
    'device': 'gpu',
    'random_state': 42
}

# Build the LightGBM classifier from the parameters above
model = LGBMClassifier(**lgbm_params)

# Fit on the training split
model.fit(X_train, y_train)

In [None]:
# Class predictions on the validation split (computed once; the original
# cell repeated this call)
y_pred = model.predict(X_val)

# Class predictions on the test set. The test features live in
# `final_test_features`; the name `test` used before was never defined.
y_test_pred = model.predict(final_test_features)

# Class probabilities on the validation split, used for the ROC/AUC metrics
y_proba = model.predict_proba(X_val)

In [None]:
# Predictions and class probabilities on the validation split
y_pred_val = model.predict(X_val)
y_proba_val = model.predict_proba(X_val)

# Validation predictions, one probability column per class known to the model
# (proba_0_val, proba_1_val, ...). The frame is indexed with y_val's index,
# i.e. the train_df row labels kept by the split, so every prediction can be
# matched to the row it was computed for.
val_predictions_df = pd.DataFrame(
    {
        'prediction_val': y_pred_val,
        **{f'proba_{cls}_val': y_proba_val[:, i] for i, cls in enumerate(model.classes_)},
    },
    index=y_val.index,
)

# Predictions and class probabilities on the test set
# (`final_test_features`; the name `test` used before was never defined)
y_test_pred = model.predict(final_test_features)
y_proba_test = model.predict_proba(final_test_features)

# Test predictions, indexed like test_df
test_predictions_df = pd.DataFrame(
    {
        'prediction_test': y_test_pred,
        **{f'proba_{cls}_test': y_proba_test[:, i] for i, cls in enumerate(model.classes_)},
    },
    index=test_df.index,
)

# Attach the validation predictions to the training rows by index; rows that
# were not part of the validation split get NaN. Test predictions stay in
# test_predictions_df: they describe test_df rows and must not be placed
# next to unrelated training rows.
combined_df = train_df.join(val_predictions_df)

combined_df.head(n=10)

In [None]:
# Keep only model A's name together with the validation prediction and class-0 probability
llm_columns = ["model_a", "prediction_val", "proba_0_val"]
data_llm = combined_df[llm_columns]
data_llm.head(10)

In [None]:
# Occurrences of each model_a value in data_llm, most frequent first
value_counts3 = data_llm['model_a'].value_counts(ascending=False)

# Bar chart drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=value_counts3.index, y=value_counts3.values, ax=ax)
ax.set_xlabel('Category')
ax.set_ylabel('Counts')
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Value Counts in Descending Order')
ax.grid(False)
plt.show()

In [None]:
# Histogram of the class-0 probability column of data_llm
fig, ax = plt.subplots(figsize=(15, 6))

sns.histplot(data_llm["proba_0_val"], ax=ax)

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# Compute the validation probabilities here so this cell does not depend on
# a variable left behind by another cell
y_proba = model.predict_proba(X_val)

# AUC-ROC
if len(np.unique(y_val)) > 2:  # Multiclass
    # One indicator column per class, so roc_auc_score scores each class
    # against the rest and averages the results
    lb = LabelBinarizer()
    y_val_binarized = lb.fit_transform(y_val)
    auc_roc = roc_auc_score(y_val_binarized, y_proba, multi_class='ovr')
    print(f"AUC-ROC (Multiclasse - OVR): {auc_roc:.4f}")
else:  # Binary
    # Score with the probability of the second class
    auc_roc = roc_auc_score(y_val, y_proba[:, 1])
    print(f"AUC-ROC (Binário): {auc_roc:.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

# Class probabilities on the validation split
y_proba = model.predict_proba(X_val)

plt.figure(figsize=(10, 7))

if len(np.unique(y_val)) > 2:  # Multiclass
    # One ROC curve per class (that class against all the others)
    lb = LabelBinarizer()
    y_val_binarized = lb.fit_transform(y_val)

    # assumes column i of y_proba corresponds to lb.classes_[i], i.e. every
    # class known to the model also occurs in y_val — TODO confirm
    for i, class_name in enumerate(lb.classes_):
        fpr, tpr, _ = roc_curve(y_val_binarized[:, i], y_proba[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'Classe {class_name} (AUC = {roc_auc:.2f})')

    roc_title = 'Curva ROC Multiclasse'
else:  # Binary
    # Single curve built from the probability of the second class
    fpr, tpr, _ = roc_curve(y_val, y_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Curva ROC (AUC = {roc_auc:.2f})')
    roc_title = 'Curva ROC'

# Shared chart decoration
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal reference line
plt.xlabel('Taxa de Falsos Positivos (FPR)')
plt.ylabel('Taxa de Verdadeiros Positivos (TPR)')
# The title is set once, after the branch, so the multiclass title is no
# longer overwritten by the generic one
plt.title(roc_title)
plt.legend(loc="lower right")
plt.grid()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Distinct class ids present in the validation labels
unique_labels = np.unique(y_val)
class_labels = unique_labels.tolist()

# Readable axis labels: invert label_mapping (name -> id) into id -> name.
# The labels are integer ids, so the previous filter that compared them with
# the string names never matched and the axes always showed the raw ids.
id_to_name = {class_id: name for name, class_id in label_mapping.items()}
tick_labels = [id_to_name.get(class_id, str(class_id)) for class_id in class_labels]

# Predict on the validation split
y_pred = model.predict(X_val)

# Confusion matrix with rows/columns in the order of class_labels
cm = confusion_matrix(y_val, y_pred, labels=class_labels)

# Draw the confusion matrix with seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel('Predição')
plt.ylabel('Verdadeiro')
plt.title('Matriz de Confusão')
plt.show()

In [None]:
# Fraction of validation rows whose predicted class matches the true class
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Per-class precision, recall and F1 on the validation split
report = classification_report(y_true=y_val, y_pred=y_pred)

print('Classification Report:')
print(report)

In [None]:
# Predict on the test features here so this cell does not depend on a
# variable left behind by another cell
y_test_pred = model.predict(final_test_features)

# Identify each row by the dataset's own 'id' column when it exists;
# otherwise fall back to the positional index used before
if 'id' in test_df.columns:
    test_ids = test_df['id'].to_numpy()
else:
    test_ids = test_df.index

# Test-set predictions
output = pd.DataFrame({'id': test_ids,
                       'prediction': y_test_pred})

# Save the predictions (not the model) to CSV
output.to_csv('predictions_LGBMClassifier.csv', index=False)