In [None]:
pip install watermark transformers datasets torch scikit-learn pandas numpy matplotlib seaborn plotly lightgbm wordcloud


In [None]:
# System libraries
import random
import re
import unicodedata
import itertools

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pylab as pl
import plotly.express as px
from wordcloud import WordCloud

# Model transformers and preprocessing
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from keras.preprocessing.sequence import pad_sequences

# Suppress warning messages
import warnings
warnings.filterwarnings("ignore")

# Python version
# Print the version of the interpreter backing this kernel.
from platform import python_version
print('Python version in this Jupyter Notebook:', python_version())

# Load library versions
# (Re)load the watermark IPython extension and invoke its %watermark magic.
# NOTE(review): --iversions presumably lists the versions of the modules
# imported above, and -a is presumably watermark's "author" option, so
# "Library versions" would be printed as an author name rather than a
# heading; confirm against the watermark documentation.
from watermark import watermark
%reload_ext watermark
%watermark -a "Library versions" --iversions


In [None]:
# Read the competition's train and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/llm-classification-finetuning/train.csv',
        '/kaggle/input/llm-classification-finetuning/test.csv',
    )
)

# Quick look at the training data: first rows, last rows, (rows, columns),
# followed by the per-column dtype / non-null summary
display(train_df.head())
display(train_df.tail())
display(train_df.shape)
train_df.info()


In [None]:
# Separator inserted between the prompt and each response. Train and test
# must use the exact same string so that any model fitted on the training
# text sees identically formatted text at inference time.
TEXT_SEPARATOR = " [SEP] "

# Combine text columns: prompt, response A and response B become one string
for frame in (train_df, test_df):
    frame['combined_text'] = (
        frame['prompt']
        + TEXT_SEPARATOR + frame['response_a']
        + TEXT_SEPARATOR + frame['response_b']
    )

# Map labels to integers
label_mapping = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}

# Create labels: for every row take the name of the winner column holding the
# largest value, then convert that name to its integer class id.
# (presumably the three winner columns are one-hot; verify against the data)
train_df['label'] = (
    train_df[['winner_model_a', 'winner_model_b', 'winner_tie']]
    .idxmax(axis=1)
    .map(label_mapping)
)


In [None]:
# Visual overview of missing data: each cell of the heatmap is one
# (row, column) entry of the boolean null mask of the training frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(train_df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Viewing Missing Values in the Training Set")
plt.show()

# Exact null counts per column, complementing the visual overview
print("Number of missing values per column:")
print(train_df.isnull().sum())


In [None]:
# Bar chart with the number of training rows per target class.
# Class ids follow label_mapping: 0 = model A wins, 1 = model B wins, 2 = tie.
fig, ax = plt.subplots()
sns.countplot(x='label', data=train_df, ax=ax)
ax.set_title("Class Distribution of Winner Labels")
ax.set_xlabel("label (0 = model A wins, 1 = model B wins, 2 = tie)")
ax.set_ylabel("count")
ax.grid(False)
plt.show()

# Same information as exact counts
print("\nClass Distribution:")
print(train_df['label'].value_counts())


In [None]:
# Draw one word cloud per target class, built from every prompt of that class
labels = train_df['label'].unique()

for label in labels:
    # Concatenate all prompts belonging to the current class into one text
    class_prompts = train_df.loc[train_df['label'] == label, 'prompt']
    prompt_corpus = ' '.join(class_prompts)

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate(prompt_corpus)

    # Render the cloud as an image without axes
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Word Cloud for Class {label}')
    plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# One independent TF-IDF vectorizer (vocabulary capped at 500 terms) per text
# field, so that prompt, response A and response B get separate feature blocks.
vectorizer_prompt, vectorizer_resp_a, vectorizer_resp_b = (
    TfidfVectorizer(max_features=500) for _ in range(3)
)

# Each vectorizer is fitted on the training column only and then reused,
# unchanged, to transform the matching test column.
train_prompt_features, test_prompt_features = (
    vectorizer_prompt.fit_transform(train_df['prompt']),
    vectorizer_prompt.transform(test_df['prompt']),
)
train_resp_a_features, test_resp_a_features = (
    vectorizer_resp_a.fit_transform(train_df['response_a']),
    vectorizer_resp_a.transform(test_df['response_a']),
)
train_resp_b_features, test_resp_b_features = (
    vectorizer_resp_b.fit_transform(train_df['response_b']),
    vectorizer_resp_b.transform(test_df['response_b']),
)

# Column-wise concatenation of the three sparse blocks: prompt | resp A | resp B
train_combined_features = hstack(
    [train_prompt_features, train_resp_a_features, train_resp_b_features]
)
test_combined_features = hstack(
    [test_prompt_features, test_resp_a_features, test_resp_b_features]
)


In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and integer target used for modelling
X, y = train_combined_features, train_df['label']

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameters for the multiclass LightGBM classifier
lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    objective='multiclass',
    # number of distinct class ids present in the training target
    num_class=np.unique(y_train).size,
    random_state=42,
)

# Build the classifier from the parameter dict and fit it on the training split
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)


In [None]:
# Predict class ids for the held-out validation rows
y_pred = model.predict(X_val)

# Score the predictions: overall accuracy plus per-class precision/recall/F1
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("Classification Report:\n", val_report)
