In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data READ

In [None]:
data = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")

In [None]:
data.head(5)

In [None]:
data.describe().T

# Data UNDERSTAND

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- Dataset overview: size, columns, class balance and missing values ---
print(f"Total Samples:{len(data)}")
print(f"Columns:{data.columns.tolist()}")
print("\n\nClass Distribution:")
print(f"Winner A: {(data['winner_model_a']==1).sum()}")
print(f"Winner B: {(data['winner_model_b']==1).sum()}")
print(f"Winner Tie: {(data['winner_tie']==1).sum()}")
print("\n\nCheck for Missing Values:")
print(data.isnull().sum())
print("\n\nCalculating Average Length:")
# Character counts of the prompt and of each response.
data['prompt_length'] = data['prompt'].str.len()
data['response_a_length'] = data['response_a'].str.len()
data['response_b_length'] = data['response_b'].str.len()

# (column, title) pairs drive both the printed averages and the histograms,
# so a panel can no longer show one column under another column's title
# (the third panel used to plot response_a_length as "Response B").
length_panels = [
    ('prompt_length', 'Prompt Length Distribution'),
    ('response_a_length', 'Response A Length Distribution'),
    ('response_b_length', 'Response B Length Distribution'),
]

# Actually report the averages announced by the header printed above.
print(data[[column for column, _ in length_panels]].mean())

plt.figure(figsize=(15,5))

for position, (column, title) in enumerate(length_panels, start=1):
    plt.subplot(1, len(length_panels), position)
    sns.histplot(data[column],bins=30)
    plt.title(title)

plt.tight_layout()
plt.show()



# generate EMBEDDINGS

In [None]:
!pip install transformers -q

In [None]:
from transformers import AutoTokenizer, AutoModel


def get_embeddings(texts):
    """Embed each text with ``bert-base-uncased`` using mean pooling.

    Every text is tokenized on its own (truncated to at most 512 tokens),
    passed through the encoder, and the last hidden state is averaged over
    the token axis.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        The per-text vectors stacked row-wise with ``np.vstack`` (one row
        per input text).
    """
    # Local imports keep the function independent of other cells. The original
    # used ``tqdm`` without importing it anywhere, which raised NameError.
    import torch
    try:
        from tqdm.auto import tqdm
    except ImportError:
        # Progress display is optional; fall back to plain iteration.
        def tqdm(iterable):
            return iterable

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    model.eval()  # make inference mode explicit (dropout off)

    embeddings = []

    # Inference only: without no_grad() the pooled output still requires grad
    # and calling .numpy() on it raises RuntimeError; it also avoids building
    # an autograd graph for every text.
    with torch.no_grad():
        for text in tqdm(texts):
            inputs = tokenizer(text,return_tensors="pt",padding=True,truncation=True,max_length=512)
            outputs = model(**inputs)
            # Mean over the token axis, then drop the batch axis of size one.
            embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.vstack(embeddings)

In [None]:
def prepare_features(data):
    """Build the feature matrix and the targets used for model training.

    The prompt and both responses are embedded separately with
    ``get_embeddings`` and the three blocks are placed side by side; the
    targets are the three winner indicator columns.

    Returns:
        Tuple ``(X, y)`` of stacked embeddings and stacked winner columns.
    """
    # Embed the text columns in a fixed order: prompt, response A, response B.
    text_columns = ('prompt', 'response_a', 'response_b')
    embedded_blocks = [get_embeddings(data[column]) for column in text_columns]
    features = np.hstack(embedded_blocks)

    # One target column per outcome, in the same order as the CSV flags.
    label_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    targets = np.column_stack([data[column] for column in label_columns])

    return features, targets
    

In [None]:
data

# Using XGBoost Classifier

In [None]:
df_train = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")

In [None]:
df_train.head()

In [None]:
df_train.describe(include=object)

In [None]:
df_train.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the union of both columns so the same model name maps to
# the same integer in model_a and model_b. Fitting on model_a alone makes
# transform() raise ValueError for any model that only ever appears as model_b.
encoder = LabelEncoder()
encoder.fit(pd.concat([df_train["model_a"], df_train["model_b"]], ignore_index=True))

df_train["model_a"] = encoder.transform(df_train["model_a"])
df_train["model_b"] = encoder.transform(df_train["model_b"])

In [None]:
import regex as re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def preprocessing(x:str):
    """Normalise a raw text for bag-of-words features.

    Steps, in order: lowercase; delete every character that is not an ASCII
    letter, a digit, whitespace or ``+``; delete each ``http`` together with
    the non-whitespace run that follows it; drop the words found in
    ``stop_words``; stem the remaining words with ``stemmer``.

    Args:
        x: Text to clean. Non-string values (for example ``None`` or a NaN
            coming from a missing cell) are treated as empty text.

    Returns:
        The cleaned words joined by single spaces.
    """
    # Missing cells reach .apply() as None/NaN; .lower() would crash on them.
    if not isinstance(x, str):
        return ""

    text = x.lower()
    # Raw strings: "\s" and "\S" inside a normal string literal are invalid
    # escape sequences (deprecated, a SyntaxWarning on recent Python versions).
    # The pattern text itself is unchanged.
    text = re.sub(r"[^a-zA-Z0-9\s+]","",text)
    # Runs after punctuation stripping, so a URL has already collapsed into a
    # single "http..." token by the time it is removed.
    text = re.sub(r"http\S+","",text)
    removed_stopwords = [word for word in text.split() if word not in stop_words]
    stemmed_word = [stemmer.stem(word) for word in removed_stopwords]
    total_text = " ".join(stemmed_word)
    return total_text
    
    

In [None]:
preprocessing("Hi, I am Gua&*#hda9m https://www.  thoughtfully")

In [None]:
# Clean the prompt column; the function is passed directly, no lambda wrapper needed.
df_train['prompt'] = df_train['prompt'].apply(preprocessing)

In [None]:
# Clean the response_a column; the function is passed directly, no lambda wrapper needed.
df_train['response_a'] = df_train['response_a'].apply(preprocessing)


In [None]:
# Clean the response_b column; the function is passed directly, no lambda wrapper needed.
df_train['response_b'] = df_train['response_b'].apply(preprocessing)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words baseline: TF-IDF with the vectorizer's default settings.
vectorizer = TfidfVectorizer()

# One tagged string per row: cleaned prompt followed by both cleaned responses.
df_train["whole_text"] = "prompt: "+ df_train.prompt + "  response_a: "+df_train.response_a+" response_b: "+df_train.response_b
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split performed later in the notebook, so its vocabulary and IDF statistics
# also see the held-out rows. Consider fitting on the training split only.
X = vectorizer.fit_transform(df_train["whole_text"])

In [None]:
X

In [None]:
df_train.columns

In [None]:
df_train.columns

In [None]:
combined_labels = df_train[["winner_model_a","winner_model_b","winner_tie"]].values

In [None]:
y = np.argmax(combined_labels,axis=1)

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)



In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb.fit(X_train,y_train)

In [None]:
# xgb.predict(X_test)

In [None]:
y_pred = xgb.predict(X_test)

In [None]:
y_pred

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

### This is poor performance for XGBClassifier on TF-IDF features, so tree-based models on these features do not look worth pursuing.

### Let's take this into a transformer-based approach instead: fine-tuning a pretrained sequence classifier.

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score


# rfc = RandomForestClassifier()
# rfc.fit(X_train,y_train)

# y_pred = rfc.predict(X_test)
# accuracy_score(y_test,y_pred)


In [None]:
df_train

In [None]:
df_train["whole_text"] = "prompt: "+ df_train.prompt + "  response_a: "+df_train.response_a+" response_b: "+df_train.response_b

In [None]:
df_train.drop(["prompt","response_a","response_b"],axis=1,inplace=True)

In [None]:
df_train

In [None]:
df_combined = df_train[["winner_model_a","winner_model_b","winner_tie"]].values
output = np.argmax(df_combined,axis=1)

In [None]:
df_train["labels"] = output

In [None]:
final_df = df_train.drop(['id','model_a','model_b','winner_model_a','winner_model_b','winner_tie'],axis=1)

In [None]:
final_df

In [None]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(final_df) 

In [None]:
train_ds

In [None]:
model_name = "microsoft/deberta-v3-small"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The target has three classes (model A wins, model B wins, tie -> labels 0/1/2).
# The default classification head has only two outputs, which makes the loss
# fail as soon as a label equal to 2 is seen.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
model

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def tokz(x):
    """Tokenize the ``whole_text`` field of a batch for the classifier.

    Args:
        x: Batch (mapping) that contains a ``whole_text`` entry.

    Returns:
        The tokenizer output for that field, truncated to at most 512 tokens.
    """
    # Prompt plus two responses can be arbitrarily long; cap the sequence
    # length explicitly. 512 is the same cap this notebook uses for BERT in
    # get_embeddings - TODO confirm it is the limit wanted for this model.
    return tokenizer(x["whole_text"], truncation=True, max_length=512)

In [None]:
tok_data = train_ds.map(tokz,batched=True)

In [None]:
dds = tok_data.train_test_split(0.25,seed=42)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Fine-tuning configuration passed to the Trainer.
args= TrainingArguments(
    output_dir="./results",          # output directory for the run
    eval_strategy="epoch",           # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,              # a single pass over the training split
    logging_dir="./logs",
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    fp16=True                        # NOTE(review): mixed precision presumably needs a GPU runtime - confirm
)

In [None]:
# Tokenized examples have different lengths. Without a tokenizer or an explicit
# collator the Trainer uses its default collator, which does not pad and
# therefore cannot stack such examples into a batch. Pad every batch
# dynamically to its longest sequence instead.
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [None]:
trainer.train()
trainer.evaluate() 