In [None]:
# Install sentence-transformers
# !pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import threading
from sklearn.metrics.pairwise import cosine_similarity

# Check available GPUs
# get_semantic_overlap() below places one model on cuda:0 and another on
# cuda:1, so abort early when fewer than two CUDA devices are visible.
num_gpus = torch.cuda.device_count()
print(f"Available GPUs: {num_gpus}")
if num_gpus < 2:
    raise RuntimeError("This script requires at least 2 GPUs")

def get_semantic_overlap(df):
    """Add prompt-vs-response embedding similarity columns to ``df``.

    The ``prompt``, ``response_a`` and ``response_b`` columns are embedded
    with the sentence-transformer stored at ``model_path``. Prompts are
    encoded on GPU 0 and ``response_a`` on GPU 1 in parallel threads;
    ``response_b`` is encoded on whichever of those two GPUs is released
    first, reusing the model that is already loaded there.

    Args:
        df: DataFrame providing the ``prompt``, ``response_a`` and
            ``response_b`` columns. It is modified in place.

    Returns:
        The same ``df`` object with two added columns:
        ``a_semantic_overlap`` (cosine similarity of prompt and response_a
        embeddings, row by row) and ``b_semantic_overlap`` (same for
        response_b).

    Raises:
        Exception: the first error raised inside any encoding worker is
            re-raised in the calling thread once all workers have stopped.
    """
    model_path = "/kaggle/input/sentencetransformersallmpnetbasev2/all-mpnet-base-v2"

    # Select the device through the constructor rather than a later .to():
    # the constructor argument is the documented way to pin a model, and
    # encode() then runs on that device without any further bookkeeping.
    models = {
        0: SentenceTransformer(model_path, device='cuda:0'),
        1: SentenceTransformer(model_path, device='cuda:1'),
    }

    embeddings = {}   # column name -> embedding matrix
    errors = []       # exceptions captured inside worker threads
    freed_gpus = []   # GPU indices in the order their first job finished
    lock = threading.Lock()
    gpu_freed = threading.Event()

    def encode_column(column, gpu, batch_size, label):
        """Encode one text column with the model living on ``gpu``."""
        print(f"Generating {label} embeddings on GPU {gpu}...")
        vectors = models[gpu].encode(
            df[column].tolist(), batch_size=batch_size, show_progress_bar=True
        )
        print(f"{label[0].upper() + label[1:]} embeddings completed on GPU {gpu}")
        return vectors

    def first_stage(column, gpu, batch_size, label):
        """Encode ``column`` and then hand the GPU over to the waiting job."""
        try:
            vectors = encode_column(column, gpu, batch_size, label)
            with lock:
                embeddings[column] = vectors
        except Exception as exc:
            with lock:
                errors.append(exc)
        finally:
            # Always signal, even on failure, so the response_b worker can
            # never wait forever.
            with lock:
                freed_gpus.append(gpu)
            gpu_freed.set()

    def second_stage():
        """Encode response_b on the first GPU that becomes available."""
        gpu_freed.wait()
        with lock:
            if errors:
                # A first-stage job already failed; the whole call will raise.
                return
            gpu = freed_gpus[0]
        try:
            vectors = encode_column('response_b', gpu, 64, 'response B')
            with lock:
                embeddings['response_b'] = vectors
        except Exception as exc:
            with lock:
                errors.append(exc)

    def rowwise_cosine(left, right):
        """Cosine similarity between matching rows of two embedding matrices."""
        left = np.asarray(left)
        right = np.asarray(right)
        if left.size == 0 or right.size == 0:
            return np.zeros(0, dtype=float)
        dots = np.einsum('ij,ij->i', left, right)
        norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
        # A zero vector has similarity 0 with everything; avoid 0/0.
        norms[norms == 0] = 1
        return dots / norms

    workers = [
        threading.Thread(target=first_stage, args=('prompt', 0, 128, 'prompt')),
        threading.Thread(target=first_stage, args=('response_a', 1, 64, 'response A')),
        threading.Thread(target=second_stage),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    if errors:
        raise errors[0]

    # Calculate cosine similarities
    print("Calculating similarities...")
    df['a_semantic_overlap'] = rowwise_cosine(embeddings['prompt'], embeddings['response_a'])
    df['b_semantic_overlap'] = rowwise_cosine(embeddings['prompt'], embeddings['response_b'])

    print("All done!")
    return df

In [None]:
# Load your data
# Training split: later cells read its prompt / response_a / response_b text
# columns and the winner_model_a / winner_model_b / winner_tie label columns.
df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")





In [None]:
# Competition test split; it goes through the same embedding and feature
# steps as the training frame before predictions are written out.
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
test_df.head()


In [None]:
# get_semantic_overlap() adds the similarity columns to the frame it is given
# and returns that same object, so df / test_df themselves gain the columns.
df_with_semantic_overlap = get_semantic_overlap(df)
# Each call loads the sentence-transformer models again.
df_test_with_semantic_overlap = get_semantic_overlap(test_df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import os
import shutil

In [None]:
# Make the VADER lexicon available without network access.
# (Online alternative: nltk.download('vader_lexicon'))

# 1-2. Create a local nltk_data/sentiment directory under the working directory.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
sentiment_dir = os.path.join(nltk_data_dir, 'sentiment')
os.makedirs(sentiment_dir, exist_ok=True)

# 3. Copy the lexicon file from the input directory to NLTK's expected location
vader_source = "/kaggle/input/vaderlexicon/vader_lexicon.txt"
vader_destination = os.path.join(sentiment_dir, "vader_lexicon.txt")

# Only copy if the file doesn't already exist in the destination
if not os.path.exists(vader_destination):
    shutil.copy(vader_source, vader_destination)
    print(f"Copied VADER lexicon from {vader_source} to {vader_destination}")
else:
    print(f"VADER lexicon already exists at {vader_destination}")

# 4. Tell NLTK to look for data in your custom directory
nltk.data.path.append(nltk_data_dir)

# Feature engineering setup (the feature function itself is defined in a later cell)

# Initialize sentiment analyzer
# NOTE(review): the no-argument constructor loads NLTK's default VADER lexicon
# resource; confirm it resolves to the vader_lexicon.txt copied into
# nltk_data/sentiment above.
sia = SentimentIntensityAnalyzer()

# Create a new dataframe for features
# Empty frames that share the row index of the train / test data, so feature
# columns assigned later line up with the source rows.
features_df = pd.DataFrame(index=df.index)

features_df_test = pd.DataFrame(index=test_df.index)

In [None]:
# Preview the training frame (includes the semantic-overlap columns once
# get_semantic_overlap(df) has run, because that call modifies df in place).
df.head()

In [None]:
# features_df["a_semantic_overlap"] = df["a_semantic_overlap"]
# features_df["b_semantic_overlap"] = df["b_semantic_overlap"]

# 1. Model type features
# Encode model names

def add_features(features_df, df):
    """Fill ``features_df`` with hand-crafted text features for both responses.

    Features cover response length, sentiment (module-level ``sia``
    analyzer), average word length, lexical diversity, word overlap between
    prompt and each response, word overlap between the two responses, and
    first-person pronoun counts.

    Args:
        features_df: DataFrame that receives the feature columns. It is
            modified in place; columns are aligned on ``df``'s index.
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns. Missing cells are treated as empty strings.

    Returns:
        ``features_df`` with the feature columns added.
    """
    first_person_re = re.compile(r'\b(I|me|my|myself)\b', re.IGNORECASE)

    def as_text(column):
        # Missing cells become '' so len()/split() never receive a float NaN.
        return df[column].fillna('').astype(str)

    def avg_word_len(text):
        words = text.split()
        return np.mean([len(word) for word in words]) if words else 0

    def lexical_diversity(text):
        # Share of distinct (case-folded) words among all words.
        words = text.split()
        return len(set(text.lower().split())) / len(words) if words else 0

    def word_overlap_ratio(prompt, response):
        # Fraction of the first text's distinct words that occur in the second.
        prompt_words = set(prompt.lower().split())
        response_words = set(response.lower().split())
        if len(prompt_words) == 0:
            return 0
        return len(prompt_words.intersection(response_words)) / len(prompt_words)

    def pairwise_overlap(left, right):
        # Built from zip() instead of df.apply(axis=1) so an empty frame still
        # yields a (empty) Series rather than a DataFrame.
        return pd.Series(
            [word_overlap_ratio(a, b) for a, b in zip(left, right)],
            index=df.index,
            dtype=float,
        )

    def count_first_person(text):
        return len(first_person_re.findall(text))

    prompt = as_text('prompt')
    resp_a = as_text('response_a')
    resp_b = as_text('response_b')

    # 2. Text-based features for responses

    # Response length features
    features_df['resp_a_length'] = resp_a.apply(len)
    features_df['resp_b_length'] = resp_b.apply(len)
    features_df['resp_length_diff'] = features_df['resp_a_length'] - features_df['resp_b_length']
    length_ratio = features_df['resp_a_length'] / features_df['resp_b_length']
    # An empty response_b would give +/-inf; store NaN (missing) instead so
    # downstream models are not fed infinite values.
    features_df['resp_length_ratio'] = length_ratio.replace([np.inf, -np.inf], np.nan)

    # Sentiment features
    features_df['resp_a_sentiment'] = resp_a.apply(lambda x: sia.polarity_scores(x)['compound'])
    features_df['resp_b_sentiment'] = resp_b.apply(lambda x: sia.polarity_scores(x)['compound'])
    features_df['sentiment_diff'] = features_df['resp_a_sentiment'] - features_df['resp_b_sentiment']

    # 3. Response complexity features

    # Average word length
    features_df['resp_a_avg_word_len'] = resp_a.apply(avg_word_len)
    features_df['resp_b_avg_word_len'] = resp_b.apply(avg_word_len)

    # Lexical diversity (unique words ratio)
    features_df['resp_a_lexical_div'] = resp_a.apply(lexical_diversity)
    features_df['resp_b_lexical_div'] = resp_b.apply(lexical_diversity)

    # 4. Lexical overlap between prompt and responses
    features_df['prompt_resp_a_overlap'] = pairwise_overlap(prompt, resp_a)
    features_df['prompt_resp_b_overlap'] = pairwise_overlap(prompt, resp_b)

    # 5. Response similarity
    features_df['resp_similarity'] = pairwise_overlap(resp_a, resp_b)

    # 6. First-person language features ("I", "me", "my")
    features_df['resp_a_first_person'] = resp_a.apply(count_first_person)
    features_df['resp_b_first_person'] = resp_b.apply(count_first_person)

    return features_df



In [None]:
# Hand-crafted text features for the training rows, plus the two embedding
# similarity columns computed earlier (assignment aligns on the row index).
features_df = add_features(features_df, df)
features_df['a_semantic_overlap'] = df_with_semantic_overlap['a_semantic_overlap']
features_df['b_semantic_overlap'] = df_with_semantic_overlap['b_semantic_overlap']




In [None]:
# Preview the assembled training feature matrix.
features_df.head()

In [None]:
# Same feature pipeline for the test rows; the columns are added in the same
# order as for training so both matrices share one layout.
test_features_df = add_features(features_df_test, test_df)
test_features_df['a_semantic_overlap'] =  df_test_with_semantic_overlap['a_semantic_overlap']
test_features_df['b_semantic_overlap'] =  df_test_with_semantic_overlap['b_semantic_overlap']


In [None]:

# Generate features
features = features_df

# Prepare target variable (1 if model_a wins, 0 if model_b wins, 2 if tie)
y = df['winner_model_a'] * 1 + df['winner_model_b'] * 0 + df['winner_tie'] * 2

# Split the data into training and hold-out sets
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Set up XGBoost with log loss optimization.
# eval_metric and early_stopping_rounds are estimator parameters: passing them
# to fit() has been deprecated since XGBoost 1.6 and is rejected by newer
# releases, so they are configured on the constructor instead.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',  # This optimizes for log loss
    num_class=3,
    learning_rate=0.05,  # Smaller learning rate for better generalization
    max_depth=4,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.01,
    reg_lambda=1,
    random_state=42,
    eval_metric='mlogloss',  # Log loss evaluation metric
    early_stopping_rounds=20,
)

# Train the model; early stopping monitors the last eval_set entry (the hold-out).
# NOTE(review): the hold-out therefore also drives model selection, so metrics
# computed on it afterwards are optimistic.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True
)

# Early stopping requires an eval_set at fit time. Later cells re-fit clones of
# this estimator without one (cross_val_score), so switch it off again on the
# fitted object; the trained booster itself is unaffected.
xgb_model.set_params(early_stopping_rounds=None)

# Make predictions on the hold-out set
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)



In [None]:
# Class-probability matrix for the hold-out rows; columns follow the label
# encoding used for y (0 = model_b wins, 1 = model_a wins, 2 = tie).
y_pred_proba

In [None]:
# Calculate log loss (this is the competition metric)
# NOTE: X_test/y_test were also passed as an eval_set during training, so this
# score is not a fully independent estimate.
from sklearn.metrics import log_loss
test_log_loss = log_loss(y_test, y_pred_proba)
print(f"\nTest Log Loss: {test_log_loss:.4f} (Lower is better)")

# Calculate accuracy and print classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Cross-validation with log loss scoring
# cross_val_score re-fits a copy of xgb_model on each of the 5 folds of the
# full feature matrix; 'neg_log_loss' returns negated losses, hence the sign flips.
from sklearn.model_selection import cross_val_score
cv_log_loss = cross_val_score(xgb_model, features, y, cv=5, scoring='neg_log_loss')
print("\nCross-validation evaluation:")
print(f"CV Log Loss scores: {-cv_log_loss}")  # Negate to get the actual log loss
print(f"Mean CV Log Loss: {-cv_log_loss.mean():.4f} (Lower is better)")


In [None]:


# Get feature importance
# Pair each training column with the fitted model's importance score and
# sort from most to least important.
feature_importance = pd.DataFrame({
    'Feature': features.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10))

# This is how you would use the model to make predictions on new data
def predict_preferred_model(new_data):
    """Predict which response a user prefers for each row of ``new_data``.

    Builds the same feature matrix the model was trained on (hand-crafted
    text features followed by the two semantic-overlap columns) and maps the
    predicted class back to a readable label. The class encoding matches the
    training target: 1 = model_a wins, 0 = model_b wins, 2 = tie.

    Args:
        new_data: DataFrame with ``prompt``, ``response_a`` and
            ``response_b`` columns; ``model_a`` / ``model_b`` columns are
            optional. The frame is not modified.

    Returns:
        A list with one dict per row holding ``Prediction`` (label text),
        ``Probability`` (probability of the predicted class), and
        ``model_a`` / ``model_b`` (``None`` when the column is absent).
    """
    new_features = add_features(pd.DataFrame(index=new_data.index), new_data)
    # get_semantic_overlap() writes into the frame it receives, so hand it a copy.
    overlap_df = get_semantic_overlap(new_data.copy())
    new_features['a_semantic_overlap'] = overlap_df['a_semantic_overlap']
    new_features['b_semantic_overlap'] = overlap_df['b_semantic_overlap']

    predictions = xgb_model.predict(new_features)
    probabilities = xgb_model.predict_proba(new_features)

    # Map predictions to model preference (same encoding as the training target)
    prediction_map = {
        1: "User prefers model_a",
        0: "User prefers model_b",
        2: "User has no preference (tie)"
    }

    results = []
    for i, pred in enumerate(predictions):
        pred = int(pred)
        row = new_data.iloc[i]
        results.append({
            'Prediction': prediction_map[pred],
            'Probability': probabilities[i][pred],
            'model_a': row.get('model_a'),
            'model_b': row.get('model_b')
        })

    return results

In [None]:
# From here on X_test is the competition test feature matrix; this rebinds the
# name previously used for the validation split.
X_test = test_features_df
# The submission needs each row's identifier. Take it from the test file's
# 'id' column when present; the positional index is only a fallback, since
# read_csv's default index is just the row number.
test_ids = test_df['id'].to_numpy() if 'id' in test_df.columns else X_test.index



In [None]:
# Function to prepare final submission
def prepare_final_submission(model, test_data, test_ids, decimals=2):
    """
    Prepare the final submission file according to competition format

    Args:
        model: fitted classifier exposing ``predict_proba``. Its probability
            columns are read as 0 = model_b wins, 1 = model_a wins, 2 = tie
            (the encoding of the training target) — assumes the model's class
            order is 0, 1, 2.
        test_data: feature matrix passed to ``model.predict_proba``.
        test_ids: identifiers for the rows of ``test_data``, written to ``id``.
        decimals: number of decimal places the probabilities are rounded to
            before the final re-normalisation (default 2).

    Returns:
        DataFrame with columns ``id``, ``winner_model_a``, ``winner_model_b``
        and ``winner_tie``; each row's probabilities are strictly positive
        and sum to 1.
    """
    prob_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']

    def normalized(frame):
        # Scale every row so its probabilities sum to 1.
        return frame.div(frame.sum(axis=1), axis=0)

    # Get probability predictions
    probas = model.predict_proba(test_data)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': test_ids,
        'winner_model_a': probas[:, 1],  # Class 1 (model_a wins)
        'winner_model_b': probas[:, 0],  # Class 0 (model_b wins)
        'winner_tie': probas[:, 2]       # Class 2 (tie)
    })

    # Ensure probabilities sum to 1, then round
    rounded = normalized(submission[prob_cols]).round(decimals)

    # Rounding can turn a small probability into exactly 0, which is penalised
    # extremely hard under log loss if that class is the true one. Floor every
    # value at the smallest non-zero step of the chosen precision.
    rounded = rounded.clip(lower=10.0 ** -decimals)

    # Re-normalize after rounding to ensure they sum to 1 again
    submission[prob_cols] = normalized(rounded)

    return submission

# Build the submission from the competition test features (X_test was rebound
# to test_features_df in the previous cell) and write it to submission.csv.
final_submission = prepare_final_submission(xgb_model, X_test, test_ids)
final_submission.to_csv('submission.csv', index=False)