# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPClassifier # The "Scikit-Learn Neural Net"
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
np.random.seed(42)
sample_sub = pd.read_csv(f"data/sample_submission.csv")

# Loading preprocessed Embeddings

### Training Embeddings
- **Metric embedding:** Representation of model-generated metrics for a response  
- **Response embedding:** Embedding of the generated response  
- **User prompt embedding:** Representation of the input query  
- **System prompt embedding:** Embedding of the system instructions

In [3]:
# ============================
# Load embeddings and targets
# ============================
EMB  = "./data"
y = np.load('data/train_scores.npy')

# Load ALL embeddings (System included)
train_metric = np.load(f"{EMB}/train_metric_embedding.npy").astype(np.float32)
train_resp   = np.load(f"{EMB}/train_response_embedding.npy").astype(np.float32)
train_user   = np.load(f"{EMB}/train_user_prompt_embedding.npy").astype(np.float32)
train_sys = np.load(f"{EMB}/train_system_prompt_embedding.npy").astype(np.float32)

test_metric = np.load(f"{EMB}/test_metric_embedding.npy").astype(np.float32)
test_resp   = np.load(f"{EMB}/test_response_embedding.npy").astype(np.float32)
test_user   = np.load(f"{EMB}/test_user_prompt_embedding.npy").astype(np.float32)
test_sys = np.load(f"{EMB}/test_system_prompt_embedding.npy").astype(np.float32)

### Target Labels
`train_scores.npy` contains the ground-truth numerical score for each training data point.

After loading, we horizontally concatenate system + user + response embeddings to create a unified text embedding for each row.

In [4]:
# --- 3. PREPARE FEATURES ---
# Concat System + User + Response
X_text_train = np.hstack([train_sys, train_user, train_resp])
X_text_test  = np.hstack([test_sys,  test_user,  test_resp])
X_metric_train = train_metric
X_metric_test  = test_metric

print(f"Text Vector Shape: {X_text_train.shape}")

Text Vector Shape: (5000, 2304)


# Feature Engineering

Feature engineering is the core of the model’s predictive power.  
We build three new meaningful features:

---

## Isolation Forest Outlier Score

**Goal:** Identify how unusual a response is within the text embedding space.

Steps:
1. Train an `IsolationForest` model on the combined text embeddings.
2. Obtain anomaly scores using `.decision_function()`.
3. Scale the values using `MinMaxScaler` so S1 ranges between 0 and 1.

A lower score means "more unusual," while a higher score indicates a more typical or well-structured response.

---

## Match Probability using MLP

**Motivation:**  
Rewards where metric embedding and response embedding logically agree.

### Positive Samples:
Pairs of (metric_embedding, response_embedding) from the same instance.

### Negative Samples:
We shuffle the response embeddings to create mismatched pairs:
- Realistic "good" pairs get label = 1  
- Synthetic mismatches get label = 0

### Model:

We train a two-layer `MLPClassifier`:

```
Input → Dense(512) → Dense(256) → Probability
```

This creates a latent projection that learns whether the metric embedding "belongs" to the text embedding.

The output is the predicted probability that a pair is consistent.

---

##  Cosine Similarity

This captures alignment between the user prompt and generated response.

We compute:

```
cosine_similarity(user_embedding, response_embedding)
```

This value is again scaled to 0–1 using MinMaxScaler.

Higher score means the response closely matches the intent of the user query.

---

## Summary
The three features represent:

| Feature | What it measures |
|---|---|
| S1 | Outlier likelihood of the generated response |
| S2 | Whether the metric and response embeddings logically match |
| S3 | Semantic relevance between user query and model response |

In [None]:
# --- 4. Define New Features ---

# Anomaly Detection (Isolation Forest)
# Detects outliers in the text embedding space.
print("Fitting S1 (Isolation Forest)...")
iso = IsolationForest(n_estimators=100, contamination='auto', n_jobs=-1, random_state=42)
iso.fit(X_text_train)

# MLPClassifier
# This allows the model to mathematically project the Metric and Text to see if they match.
print("Fitting S2 (MLP Matcher)...")

# 1. Positives (Real Pairs)
X_pos = np.hstack([X_metric_train, X_text_train])
y_pos = np.ones(len(X_pos))

# 2. Negatives (Shuffled Pairs)
# We use 2x negatives to make the model really strict about what a "Match" is.
rng = np.random.RandomState(42)
idx_shuf1 = rng.permutation(len(X_text_train))
idx_shuf2 = rng.permutation(len(X_text_train))

X_neg1 = np.hstack([X_metric_train, X_text_train[idx_shuf1]])
X_neg2 = np.hstack([X_metric_train, X_text_train[idx_shuf2]])
y_neg = np.zeros(len(X_neg1) + len(X_neg2))

X_matcher = np.vstack([X_pos, X_neg1, X_neg2])
y_matcher = np.concatenate([y_pos, y_neg])

# MLP: Hidden layers project the high-dim embeddings. 
# This is fast on CPU for this dataset size (~15k rows total).
matcher = MLPClassifier(
    hidden_layer_sizes=(512, 256), # Layers to compress info
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size=256,
    learning_rate_init=0.001,
    max_iter=50, # 50 epochs is usually enough
    random_state=42
)
matcher.fit(X_matcher, y_matcher)

# Cosine Similarity 
def get_s3(u, r):
    num = np.sum(u * r, axis=1)
    den = np.linalg.norm(u, axis=1) * np.linalg.norm(r, axis=1)
    return num / (den + 1e-9)

Fitting S1 (Isolation Forest)...
Fitting S2 (MLP Matcher)...




# Synthetic Data Augmentation

We generate synthetic negative samples:

- We use the real metric embeddings untouched.
- We randomly shuffle:
  - user embeddings
  - response embeddings

This creates inputs that should correspond to **low-quality output**.

## Synthetic Scores

We assign synthetic labels uniformly in the range:

```
0.0 – 3.5
```

This makes the regression model strongly aware of what bad responses look like statistically.

In [None]:
# --- 5. AUGMENTATION ---

# Augment Training Data for Regressor
# We mix real data with synthetic "bad" data so LightGBM learns to predict low scores.
n_aug = len(y)
idx_aug = rng.permutation(len(y))

# Synthetic Data
aug_metric = X_metric_train
aug_text   = X_text_train[idx_aug]
aug_user   = train_user[idx_aug]
aug_resp   = train_resp[idx_aug]
# Synthetic Scores: Uniform 0.0 - 3.5 (Strictly low)
aug_y      = rng.uniform(0.0, 3.5, size=n_aug)

# Combine
X_text_total = np.vstack([X_text_train, aug_text])
X_metric_total = np.vstack([X_metric_train, aug_metric])
train_user_total = np.vstack([train_user, aug_user])
train_resp_total = np.vstack([train_resp, aug_resp])
y_total = np.concatenate([y, aug_y])

In [None]:
# --- COMPUTE FEATURES ---

s1_raw_tr = iso.decision_function(X_text_total)
s1_raw_te = iso.decision_function(X_text_test)
sc1 = MinMaxScaler()
S1_train = sc1.fit_transform(s1_raw_tr.reshape(-1,1)).flatten()
S1_test  = sc1.transform(s1_raw_te.reshape(-1,1)).flatten()

X_concat_tr = np.hstack([X_metric_total, X_text_total])
X_concat_te = np.hstack([X_metric_test, X_text_test])
S2_train = matcher.predict_proba(X_concat_tr)[:, 1]
S2_test  = matcher.predict_proba(X_concat_te)[:, 1]

s3_raw_tr = get_s3(train_user_total, train_resp_total)
s3_raw_te = get_s3(test_user, test_resp)
sc3 = MinMaxScaler()
S3_train = sc3.fit_transform(s3_raw_tr.reshape(-1,1)).flatten()
S3_test  = sc3.transform(s3_raw_te.reshape(-1,1)).flatten()

# Final Training Feature Construction

After computing the S1, S2, and S3 signals, we assemble our final model input.

To give LightGBM more separation power, we engineer additional interaction features:

| Feature | Formula | Meaning |
|---|---|---|
| S1xS2 | S1 * S2 | Alignment of anomaly detection + metric matching |
| S2xS3 | S2 * S3 | Logical agreement + semantic alignment |
| S1xS3 | S1 * S3 | Outlier vs prompt consistency |
| All | S1 * S2 * S3 | High when all three indicators agree |

The resulting dataframe is small but dense and expressive, ideal for boosting models.

# LightGBM Regression Training

We use LightGBM to predict the final score.

## Training Setup

```
objective: regression
metric: rmse
num_leaves: 63
learning_rate: 0.05
min_data_in_leaf: 10
```

These parameters are tuned to:

- Allow non-linear splits
- Recover multi-peaked score distributions
- Capture the structure of the engineered features

## Sample Weights

To avoid LightGBM favoring common score ranges, we compute:

```
class_weight = 1 / (frequency of rounded score bucket)
```

This balances the regression training.

In [None]:
# --- 6. FINAL REGRESSION ---
def get_feats(s1, s2, s3):
    df = pd.DataFrame({"S1": s1, "S2": s2, "S3": s3})
    # Strong interactions to isolate the "Perfect" cluster
    df["S1xS2"] = df.S1 * df.S2
    df["S2xS3"] = df.S2 * df.S3
    df["S1xS3"] = df.S1 * df.S3
    df["All"]   = df.S1 * df.S2 * df.S3
    return df

df_train = get_feats(S1_train, S2_train, S3_train)
df_test  = get_feats(S1_test, S2_test, S3_test)

# Sample Weights
counts = pd.Series(np.round(y_total)).value_counts()
weights = pd.Series(np.round(y_total)).map(lambda x: 1.0 / counts.get(x, 1.0)).values

print("Training LightGBM...")
dtrain = lgb.Dataset(df_train, label=y_total, weight=weights)

# Parameters optimized to separate clusters rather than smooth the mean
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05, 
    "num_leaves": 63,        # Higher leaves = more complex splits (good for bimodal)
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "min_data_in_leaf": 10,  # Allow smaller clusters (peaks)
    "seed": 42,
    "verbosity": -1
}

model = lgb.train(params, dtrain, num_boost_round=1200)
pred_test = model.predict(df_test)

Training LightGBM...


In [None]:
# 8. EXPORT
id_col = sample_sub.columns[0]
final_df = pd.DataFrame({id_col: sample_sub[id_col], 'score': pred_test})

# Optional: Clip to valid range just to be safe (0-10), but no shifting
final_df['score'] = np.clip(final_df['score'], 0, 10)

final_df.to_csv("stacking_submission_new.csv", index=False)

print("Submission Saved.")
print(final_df['score'].describe())


Submission Saved.
count    3638.000000
mean        5.596733
std         3.398753
min         0.101029
25%         1.870185
50%         6.764587
75%         8.875907
max        10.000000
Name: score, dtype: float64


In [None]:
sub = pd.read_csv('stacking_submission_new.csv')
low_mean = sub[sub['score'] <= 6]['score'].mean()
high_mean = sub[sub['score'] > 6]['score'].mean()

print("Mean (0–6):", low_mean)
print("Mean (6–10):", high_mean)

Mean (0–6): 2.23646495159269
Mean (6–10): 8.690876978928015


In [35]:
import pandas as pd
import numpy as np

sub = pd.read_csv("stacking_submission_new.csv")

low_mean  = sub[sub['score'] <= 6]['score'].mean()
high_mean = sub[sub['score'] > 6]['score'].mean()

delta_low  = 3 - low_mean
delta_high = 9 - high_mean

# Apply shifts directly to score
sub.loc[sub['score'] <= 6, 'score'] += delta_low
sub.loc[sub['score'] >  6, 'score'] += delta_high

sub['score'] = sub['score'].clip(upper=10)

# Save output
sub.to_csv("stacking_submission_shifted_10ub.csv", index=False)
