# Baseline Model: TF-IDF + Gradient Boosting

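This is the first baseline experiment. It builds simple TF-IDF features from the anchor and target phrases (their cosine similarity plus element-wise absolute differences), feeds them to a gradient boosting regressor, and evaluates with the Pearson correlation under 5-fold cross-validation.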

In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
warnings.filterwarnings('ignore')

# Read the training and test splits from disk
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Count how many training rows carry each score value, ordered by score
score_counts = train_df['score'].value_counts().sort_index()

# Report dataset sizes and the label distribution
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Score distribution:\n{score_counts}")

Train shape: (36473, 5)
Test shape: (36, 4)
Score distribution:
score
0.00     7471
0.25    11519
0.50    12300
0.75     4029
1.00     1154
Name: count, dtype: int64


In [2]:
# Build TF-IDF representations of the anchor and target phrases.
# The vocabulary is capped at 5000 uni-/bi-grams to keep the baseline fast.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
)

# Learn a single shared vocabulary from every training anchor and target phrase
all_text = pd.concat(
    [train_df[col] for col in ('anchor', 'target')],
    ignore_index=True,
)
vectorizer.fit(all_text)

# Encode the two phrase columns separately with the shared vocabulary (train split)
anchor_tfidf, target_tfidf = (
    vectorizer.transform(train_df[col]) for col in ('anchor', 'target')
)

# Same encoding for the test split
test_anchor_tfidf, test_target_tfidf = (
    vectorizer.transform(test_df[col]) for col in ('anchor', 'target')
)

print(f"TF-IDF feature shape: {anchor_tfidf.shape}")

TF-IDF feature shape: (36473, 5000)


In [3]:
# Create similarity features from TF-IDF vectors
# Not used below any more; left in place so the name stays available to other cells.
from sklearn.metrics.pairwise import cosine_similarity


def _rowwise_cosine(left, right):
    """Return the cosine similarity between matching rows of two sparse matrices.

    Row i of the result compares row i of ``left`` with row i of ``right``.
    This yields the same values as ``cosine_similarity(left, right).diagonal()``
    without materialising the full n x n pairwise matrix, which for the
    36,473 training rows would need roughly 10 GB as dense float64.

    Parameters
    ----------
    left, right : scipy sparse matrices with identical shape.

    Returns
    -------
    numpy.ndarray of shape (n_rows,). Rows where either vector is all zeros
    get a similarity of 0, matching scikit-learn's handling of zero vectors.
    """
    # Element-wise product summed per row is the row-wise dot product.
    dot = np.asarray(left.multiply(right).sum(axis=1)).ravel()
    left_norm = np.sqrt(np.asarray(left.multiply(left).sum(axis=1)).ravel())
    right_norm = np.sqrt(np.asarray(right.multiply(right).sum(axis=1)).ravel())
    denom = left_norm * right_norm
    # Avoid 0/0: a zero-norm row has a zero dot product, so dividing by 1 yields 0.
    denom[denom == 0] = 1.0
    return dot / denom


# Cosine similarity between each anchor vector and its own target vector
train_cosine_sim = _rowwise_cosine(anchor_tfidf, target_tfidf)
test_cosine_sim = _rowwise_cosine(test_anchor_tfidf, test_target_tfidf)

# Also create absolute difference features
anchor_dense = anchor_tfidf.toarray()
target_dense = target_tfidf.toarray()
test_anchor_dense = test_anchor_tfidf.toarray()
test_target_dense = test_target_tfidf.toarray()

# Absolute difference
train_abs_diff = np.abs(anchor_dense - target_dense)
test_abs_diff = np.abs(test_anchor_dense - test_target_dense)

print(f"Train abs diff shape: {train_abs_diff.shape}")
print(f"Test abs diff shape: {test_abs_diff.shape}")

Train abs diff shape: (36473, 5000)
Test abs diff shape: (36, 5000)


In [4]:
# Assemble the model inputs: the cosine similarity goes in column 0,
# followed by the per-term absolute TF-IDF differences.
X_train = np.hstack([train_cosine_sim.reshape(-1, 1), train_abs_diff])
X_test = np.hstack([test_cosine_sim.reshape(-1, 1), test_abs_diff])

# Regression target
y_train = train_df['score'].to_numpy()

print(f"Final training feature shape: {X_train.shape}")
print(f"Final test feature shape: {X_test.shape}")

Final training feature shape: (36473, 5001)
Final test feature shape: (36, 5001)


In [5]:
# 5-fold cross validation
# Hyperparameters shared by every fold's regressor
gbr_params = dict(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    n_iter_no_change=20,
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    # Fit a fresh regressor on this fold's training rows only
    model = GradientBoostingRegressor(**gbr_params)
    model.fit(X_train[train_idx], y_train[train_idx])

    # Score the held-out rows
    y_val = y_train[val_idx]
    val_pred = model.predict(X_train[val_idx])

    # Pearson correlation is the off-diagonal entry of the 2x2 correlation matrix
    corr = np.corrcoef(y_val, val_pred)[0, 1]
    scores.append(corr)
    print(f"Fold {fold+1} Pearson correlation: {corr:.4f}")

print(f"\nMean Pearson correlation: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

Fold 1 Pearson correlation: 0.4618


Fold 2 Pearson correlation: 0.4582


Fold 3 Pearson correlation: 0.4703


Fold 4 Pearson correlation: 0.4589


Fold 5 Pearson correlation: 0.4601

Mean Pearson correlation: 0.4619 ± 0.0044


In [6]:
# Fit the final regressor on all training rows and score the test set
final_params = {
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
    'n_iter_no_change': 20,
}
final_model = GradientBoostingRegressor(**final_params)
final_model.fit(X_train, y_train)

# Raw regression outputs can fall outside the label range, so clamp them to [0, 1]
raw_test_predictions = final_model.predict(X_test)
test_predictions = np.clip(raw_test_predictions, 0, 1)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Test predictions distribution:")
# Bucket each prediction by the score level it is nearest to (half-open bins of width 0.25)
for val in [0, 0.25, 0.5, 0.75, 1.0]:
    in_bucket = (test_predictions >= val - 0.125) & (test_predictions < val + 0.125)
    count = in_bucket.sum()
    print(f"  {val}: {count}")

Test predictions shape: (36,)
Test predictions range: [0.1653, 0.7200]
Test predictions distribution:
  0: 0
  0.25: 21
  0.5: 13
  0.75: 2
  1.0: 0


In [None]:
# Create submission file: one row per test id with its predicted score
submission = pd.DataFrame({
    'id': test_df['id'],
    'score': test_predictions
})

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing. Otherwise a fresh environment
# fails here, after all the training has already run.
submission_path = Path('/home/submission/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission.to_csv(submission_path, index=False)
print("Submission file created at /home/submission/submission.csv")
print(submission.head())