# Baseline Model for Google QUEST Challenge

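This notebook implements a simple baseline: TF-IDF features built from the concatenated question title, question body, and answer text, fed to one Ridge regression model per target column and evaluated with 5-fold cross-validation.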

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Read the three input CSVs: training rows, test rows, and the sample
# submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/data/train.csv',
        '/home/data/test.csv',
        '/home/data/sample_submission.csv',
    )
)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Targets are the columns that exist in train but not in test
# (the id column is excluded explicitly).
test_columns = set(test.columns)
target_cols = [
    name
    for name in train.columns
    if name != 'qa_id' and name not in test_columns
]
print(f"Number of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols[:5]}...")

In [None]:
def _concat_qa_text(frame):
    """Return title, body and answer joined by single spaces, one string per row.

    Missing values in any of the three columns are treated as empty strings.
    """
    title = frame['question_title'].fillna('')
    body = frame['question_body'].fillna('')
    answer = frame['answer'].fillna('')
    return title + ' ' + body + ' ' + answer


# One combined text field per row, built the same way for train and test.
train['text'] = _concat_qa_text(train)
test['text'] = _concat_qa_text(test)

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training
# text only and then reused to transform the test text.
tfidf_params = {
    'max_features': 5000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
}
vectorizer = TfidfVectorizer(**tfidf_params)

X_train_tfidf = vectorizer.fit_transform(train['text'])
X_test_tfidf = vectorizer.transform(test['text'])

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")

# Target matrix: one row per training example, one column per target.
y_train = train[target_cols].values
print(f"Target shape: {y_train.shape}")

In [None]:
# Cross-validation setup
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# The splitter is seeded with an integer, so every call to split() yields the
# same folds. Materialise them once instead of re-splitting for each target.
# NOTE(review): plain KFold ignores any grouping between rows; if several rows
# share the same question, consider a group-aware splitter — confirm with data.
fold_indices = list(kf.split(X_train_tfidf))

# Prediction buffers, one column per target.
# The out-of-fold buffer is allocated as float explicitly: zeros_like() would
# inherit y_train's dtype and truncate predictions if targets were integers.
oof_predictions = np.zeros(y_train.shape, dtype=float)
test_predictions = np.zeros((len(test), len(target_cols)))

# Mean validation RMSE per target, in target_cols order.
scores = []

# Fit an independent Ridge model per target and per fold.
for target_idx, target in enumerate(target_cols):
    print(f"Training for target: {target} ({target_idx+1}/{len(target_cols)})")

    target_scores = []

    for fold, (train_idx, val_idx) in enumerate(fold_indices):
        X_tr, X_val = X_train_tfidf[train_idx], X_train_tfidf[val_idx]
        y_tr, y_val = y_train[train_idx, target_idx], y_train[val_idx, target_idx]

        # Train Ridge regression
        model = Ridge(alpha=1.0, random_state=42)
        model.fit(X_tr, y_tr)

        # Out-of-fold predictions: each training row is predicted by the one
        # model that did not see it during fitting.
        val_pred = model.predict(X_val)
        oof_predictions[val_idx, target_idx] = val_pred

        # Calculate RMSE for this fold
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        target_scores.append(rmse)

        # Test prediction is the average over the fold models.
        test_pred = model.predict(X_test_tfidf)
        test_predictions[:, target_idx] += test_pred / n_folds

    mean_score = np.mean(target_scores)
    std_score = np.std(target_scores)
    scores.append(mean_score)
    print(f"  Mean RMSE: {mean_score:.4f} ± {std_score:.4f}")

overall_score = np.mean(scores)
print(f"\nOverall Mean RMSE: {overall_score:.4f}")

In [None]:
# Report Spearman rank correlation between the true targets and the
# out-of-fold predictions (this, not RMSE, is the actual metric).
from scipy.stats import spearmanr

# One correlation coefficient per target column; the p-value is discarded.
spearman_scores = [
    spearmanr(y_train[:, col_idx], oof_predictions[:, col_idx])[0]
    for col_idx in range(len(target_cols))
]
for target, corr in zip(target_cols, spearman_scores):
    print(f"{target}: {corr:.4f}")

mean_spearman = np.mean(spearman_scores)
print(f"\nMean Spearman correlation: {mean_spearman:.4f}")

In [None]:
# Build the submission frame: qa_id first, then one column per target.
# Predictions are limited to the [0, 1] range before the frame is assembled.
clipped_predictions = np.clip(test_predictions, 0, 1)

submission_columns = {'qa_id': test['qa_id']}
submission_columns.update(
    (target_name, clipped_predictions[:, col_idx])
    for col_idx, target_name in enumerate(target_cols)
)
submission = pd.DataFrame(submission_columns)

print(f"Submission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")
print("\nFirst few predictions:")
print(submission.head())

# Write the CSV without the index column.
submission.to_csv('/home/submission/submission.csv', index=False)
print("\nSubmission saved to /home/submission/submission.csv")