# Step 1 — Baseline model

In this notebook, we build a simple baseline using lexical/length features and Logistic Regression.

In [1]:
# Imports
import pandas as pd, numpy as np, re
from pathlib import Path

In [2]:
# Loading the data
# Input locations, relative to the notebook's working directory.
DATA = Path("../data")
TRAIN_PATH = DATA.joinpath('train.csv')
TEST_PATH = DATA.joinpath('test.csv')

# Output / artifact folders are created if missing (no error when they already exist).
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
ART_DIR = Path("../artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

# Read the raw train and test tables.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [3]:
# Build 3-class target: 0=A wins, 1=B wins, 2=Tie
# np.select uses `default` for rows where none of the conditions hold. The
# implicit default of 0 would silently label such rows as "A wins", so a
# sentinel (-1) is used instead and the cell fails loudly if it ever appears.
y = np.select(
    [train_df['winner_model_a']==1, train_df['winner_model_b']==1, train_df['winner_tie']==1],
    [0, 1, 2],
    default=-1,
)
n_unlabelled = int((y == -1).sum())
if n_unlabelled:
    raise ValueError(
        f"{n_unlabelled} training rows have none of winner_model_a / winner_model_b / winner_tie set to 1"
    )
train_df['target'] = y

In [4]:
# Helper functions for feature engineering

def count_pattern(text, pattern):
    """Count the non-overlapping matches of regex `pattern` in `text`.

    Anything that is not a `str` (e.g. NaN or None) yields 0 instead of raising.
    """
    return sum(1 for _ in re.finditer(pattern, text)) if isinstance(text, str) else 0

def paragraph_count(text):
    r"""Count paragraph breaks in `text`.

    A paragraph break is the literal four-character sequence `\n\n`
    (backslash, n, backslash, n), i.e. two *escaped* newlines, not two real
    newline characters.
    NOTE(review): presumably the responses are stored with escaped newlines
    (e.g. JSON-encoded strings) -- confirm against the data.

    Non-string input (e.g. NaN or None for a missing response) returns 0
    instead of raising.
    """
    if not isinstance(text, str):
        return 0
    return text.count('\\n\\n')

def list_count(text):
    """Count list markers in `text`: bullets ("-", "*", "•") or numbered items.

    The pattern matches either a bullet character followed by whitespace at
    the start of the text (optionally after leading whitespace), or any run
    of digits followed by a dot.

    NOTE(review): the pattern carries no multiline flag, so the bullet branch
    can only match at the very start of the whole string, and the digit
    branch also matches decimals such as "3.14" -- confirm this is intended.
    """
    list_marker = r"(^\s*[\-\*•]\s|\d+\.)"
    return count_pattern(text, list_marker)

def quote_count(text):
    """Count quote / markdown-emphasis markers in `text`.

    Every ">" character and every "**" pair of asterisks counts as one
    marker, wherever it appears in the text.
    """
    quote_or_bold = r">|\*\*"
    return count_pattern(text, quote_or_bold)

## This choice of features comes from our exploratory data analysis, in which we tested the correlation between preferred answers and various lexical features.

In [5]:
# Compute structural features for A and B responses
# Feature-name prefix -> function that turns one response into a count.
structural_counters = {
    'para_count': paragraph_count,
    'list_count': list_count,
    'quote_count': quote_count,
}

for df in (train_df, test_df):
    # Per-response counts, stored as <feature>_a and <feature>_b
    for side in ('a', 'b'):
        for feat, counter in structural_counters.items():
            df[f'{feat}_{side}'] = df[f'response_{side}'].apply(counter)

    # Relative differences (A - B), stored as <feature>_diff
    for feat in structural_counters:
        df[f'{feat}_diff'] = df[f'{feat}_a'] - df[f'{feat}_b']

In [6]:
# Length features (characters)
# The response is cast to str before measuring, so non-string cells are
# measured by the length of their string form.
train_df['len_a'] = train_df['response_a'].astype(str).apply(len)
train_df['len_b'] = train_df['response_b'].astype(str).apply(len)
train_df['len_diff'] = train_df['len_a'] - train_df['len_b']

# The para/list/quote *_diff columns are not recomputed here: the structural
# feature loop in the previous cell already stores them (A - B) on both
# train_df and test_df.

# Final feature matrix (missing values, if any, are replaced by 0)
feature_cols = ['len_diff', 'para_count_diff', 'list_count_diff', 'quote_count_diff']
X = train_df[feature_cols].fillna(0)
X.head()

Unnamed: 0,len_diff,para_count_diff,list_count_diff,quote_count_diff
0,3332,17,8,24
1,-535,-7,-3,0
2,-914,1,0,0
3,1620,0,5,0
4,528,-2,0,0


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import numpy as np

# Modeling pipeline: standardize the features, then fit a logistic regression.
pipe = Pipeline(steps=[
    ('normalize', StandardScaler()),
    ('model', LogisticRegression(max_iter=2000, C=1.0, class_weight=None)),
])

# Stratified 5-fold cross-validation: shuffled folds with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One multi-class log-loss value per validation fold.
fold_scores = []
for fold_idx, (train_idx, val_idx) in enumerate(cv_splitter.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # Refit the whole pipeline on this fold's training part, then score the held-out part.
    preds = pipe.fit(X_train, y_train).predict_proba(X_val)
    loss = log_loss(y_val, preds, labels=[0, 1, 2])
    fold_scores.append(loss)
    print(f"Fold {fold_idx}: log_loss = {loss:.5f}")

# Summarize across folds.
fold_scores = np.array(fold_scores)
print(f"\nCross-val log_loss → mean={fold_scores.mean():.5f}, std={fold_scores.std():.5f}")

Fold 1: log_loss = 1.07255
Fold 2: log_loss = 1.07106
Fold 3: log_loss = 1.07194
Fold 4: log_loss = 1.07216
Fold 5: log_loss = 1.06893

Cross-val log_loss → mean=1.07133, std=0.00130


In [8]:
# Build test features (same recipe as for the training frame)
test_df['len_a'] = test_df['response_a'].astype(str).apply(len)
test_df['len_b'] = test_df['response_b'].astype(str).apply(len)
test_df['len_diff'] = test_df['len_a'] - test_df['len_b']

# The para/list/quote *_diff columns are not recomputed here: the structural
# feature loop earlier in the notebook already stores them (A - B) on test_df.

# Same columns, same order and same missing-value handling as the training matrix X.
X_test = test_df[feature_cols].fillna(0)

# Refit on the full training data, then predict class probabilities for the test set.
# Columns follow the Kaggle order:
# winner_model_a (class 0), winner_model_b (class 1), winner_tie (class 2)
proba_test = pipe.fit(X, y).predict_proba(X_test)  # (n_test, 3)

# Build submission: the test ids plus one probability column per outcome,
# taken from columns 0, 1 and 2 of proba_test respectively.
sub = pd.DataFrame({
    'id': test_df['id'].values,
    'winner_model_a': proba_test[:, 0],
    'winner_model_b': proba_test[:, 1],
    'winner_tie':     proba_test[:, 2],
})

# Save (plain string literal: the file name has no placeholders, so no f-string is needed)
sub.to_csv(OUT_DIR / "submission_lr.csv", index=False)
print("Saved submission_lr.csv")


Saved submission_lr.csv
