<a href="https://colab.research.google.com/github/Sagargupta16/LeetCode_Rating_Predictor/blob/main/LC_Contest_Rating_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import numpy as np
import tensorflow as tf
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Reproducibility: seed NumPy and TensorFlow with the same fixed value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Paths (relative to project root). If data/data.json exists one directory
# up, the parent directory is used as the root; otherwise the current one.
ROOT = Path("..") if Path("../data/data.json").exists() else Path(".")
DATA_PATH = ROOT / "data" / "data.json"
MODEL_PATH = ROOT / "model.keras"
SCALER_PATH = ROOT / "scaler.save"

# Number of features must match production (main.py)
NUM_FEATURES = 7

print(f"TensorFlow {tf.__version__}")
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    print(f"GPU detected: {gpus[0].name}")
    # Allow memory growth to avoid OOM on small GPUs
    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Memory growth can only be changed before the GPU has been
            # initialized; report it and keep going instead of aborting.
            print(f"  Could not enable memory growth on {gpu.name}: {err}")
else:
    print("No GPU detected — training on CPU")
    print("  For GPU on Windows: use WSL2 + pip install 'tensorflow[and-cuda]'")
    print("  Model is small (3K params) so CPU training takes ~80 seconds")
print(f"Data path: {DATA_PATH.resolve()}")

## Data Collection

**If you already have `data/data.json`, skip this section and continue with "Load and Prepare Data".**

To refresh training data, run from the project root:
```bash
python scripts/update_data.py
```
The REST contest API is now blocked (403), so data collection uses GraphQL via `scripts/update_data.py`.

*(Data collection cells removed -- use `python scripts/update_data.py` instead)*

## Load and Prepare Data

The same 7 features are used in both training and production (`main.py`):

| # | Feature | Source | Correlation |
|---|---------|--------|-------------|
| 1 | current_rating | input1 | -0.148 |
| 2 | rank | input2 | -0.474 |
| 3 | total_participants | input3 | -0.308 |
| 4 | rank_percentage | input4 * 100 | -0.495 |
| 5 | attended_contests | input5 | -0.115 |
| 6 | log_rank | log(1 + rank) | **-0.508** |
| 7 | rating_x_percentile | rating * percentile | **-0.555** |

Features 6-7 are engineered from existing data and provide the strongest signal.

In [None]:
# Load training data: the file holds one JSON object per line.
with open(DATA_PATH, encoding="utf-8") as f:
    # Blank lines are skipped so stray empty lines do not crash json.loads.
    records = [json.loads(line) for line in f if line.strip()]
if not records:
    raise ValueError(f"No training records found in {DATA_PATH}")
print(f"Loaded {len(records)} records")

# Extract raw fields
input1 = np.array([r["input1"] for r in records])  # current_rating
input2 = np.array([r["input2"] for r in records])  # rank
input3 = np.array([r["input3"] for r in records])  # total_participants
input4 = np.array([r["input4"] for r in records])  # percentile (0-1)
input5 = np.array([r["input5"] for r in records])  # attended_contests
y = np.array([r["output"] for r in records])        # rating_change

# Build 7-feature matrix matching production main.py
rank_percentage = input4 * 100  # production computes rank*100/participants
log_rank = np.log1p(input2)
rating_x_pct = input1 * input4

X = np.column_stack([
    input1,          # 0: current_rating
    input2,          # 1: rank
    input3,          # 2: total_participants
    rank_percentage, # 3: rank_percentage
    input5,          # 4: attended_contests
    log_rank,        # 5: log(1 + rank)
    rating_x_pct,    # 6: rating * percentile
])

assert X.shape[1] == NUM_FEATURES, f"Expected {NUM_FEATURES} features, got {X.shape[1]}"
print(f"Feature matrix: {X.shape}")
print(f"Output: mean={y.mean():.2f}, std={y.std():.2f}")

# Split BEFORE scaling so the scaler's min/max come from training rows only;
# fitting on the full matrix would leak test-set statistics into the
# preprocessing and make the held-out metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)

# Scale features: fit on the training split, then apply the same transform
# to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any code that uses it.
X_scaled = scaler.transform(X)
joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved to {SCALER_PATH} ({NUM_FEATURES} features)")

print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

In [None]:
# Dense network (LSTM with 1 timestep adds overhead for zero benefit)
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which newer Keras versions warn about.
model = Sequential([
    tf.keras.Input(shape=(NUM_FEATURES,)),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dense(1),  # single linear output: the predicted rating change
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",
    metrics=["mae"],
)
model.summary()

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen so far.
early_stop = EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True, verbose=1
)

# 10% of the training split is held out for validation during fitting.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=200,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1,
)

# Evaluate on the held-out test split (loss is MSE, metric is MAE)
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest MSE:  {test_loss:.2f}")
print(f"Test RMSE: {np.sqrt(test_loss):.2f}")
print(f"Test MAE:  {test_mae:.2f}")

model.save(MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

In [None]:
# Sanity check: predict for a sample user
rating, rank, participants, attended = 1957, 1869, 21165, 87
pct = (rank * 100) / participants
# Feature order must match the training matrix:
# rating, rank, participants, rank_percentage, attended, log(1 + rank),
# rating * percentile
sample = np.array([[
    rating,
    rank,
    participants,
    pct,
    attended,
    np.log1p(rank),
    rating * (rank / participants),
]])
pred = model.predict(scaler.transform(sample), verbose=0)

print(f"Rating: {rating}, Rank: {rank}/{participants}, Attended: {attended}")
print(f"Predicted change: {pred[0][0]:+.2f}")
print(f"Predicted new rating: {rating + pred[0][0]:.2f}")

# Training curve
try:
    # The backend is deliberately not forced to "Agg": that backend is
    # non-interactive, so plt.show() would display nothing.
    import matplotlib.pyplot as plt
except ImportError:
    print("matplotlib not installed, skipping plot")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: MSE loss per epoch
    ax1.plot(history.history["loss"], label="Train")
    ax1.plot(history.history["val_loss"], label="Validation")
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("MSE")
    ax1.set_title("Loss")
    ax1.legend()

    # Right panel: mean absolute error per epoch
    ax2.plot(history.history["mae"], label="Train")
    ax2.plot(history.history["val_mae"], label="Validation")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("MAE")
    ax2.set_title("Mean Absolute Error")
    ax2.legend()

    plt.tight_layout()
    plt.show()