In [41]:
# # Dataset Overview:

# # This project uses a public dataset of Magnus Carlsen’s games played on Chess.com (thank you, Dhrubang).
# # The dataset contains game-level metadata, including player ratings, game results,
# # time controls, and full move sequences in Standard Algebraic Notation (SAN).

# ## Problem Definition:

# The objective of this project is to predict whether Magnus Carlsen wins a Rapid
# chess game using information available before the game and during the early
# opening phase.

# The prediction task is framed as a binary classification problem:
# - Win = 1
# - Loss or Draw = 0

# Draws are grouped with losses because they are frequent in Rapid chess and
# introducing a separate draw class would significantly reduce model stability
# given the limited dataset size.

In [42]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/magnus-carlsen-chess-com-games/magnus_carlsen_games.csv


In [43]:
import pandas as pd

# Load the full game export; the bare `df.shape` renders (rows, columns).
csv_path = "/kaggle/input/magnus-carlsen-chess-com-games/magnus_carlsen_games.csv"
df = pd.read_csv(csv_path)
df.shape

(6699, 13)

In [44]:
# Column names of the raw frame as a plain Python list.
list(df.columns)

['id',
 'player_name',
 'opponent_name',
 'player_rating',
 'opponent_rating',
 'format',
 'date',
 'year',
 'result',
 'player_color',
 'opponent_color',
 'result_raw',
 'moves']

In [45]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,player_name,opponent_name,player_rating,opponent_rating,format,date,year,result,player_color,opponent_color,result_raw,moves
0,0,Magnus Carlsen,RainnWilson,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. e4 g6 2. Nf3 d6 3. d4 Bg7 4. Bc4 Bg4 5. Bxf...
1,1,Magnus Carlsen,solskytz,2862,1702,Rapid,2014-12-14,2014,Win,white,black,1-0,1. d4 Nf6 2. c4 e6 3. Nc3 Bb4 4. e3 c5 5. Ne2 ...
2,2,Magnus Carlsen,Tildenbeatsu,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. e4 e5 2. Nf3 Nc6 3. Bb5 Nf6 4. O-O Nxe4 5. ...
3,3,Magnus Carlsen,mtmnfy,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. d4 e6 2. e4 d5 3. Nd2 Nc6 4. Ngf3 Nf6 5. e5...
4,4,Magnus Carlsen,stepanosinovsky,2862,2360,Rapid,2014-12-14,2014,Loss,white,black,0-1,1. d4 Nf6 2. Bg5 c5 3. d5 Ne4 4. Bc1 e6 5. c4 ...


In [46]:
# Data Prep: keep only the rows whose format is "Rapid", as an independent copy
# so later column assignments do not touch the raw frame.
is_rapid = df["format"] == "Rapid"
df_rapid = df.loc[is_rapid].copy()
df_rapid.shape

(313, 13)

In [47]:
# Binary target: 1 when the recorded result is "Win", 0 for anything else
# (losses and draws are grouped together).
df_rapid["target_win"] = (df_rapid["result"] == "Win").astype(int)

# Class balance as proportions. Only the last expression of a cell is rendered,
# so the previously discarded raw-count call has been removed.
df_rapid["target_win"].value_counts(normalize=True)

target_win
1    0.536741
0    0.463259
Name: proportion, dtype: float64

In [48]:
# Parse the date column into datetimes, then show the earliest and latest game date.
rapid_dates = pd.to_datetime(df_rapid["date"])
df_rapid["date"] = rapid_dates
rapid_dates.min(), rapid_dates.max()

(Timestamp('2014-12-14 00:00:00'), Timestamp('2025-08-04 00:00:00'))

In [49]:
# Work on a copy so the filtered Rapid frame stays as loaded.
df_feat = df_rapid.copy()

# rating_diff is player minus opponent; is_white encodes the player's colour
# as 1 (white) / 0 (black). Any other colour value maps to NaN.
color_to_flag = {"white": 1, "black": 0}
df_feat["rating_diff"] = df_feat["player_rating"] - df_feat["opponent_rating"]
df_feat["is_white"] = df_feat["player_color"].map(color_to_flag)

summary_cols = ["player_rating", "opponent_rating", "rating_diff", "is_white"]
df_feat[summary_cols].describe()

Unnamed: 0,player_rating,opponent_rating,rating_diff,is_white
count,313.0,313.0,313.0,313.0
mean,2894.233227,2647.769968,246.463259,0.539936
std,34.667929,371.802959,360.149781,0.499201
min,2785.0,259.0,-4.0,0.0
25%,2864.0,2644.0,74.0,0.0
50%,2898.0,2792.0,130.0,1.0
75%,2916.0,2829.0,248.0,1.0
max,2977.0,2886.0,2551.0,1.0


In [50]:
# Order games chronologically and renumber rows 0..n-1, so the date-based
# train/validation masks built later line up with a clean positional index.
df_feat = df_feat.sort_values("date", ignore_index=True)

In [51]:
# Features known before the first move is played.
baseline_features = [
    "player_rating",
    "opponent_rating",
    "rating_diff",
    "is_white",
]

x_baseline = df_feat.loc[:, baseline_features]
y = df_feat["target_win"]

(x_baseline.shape, y.shape)

((313, 4), (313,))

In [52]:
# Time-based train/validation split: games strictly before the cutoff are used
# for training, games on or after it are held out for validation.
cutoff_date = "2024-09-01"

game_dates = df_feat["date"]
train = game_dates < cutoff_date
val = game_dates >= cutoff_date

x_train, y_train = x_baseline.loc[train], y.loc[train]
x_val, y_val = x_baseline.loc[val], y.loc[val]

x_train.shape, x_val.shape

((228, 4), (85, 4))

In [53]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)

In [54]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the scaled pre-game features.
baseline_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
}
baseline_model = LogisticRegression(**baseline_params)

baseline_model.fit(x_train_scaled, y_train)

In [55]:
# Baseline evaluation: score the baseline logistic regression on the held-out
# validation games (F1, per-class report, confusion matrix).

from sklearn.metrics import f1_score, classification_report, confusion_matrix

y_val_pred = baseline_model.predict(x_val_scaled)

print("Baseline F1:", round(f1_score(y_val, y_val_pred), 3))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

Baseline F1: 0.528

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.73      0.65        44
           1       0.61      0.46      0.53        41

    accuracy                           0.60        85
   macro avg       0.60      0.60      0.59        85
weighted avg       0.60      0.60      0.59        85


Confusion Matrix:
[[32 12]
 [22 19]]


In [56]:
import re

def get_first_n_moves(moves_str, n_full_moves=10):
    """Return the opening of a game as a flat list of SAN move tokens.

    Parameters
    ----------
    moves_str : str or missing value
        Movetext such as ``"1. e4 e5 2. Nf3 Nc6"``. Anything that is not a
        string (``None``, ``NaN``, ``pd.NA``, ...) is treated as "no moves".
    n_full_moves : int, default 10
        Number of full moves (one white ply plus one black ply) to keep.

    Returns
    -------
    list of str
        At most ``2 * n_full_moves`` tokens, alternating white, black, ...
        Move numbers, ``{...}`` comments and game-result markers are removed,
        so they can never be mistaken for a move.
    """
    if not isinstance(moves_str, str):
        return []

    # Tokens that mark the end of the game, not a move.
    result_markers = {"1-0", "0-1", "1/2-1/2", "*"}

    # Drop brace comments first so digits inside them are not parsed as move numbers.
    without_comments = re.sub(r"\{[^}]*\}", " ", moves_str)
    # `\.+` also consumes the "N..." form used to number a black move;
    # matching a single dot would leave a stray ".." token behind.
    without_numbers = re.sub(r"\d+\.+", " ", without_comments)

    tokens = [tok for tok in without_numbers.split() if tok not in result_markers]
    return tokens[: 2 * n_full_moves]


def get_player_moves(moves, is_white):
    """Pick one side's plies out of an alternating white/black move list.

    Even positions (0, 2, 4, ...) are returned when ``is_white == 1``;
    otherwise the odd positions (1, 3, 5, ...) are returned.
    """
    first_ply = 0 if is_white == 1 else 1
    return moves[first_ply::2]

In [57]:
def extract_early_game_features(moves_str, is_white):
    """Summarise the player's own moves from the first 10 full moves.

    The movetext is tokenised with ``get_first_n_moves`` and narrowed to the
    player's side with ``get_player_moves``. Each feature is then derived
    from the SAN text of those moves.

    Returns
    -------
    pd.Series
        ``num_moves_10``          number of player moves found
        ``num_captures_10``       moves containing ``x``
        ``num_checks_10``         moves containing ``+`` or ``#``
        ``early_queen_move``      1 if any move starts with ``Q``
        ``castle_kingside_10``    1 if any move contains ``O-O`` but not ``O-O-O``
        ``castle_queenside_10``   1 if any move contains ``O-O-O``
        ``minor_piece_moves_10``  moves starting with ``N`` or ``B``
        ``pawn_moves_10``         moves whose first character is lowercase
    """
    opening = get_first_n_moves(moves_str, n_full_moves=10)
    own_moves = get_player_moves(opening, is_white)

    def count_if(predicate):
        # Number of the player's moves that satisfy `predicate`.
        return sum(1 for mv in own_moves if predicate(mv))

    def flag_if(predicate):
        # 1 when at least one of the player's moves satisfies `predicate`, else 0.
        return int(any(predicate(mv) for mv in own_moves))

    return pd.Series({
        "num_moves_10": len(own_moves),
        "num_captures_10": count_if(lambda mv: "x" in mv),
        "num_checks_10": count_if(lambda mv: ("+" in mv) or ("#" in mv)),
        "early_queen_move": flag_if(lambda mv: mv.startswith("Q")),
        "castle_kingside_10": flag_if(lambda mv: "O-O" in mv and "O-O-O" not in mv),
        "castle_queenside_10": flag_if(lambda mv: "O-O-O" in mv),
        "minor_piece_moves_10": count_if(lambda mv: mv.startswith(("N", "B"))),
        "pawn_moves_10": count_if(lambda mv: mv[0].islower()),
    })

In [58]:
# Build one row of early-game features per game. This is row-wise because each
# game's move string is parsed individually.
early_features = df_feat.apply(
    lambda row: extract_early_game_features(row["moves"], row["is_white"]),
    axis=1
)

# Make the cell safe to re-run: drop any copies of these columns left by a
# previous execution BEFORE attaching the fresh ones. Concatenating first and
# then filtering with `~columns.duplicated()` keeps the first (old) copy, so a
# re-run after changing the extractor would silently keep stale values.
df_feat = pd.concat(
    [df_feat.drop(columns=early_features.columns, errors="ignore"), early_features],
    axis=1
)

In [59]:
# Summary statistics for the engineered early-game columns.
df_feat[list(early_features.columns)].describe()

Unnamed: 0,num_moves_10,num_captures_10,num_checks_10,early_queen_move,castle_kingside_10,castle_queenside_10,minor_piece_moves_10,pawn_moves_10
count,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0
mean,10.0,1.249201,0.169329,0.345048,0.543131,0.025559,4.338658,4.383387
std,0.0,0.92778,0.500082,0.476145,0.498934,0.158069,1.132422,1.109391
min,10.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
25%,10.0,1.0,0.0,0.0,0.0,0.0,4.0,4.0
50%,10.0,1.0,0.0,0.0,1.0,0.0,4.0,4.0
75%,10.0,2.0,0.0,1.0,1.0,0.0,5.0,5.0
max,10.0,5.0,4.0,1.0,1.0,1.0,7.0,7.0


In [60]:
# Pre-game features (same list as the baseline model).
baseline_features = ["player_rating", "opponent_rating", "rating_diff", "is_white"]

# Opening features derived from the player's first 10 moves.
# `num_moves_10` is not included in this list.
early_game_features = [
    "num_captures_10",
    "num_checks_10",
    "early_queen_move",
    "castle_kingside_10",
    "castle_queenside_10",
    "minor_piece_moves_10",
    "pawn_moves_10",
]

all_features = [*baseline_features, *early_game_features]

x_extended = df_feat.loc[:, all_features]
y = df_feat["target_win"]

In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Extended model: the same logistic regression, refit on pre-game plus
# early-game features.
# NOTE: `train`, `val`, `y_train` and `y_val` are reused from the earlier
# time-based split cell; that cell must have been run first.
x_train = x_extended.loc[train]
x_val = x_extended.loc[val]

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

log_reg_ext = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42
)

log_reg_ext.fit(x_train_scaled, y_train)
y_val_pred_ext = log_reg_ext.predict(x_val_scaled)

# Label says "Extended" so this score is not confused with the baseline's,
# and so it matches the output recorded under this cell.
print("Extended Logistic Regression F1:",
      round(f1_score(y_val, y_val_pred_ext), 3))
print(confusion_matrix(y_val, y_val_pred_ext))
print(classification_report(y_val, y_val_pred_ext))

Extended Logistic Regression F1: 0.417
[[28 16]
 [26 15]]
              precision    recall  f1-score   support

           0       0.52      0.64      0.57        44
           1       0.48      0.37      0.42        41

    accuracy                           0.51        85
   macro avg       0.50      0.50      0.49        85
weighted avg       0.50      0.51      0.50        85



In [62]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled `x_train` / `x_val` frames currently in scope.
rf_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "min_samples_leaf": 5,
    "class_weight": "balanced",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params).fit(x_train, y_train)

y_val_pred_rf = rf.predict(x_val)
rf_f1 = f1_score(y_val, y_val_pred_rf)

print("Random Forest F1:",
      round(rf_f1, 3))
print(confusion_matrix(y_val, y_val_pred_rf))
print(classification_report(y_val, y_val_pred_rf))

Random Forest F1: 0.5
[[28 16]
 [22 19]]
              precision    recall  f1-score   support

           0       0.56      0.64      0.60        44
           1       0.54      0.46      0.50        41

    accuracy                           0.55        85
   macro avg       0.55      0.55      0.55        85
weighted avg       0.55      0.55      0.55        85

