# Magnus Game Analysis:
- Using Magnus Carlsen's Chess.com games dataset
- https://www.kaggle.com/datasets/dhrubangtalukdar/magnus-carlsen-chess-com-games
- ## Logistic Regression Algorithm:
- For Classification.

In [49]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

## Load the Dataset:

In [3]:
game_df = pd.read_csv("magnus_carlsen_games.csv")

In [5]:
game_df.head()

Unnamed: 0,id,player_name,opponent_name,player_rating,opponent_rating,format,date,year,result,player_color,opponent_color,result_raw,moves
0,0,Magnus Carlsen,RainnWilson,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. e4 g6 2. Nf3 d6 3. d4 Bg7 4. Bc4 Bg4 5. Bxf...
1,1,Magnus Carlsen,solskytz,2862,1702,Rapid,2014-12-14,2014,Win,white,black,1-0,1. d4 Nf6 2. c4 e6 3. Nc3 Bb4 4. e3 c5 5. Ne2 ...
2,2,Magnus Carlsen,Tildenbeatsu,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. e4 e5 2. Nf3 Nc6 3. Bb5 Nf6 4. O-O Nxe4 5. ...
3,3,Magnus Carlsen,mtmnfy,2862,1200,Rapid,2014-12-14,2014,Win,white,black,1-0,1. d4 e6 2. e4 d5 3. Nd2 Nc6 4. Ngf3 Nf6 5. e5...
4,4,Magnus Carlsen,stepanosinovsky,2862,2360,Rapid,2014-12-14,2014,Loss,white,black,0-1,1. d4 Nf6 2. Bg5 c5 3. d5 Ne4 4. Bc1 e6 5. c4 ...


## Inspect the Dataset Structure:

In [9]:
# Inspect the Dataset Structure & Basic Stats:
game_df.info()
game_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6699 entries, 0 to 6698
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               6699 non-null   int64 
 1   player_name      6699 non-null   object
 2   opponent_name    6699 non-null   object
 3   player_rating    6699 non-null   int64 
 4   opponent_rating  6699 non-null   int64 
 5   format           6699 non-null   object
 6   date             6699 non-null   object
 7   year             6699 non-null   int64 
 8   result           6699 non-null   object
 9   player_color     6699 non-null   object
 10  opponent_color   6699 non-null   object
 11  result_raw       6699 non-null   object
 12  moves            6695 non-null   object
dtypes: int64(4), object(9)
memory usage: 680.5+ KB


Unnamed: 0,id,player_rating,opponent_rating,year
count,6699.0,6699.0,6699.0,6699.0
mean,3349.0,3223.602478,2962.436483,2023.460069
std,1933.979059,129.367877,258.192988,1.723239
min,0.0,2619.0,259.0,2014.0
25%,1674.5,3212.0,2907.5,2023.0
50%,3349.0,3258.0,3026.0,2024.0
75%,5023.5,3299.0,3095.0,2025.0
max,6698.0,3401.0,3429.0,2026.0


## Decide the Target Variable:

In [11]:
game_df["result_raw"].value_counts()

result_raw
1-0        4756
0-1        1326
0.5-0.5     617
Name: count, dtype: int64

In [20]:
# Convert to binary (example: Win vs Not Win)

game_df["target"] = game_df["result_raw"].apply(
    lambda x: 1 if x == '0-1' else 0
)

game_df["target"].value_counts()

target
0    5373
1    1326
Name: count, dtype: int64

In [21]:
game_df.sample(5)

Unnamed: 0,id,player_name,opponent_name,player_rating,opponent_rating,format,date,year,result,player_color,opponent_color,result_raw,moves,target
5350,5350,Magnus Carlsen,baki83,3269,2925,Blitz,2025-04-15,2025,Win,white,black,1-0,1. d4 Nf6 2. Nc3 d5 3. Bf4 Bf5 4. e3 e6 5. Bd3...,0
1279,1279,Magnus Carlsen,BuLolo,3146,3005,Blitz,2023-01-04,2023,Win,white,black,1-0,1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. N...,0
3319,3319,Magnus Carlsen,SpeedofLight0,3223,3069,Blitz,2024-04-22,2024,Loss,black,white,0-1,1. d4 d5 2. Nc3 Nf6 3. Bf4 g6 4. Qd2 Bg7 5. O-...,1
6221,6221,Magnus Carlsen,dogsofwar,3315,3268,Blitz,2025-10-09,2025,Loss,black,white,0-1,1. d4 Nf6 2. c4 g6 3. Nc3 Bg7 4. e4 d6 5. Be2 ...,1
911,911,Magnus Carlsen,DanielNaroditsky,3234,3101,Blitz,2022-11-20,2022,Win,white,black,1-0,1. e4 c6 2. d4 d5 3. exd5 cxd5 4. Bd3 Nf6 5. h...,0


## Drop Unusable Columns:

In [25]:
print(game_df.columns)

# Automatically find non-numeric columns (except target)
non_numeric_cols = game_df.select_dtypes(include="object").columns
non_numeric_cols = non_numeric_cols.drop("result_raw")

game_df = game_df.drop(columns=non_numeric_cols)

game_df

Index(['id', 'player_rating', 'opponent_rating', 'year', 'result_raw',
       'target'],
      dtype='object')


Unnamed: 0,id,player_rating,opponent_rating,year,result_raw,target
0,0,2862,1200,2014,1-0,0
1,1,2862,1702,2014,1-0,0
2,2,2862,1200,2014,1-0,0
3,3,2862,1200,2014,1-0,0
4,4,2862,2360,2014,0-1,1
...,...,...,...,...,...,...
6694,6694,2854,2724,2026,1-0,0
6695,6695,2859,2689,2026,1-0,0
6696,6696,2849,2781,2026,0-1,1
6697,6697,2853,2645,2026,1-0,0


## Handle Categorical Features % Check Missing Values:

In [31]:
game_df = pd.get_dummies(game_df, drop_first=True)
game_df.isnull().sum()

id                    0
player_rating         0
opponent_rating       0
year                  0
target                0
result_raw_0.5-0.5    0
result_raw_1-0        0
dtype: int64

## Split Features and Target:

In [33]:
X = game_df.drop("target", axis=1)
y = game_df["target"]

## Train–Test and Split: (Stratified)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

## Feature Scaling: (CRITICAL)

In [38]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train Logistic Regression Model:

In [39]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

## Make Predictions using Model:

In [40]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

## Evaluate the Model Properly:

In [50]:
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_prob))

Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1 Score : 1.0
ROC-AUC  : 1.0


## Confusion Matrix (Interpretation)

In [45]:
confusion_matrix(y_test, y_pred)

array([[1075,    0],
       [   0,  265]])

## Interpret Model Coefficients:

In [51]:
coefficients = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_[0]
}).sort_values(by="Coefficient", ascending=False)

coefficients.head(10)

Unnamed: 0,Feature,Coefficient
2,opponent_rating,0.304851
3,year,0.035447
0,id,-0.028852
1,player_rating,-0.142765
4,result_raw_0.5-0.5,-3.464047
5,result_raw_1-0,-5.824254
