<a href="https://colab.research.google.com/github/Xelaro2304/MSB1015-Scientific-Programming/blob/main/Chess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U ydata-profiling
!pip install berserk

In [None]:
import os

import berserk
import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML
from scipy import stats
from ydata_profiling import ProfileReport


In [None]:
def plot_distribution(data, plot="hist", title=None, label=None, bins=30, show_stats=True, normalize=False):
    """
    Plot a histogram (numeric) or a bar chart of category counts for a single variable.

    Parameters:
    - data: array-like (list, NumPy array or pandas Series), the variable to plot
    - plot: "hist" for histogram, "count" for categorical count plot
    - title: optional plot title
    - label: optional x-axis label (defaults to "Value" for "hist", "Category" for "count")
    - bins: number of bins for histogram
    - show_stats: draw mean/median/mode reference lines (only for histogram)
    - normalize: bool, plot densities ("hist") or proportions ("count") instead of raw counts

    Raises:
    - ValueError: if plot is neither "hist" nor "count"
    """
    # Validate before touching matplotlib so a bad argument does not leave an empty figure behind.
    if plot not in ("hist", "count"):
        raise ValueError("plot must be either 'hist' or 'count'")

    plt.figure(figsize=(6,6))

    if plot == "hist":
        stat_type = 'density' if normalize else 'count'
        sns.histplot(data, bins=bins, kde=False, color=sns.color_palette("colorblind")[0], stat=stat_type)

        if show_stats:
            mean_val = np.mean(data)
            median_val = np.median(data)
            # keepdims=True keeps the result indexable regardless of the input shape.
            mode_val = stats.mode(data, keepdims=True)[0][0]
            plt.axvline(mean_val, color="red", linestyle="--", linewidth=1.5, label=f"Mean = {mean_val:.2f}")
            plt.axvline(median_val, color="green", linestyle="--", linewidth=1.5, label=f"Median = {median_val:.2f}")
            plt.axvline(mode_val, color="blue", linestyle="--", linewidth=1.5, label=f"Mode = {mode_val:.2f}")
            plt.legend()

        plt.ylabel("Density" if normalize else "Frequency")
        plt.xlabel(label if label else "Value")

    else:
        # Wrap in a Series so plain lists and arrays work as well as Series input.
        counts = pd.Series(data).value_counts(normalize=normalize)
        counts.plot(kind='bar', color=sns.color_palette("colorblind", len(counts)))
        plt.ylabel("Proportion" if normalize else "Count")
        plt.xlabel(label if label else "Category")

    if title:
        plt.title(title)
    plt.tight_layout()
    plt.show()


In [None]:
url = 'https://docs.google.com/uc?export=download&id=1lBXYMdZtKdMm4AtGWjJFjmBygUtn8w5y&confirm=t'
path = os.getcwd()
output = path + '/games.csv'
!wget -O $output 'https://docs.google.com/uc?export=download&id=1lBXYMdZtKdMm4AtGWjJFjmBygUtn8w5y&confirm=t'

In [None]:
# Load the downloaded file, parsing it with ';' as the field separator.
games_df = pd.read_csv(output, sep=';')


In [None]:
# Preview the first rows of the raw data.
games_df.head()

In [None]:
# (number of rows, number of columns)
games_df.shape

In [None]:
# Column names, non-null counts and dtypes.
games_df.info()

In [None]:
# Number of missing values per column.
games_df.isnull().sum()

In [None]:
# Summary statistics, used here to spot implausible values.
games_df.describe()

Everything seems normal except for the minimum white rating, which is negative and will be inspected further.

In [None]:
# Keep only the white ratings that are below zero (the index still identifies the affected rows).
negative_rating = games_df.loc[games_df["white_rating"] < 0, "white_rating"]
print('Number of negative values:', len(negative_rating))
negative_rating.head()

There is another game with a negative value for a rating

In [None]:
# The index of `negative_rating` holds row labels, so the rows are selected with the
# label-based .loc (positional .iloc only matches while labels equal positions).
negative_rating_indices = list(negative_rating.index)
negative_rating_info = games_df.loc[negative_rating_indices]
negative_rating_info.head()

Will check the original values of the game by fetching it with game ID

In [None]:
# Read the API token from a local file so it never appears in the notebook itself.
with open('./token') as f:
    token = f.read().strip()

# Token-authenticated session and the client built on top of it.
session = berserk.TokenSession(token)
client = berserk.Client(session)

In [None]:
# Re-fetch every affected game as PGN and read the white rating from its headers.
negative_rating_games = list(negative_rating_info["id"])
corrected_ratings = []
for game_id in negative_rating_games:
    pgn = client.games.export(game_id, as_pgn=True)
    print(pgn)
    # Locate the header by its tag and take the quoted value, rather than relying on a
    # fixed line number and a fixed four-character rating.
    white_elo_header = next(line for line in pgn.splitlines() if line.startswith('[WhiteElo '))
    corrected_ratings.append(int(white_elo_header.split('"')[1]))
print(corrected_ratings)

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# assignment below also changes `games_df`.
games_positive_rtg = games_df
# Overwrite the negative white ratings with the corrected values.
games_positive_rtg.loc[negative_rating_indices, 'white_rating'] = corrected_ratings
# Display the corrected rows.
games_positive_rtg.loc[negative_rating_indices]

In [None]:
# Build a profiling report of the dataset and save it as an HTML file.
profile = ProfileReport(games_df,title="Games report")

profile.to_file("games_report.html")


In [None]:
# Show the generated HTML report inside the notebook.
HTML(filename="games_report.html")

Things to notice:

1.   There seem to be some duplicated instances
2.   There are 400 unique increment codes, which seems problematic to use for classification
3.   Winner classes are somewhat balanced, except for the number of draws
4.   The number of draws in winner is higher than the number of draws in victory status, which will need to be checked












Duplicated instances

In [None]:
# Count how often each game id occurs; ids that occur more than once are duplicated.
game_ids = games_positive_rtg['id']
n_unique_ids = len(game_ids.unique())
print('Number of unique records:', n_unique_ids)
id_counts = game_ids.value_counts()
duplicate_counts = id_counts[id_counts > 1]
duplicate_ids = list(duplicate_counts.index)
# Every row whose id is repeated, first occurrences included.
print('Total number of duplicated records:', duplicate_counts.sum())
# Distinct ids that occur more than once.
print('Number of records duplicated:', len(duplicate_counts))
# Rows that would disappear if each id were kept exactly once.
print('Number of surplus rows:', len(game_ids) - n_unique_ids)
print('Duplicated ids:', duplicate_ids)

# The x-axis shows how many times an id occurs, not the id itself.
plot_distribution(duplicate_counts.values, 'hist', 'Number of Duplicates per Game ID', 'Occurrences per game ID', show_stats=False)


Out of the 20,058 records, 19,113 are unique, so 945 rows are surplus copies. Only 813 distinct game IDs are reported as duplicated; the difference arises because some IDs appear more than twice.

Most repeated games appear exactly twice, although some of them are repeated 3-5 times.

In [None]:
# Drop rows that are identical in every column; the first occurrence of each is kept.
games_unique = games_positive_rtg.drop_duplicates()
print(f"Original rows: {len(games_positive_rtg)}, After removing duplicates: {len(games_unique)}")

Removing exact duplicate rows only removes ~400 of them, so I'll inspect further

In [None]:
# Show every row that belongs to the first four duplicated ids, ordered by id.
sample_ids = duplicate_ids[:4]
duplicate_sample = games_positive_rtg.loc[games_positive_rtg['id'].isin(sample_ids)]
duplicate_sample.sort_values('id')

Some of the repeated instances have distinct values of "created_at" and "last_move_at", so I'll try removing those two columns before dropping duplicates

In [None]:
# Remove the two timestamp columns, then drop the rows that are identical in all remaining columns.
time_columns = ['created_at', 'last_move_at']
games_time_dropped = games_positive_rtg.drop(time_columns, axis=1)

deduplicated = games_time_dropped.drop_duplicates()
games_unique = deduplicated.reset_index(drop=True)
print(f"Original rows: {len(games_time_dropped)}, After removing duplicates: {len(games_unique)}")

All duplicates removed

Convert increment codes

I'll try to handle the increment code in two ways:


1.   Separate time into minutes and time increment per move
2.   Classify each increment code into a time control



In [None]:
# Each increment code is split on '+': the first part is the start time in minutes,
# the second the increment in seconds.
N_PREVIEW = 10  # print a short preview instead of one entry per game
increment_code = games_unique['increment_code']
increment_code_split = [code.split('+') for code in increment_code]
print(f'Splitted increment codes (first {N_PREVIEW}):', increment_code_split[:N_PREVIEW])

# As minutes and increment
start_time = [int(parts[0]) for parts in increment_code_split]
print(f'Starting time in minutes (first {N_PREVIEW}):', start_time[:N_PREVIEW])

increment = [int(parts[1]) for parts in increment_code_split]
print(f'Increment in seconds (first {N_PREVIEW}):', increment[:N_PREVIEW])



In [None]:
# Histogram of the start times with mean/median/mode reference lines.
plot_distribution(start_time, plot='hist', title='Starting time per game ID', label='Minutes', show_stats=True)

In [None]:
# Histogram of the increments with mean/median/mode reference lines.
plot_distribution(increment, plot='hist', title='Increment per game ID', label='Seconds', show_stats=True)

In [None]:
# Wrap the parsed lists in single-column DataFrames.
start_time_df = pd.DataFrame({'start_time': start_time})
increment_df = pd.DataFrame({'increment': increment})
# Count the start times that lie strictly between 0 and 1 minute.
is_fractional_minute = start_time_df.gt(0) & start_time_df.lt(1)
under_minute = is_fractional_minute.sum()
print('Games with less than 1 minute of start time:', under_minute.iloc[0])


Most games are sudden death (no increment) and have a 10-minute start time; no game starts with less than a minute.

In [None]:
# assign() overwrites a column that already exists, so re-running this cell does not
# append a second copy of `start_time` / `increment`.
games_unique = games_unique.assign(
    start_time=start_time_df['start_time'],
    increment=increment_df['increment'],
)
games_unique.info()

In [None]:
# Preview the frame with the newly added columns.
games_unique.head()

According to the data source, Lichess, time controls are decided assuming a game length of 40 moves and assigning the following categories depending on the duration:

    ≤ 29s = UltraBullet
    ≤ 179s = Bullet
    ≤ 479s = Blitz
    ≤ 1499s = Rapid
    ≥ 1500s = Classical

In [None]:
def set_time_control(minutes, increment):
    """
    Classify a game into a time-control category.

    The estimated duration in seconds is `minutes * 60 + increment * 40`
    (start time plus the increment over an assumed 40 moves).

    Parameters:
    - minutes: start time in minutes
    - increment: increment per move in seconds

    Returns:
    - one of 'UltraBullet', 'Bullet', 'Blitz', 'Rapid', 'Classical'
    """
    estimated_seconds = minutes * 60 + increment * 40
    # Inclusive upper bounds in seconds, checked from fastest to slowest.
    upper_bounds = (
        (29, 'UltraBullet'),
        (179, 'Bullet'),
        (479, 'Blitz'),
        (1499, 'Rapid'),
    )
    for limit, category in upper_bounds:
        if estimated_seconds <= limit:
            return category
    return 'Classical'

# Classify every game row by row from its start time and increment.
time_columns = games_unique[['start_time', 'increment']]
time_control = time_columns.apply(
    lambda row: set_time_control(row['start_time'], row['increment']),
    axis=1,
)
time_control_df = time_control.to_frame(name='time_control')
time_control_df.info()


In [None]:
# Bar chart of the number of games in each time-control category.
plot_distribution(time_control, plot="count", title='Time control per game ID', label='Time control')

There are so few Blitz games that they are barely visible in the plot

In [None]:
# Collect the entries classified as Blitz and report how many there are.
blitz = time_control[time_control == 'Blitz'].tolist()
print('Number of blitz games:', len(blitz))

Very few blitz games

In [None]:
# assign() overwrites an existing `time_control` column, so re-running this cell does not
# append a second copy of it.
games_unique = games_unique.assign(time_control=time_control_df['time_control'])
games_unique.info()

In [None]:
# Preview the frame including the new `time_control` column.
games_unique.head()