In [None]:
# Inject the notebook-wide stylesheet used by the <div class="alert ..."> call-outs.
# HTML and display come from IPython.display, the documented public module.
from IPython.display import HTML, display
styles = '''@import url('https://fonts.googleapis.com/css?family=Quicksand&display=swap');
 * {
	 margin: 0;
	 padding: 0;
	 box-sizing: border-box;
}
 h3 {
	 font-family: Comic Sans MS;
}
 .alert {
	 width: 80%;
	 margin: 20px auto;
	 padding: 30px;
	 position: relative;
	 border-radius: 5px;
	 box-shadow: 0 0 15px 5px #ccc;
}
 .close {
	 position: absolute;
	 width: 30px;
	 height: 30px;
	 opacity: 0.5;
	 border-width: 1px;
	 border-style: solid;
	 border-radius: 50%;
	 right: 15px;
	 top: 25px;
	 text-align: center;
	 font-size: 1.6em;
	 cursor: pointer;
}
 .simple-alert {
	 background-color: #aed6e5;
	 border-left: 5px solid #245b70;
}
 .simple-alert .close {
	 border-color: #245b70;
	 color: #245b70;
}
 .success-alert {
	 background-color: #aee5c0;
	 border-left: 5px solid #24703d;
}
 .success-alert .close {
	 border-color: #24703d;
	 color: #24703d;
}
 .danger-alert {
	 background-color: #e5aeae;
	 border-left: 5px solid #702424;
}
 .danger-alert .close {
	 border-color: #702424;
	 color: #702424;
}
 .warning-alert {
	 background-color: #ffe6a9;
	 border-left: 5px solid #a97800;
}
 .warning-alert .close {
	 border-color: #a97800;
	 color: #a97800;
}
'''
# A bare HTML(...) expression is only rendered when it is the last statement
# of a cell; display() renders it wherever this line sits.
display(HTML("<style>"+styles+"</style>"))
!pip install rich
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from rich.console import Console
from rich import print
from rich.theme import Theme

# Notebook-sized plots with enlarged fonts and thicker lines.
sns.set_context(
    context="notebook",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5},
)

def custom_palette(custom_colors):
    """Make ``custom_colors`` the active seaborn palette and draw a preview.

    Parameters
    ----------
    custom_colors : sequence of colors
        Anything accepted by ``sns.color_palette`` (here: a list of hex strings).

    Returns
    -------
    None
        The function only has side effects: it changes the active palette
        and draws a swatch strip with ``sns.palplot``.
    """
    # Build the palette once and reuse it for both calls.
    colors = sns.color_palette(custom_colors)
    # set_palette returns None, so its result is not bound to a name.
    sns.set_palette(colors)
    sns.palplot(colors, size=0.8)
    # Hide tick marks and labels so only the colour swatches remain.
    plt.tick_params(axis='both', labelsize=0, length=0)

# Colour list passed to custom_palette below.
palette = [
    "#7209B7",
    "#3F88C5",
    "#136F63",
    "#F72585",
    "#FFBA08",
]
# 20-step diverging palette between hues 120 and 220.
palette2 = sns.diverging_palette(h_neg=120, h_pos=220, n=20)
custom_palette(palette)

# Named rich styles for console output.
custom_theme = Theme(
    dict(
        info="italic bold cyan",
        warning="italic bold magenta",
        danger="bold blue",
    )
)

console = Console(theme=custom_theme)

![](https://kgcorner.com/wp-content/uploads/2021/05/fq4cqqdmz4jv9agitg72.jpeg)
<center><h1>Extracting Features from Portable Game Notation (PGN)</h1></center>
<center>The objective of this notebook is to extract additional features from the PGN column of each game.</center>
<h3>What is a PGN</h3>
Portable Game Notation (PGN) is a standard plain text format for recording chess games (both the moves and related data), which can be read by humans and is also supported by most chess software.

In [None]:
# Read the club games CSV and show its first two rows.
df = pd.read_csv(
    filepath_or_buffer='../input/chesscom-user-games-60000-games/club_games_data.csv'
)
df.head(n=2)

<div class="alert success-alert">
    
📌 <b>What we are doing in this notebook</b>:<br> We have 14 features, the last one is the **PGN**.<br>
    We can extract many more features from it.<br>
    These features may be essential in improving your model accuracy. <br>
    
</div>



### Example PGN


In [None]:
# Show the PGN of the row labelled 0, one list entry per line.
df['pgn'].loc[0].split(sep='\n')

The additional features we can extract from the pgn are :-<br>
'Event', 'Site', 'Start_Date', 'End_Date', 'Start_time', <br>'End_time', 'Round', 'Result', 'Tournament', 'ECO',<br> 'First_Move', 'Second_Move', 'Third_Move', 'Fourth_Move'.

## Extracting Features

#### Extracting Features Event, Site, Start_Date, End_Date, Start_Time, End_Time, Eco, EcoName, Round, Result, Game_Type

In [None]:
# Each pair is (new column name, index of the matching line in
# pgn.split('\n')); negative indices count back from the end of the PGN.
header_fields = [
    ('Event', 0),
    ('Site', 1),
    ('Start_Date', 2),
    ('End_Date', -6),
    ('Start_Time', -7),
    ('End_Time', -5),
    ('Eco', -15),
    ('EcoName', -14),
    ('Round', 3),
    ('Result', 6),
]
feature_names = [name for name, _ in header_fields]
feature_positions = [line_index for _, line_index in header_fields]


def _quoted_value(pgn, line_index):
    """Return the text between the first pair of double quotes on one PGN line."""
    header_line = pgn.split('\n')[line_index]
    return header_line.split('"')[1]


# Create one dataframe column per (name, position) pair.
for feature_name, position in zip(feature_names, feature_positions):
    df[feature_name] = df['pgn'].apply(_quoted_value, args=(position,))

#### Creating Eco_Name feature 

ECO (Encyclopaedia of Chess Openings) codes are a classification system for chess opening moves.<br>
There are five main categories, "A" to "E", corresponding to the five volumes of the earlier editions, each of which is further subdivided into 100 subcategories, for a total of 500 codes. The term "ECO" is often used as a shorthand for this coding system.<br>
We can also extract the Eco_Name using the EcoName feature we extracted from the pgn

In [None]:
# EcoName value of the first row, before it is trimmed in the next cell.
df['EcoName'].iloc[0]

In [None]:
# Keep only the text after the last '/' of each EcoName value.
df['EcoName'] = df['EcoName'].map(lambda eco_url: eco_url.rsplit('/', 1)[-1])

#### Extracting Is_Tournament Feature

If the game is part of a Chess.com tournament, the PGN header line at index 7 (the eighth line) carries the Tournament key.<br>
**Using this Information we can create another feature** :- Is_Tournament

In [None]:
def is_tournament(x):
    """Tell whether a PGN string carries a ``Tournament`` header at line index 7.

    Parameters
    ----------
    x : str
        Full PGN text, with header lines separated by ``'\\n'``.

    Returns
    -------
    bool
        True when the line at index 7 starts with ``[Tournament `` followed
        by a quoted value. False otherwise, including when the PGN has fewer
        than eight lines.
    """
    lines = x.split('\n')
    # A truncated or empty PGN has no line 7; treat it as "not a tournament"
    # rather than raising IndexError.
    if len(lines) <= 7:
        return False
    # Text before the first quote is '[Key '; strip the '[' and trailing space.
    return lines[7].split('"')[0][1:-1] == 'Tournament'
    
# Flag every game whose PGN carries the Tournament header.
df['Is_tournament'] = df['pgn'].map(is_tournament)

#### Extracting the Moves

Now, let's extract the moves played from the PGN. <br>
In the case of moves we see two schemas:-<br>

In [None]:
# Second-to-last line of the first game's PGN.
df['pgn'].iloc[0].split("\n")[-2]

In [None]:
# Second-to-last line of the last game's PGN.
df['pgn'].iloc[-1].split("\n")[-2]

In this notebook I am just extracting the moves and not the time taken, although it might be interesting to do an analysis on it.<br>

In [None]:
def extract_move(pgn):
    """Return the list of move tokens found on the second-to-last PGN line.

    Two layouts of that line are handled:

    * If ``pgn`` contains ``'{['`` (comment blocks after each move), the line
      is assumed to repeat groups of four whitespace-separated tokens, with
      the move as the second token of each group.
    * Otherwise the line is assumed to be move numbers, moves and a final
      result marker. Tokens ending in ``'.'`` (move numbers) and result
      markers are dropped.

    Parameters
    ----------
    pgn : str
        Full PGN text whose second-to-last ``'\\n'``-separated line holds the
        moves.

    Returns
    -------
    list of str
        The moves in the order played, without move numbers, comments or the
        result marker.
    """
    tokens = pgn.split("\n")[-2].split()
    if pgn.find('{[') != -1:
        return tokens[1::4]
    # Filter by token kind rather than by position. A positional [::3] slice
    # misses the trailing result marker whenever the game ends on White's
    # move, which left '1-0' / '0-1' / '1/2-1/2' inside the move list.
    result_markers = ('1-0', '0-1', '1/2-1/2', '*')
    return [token for token in tokens
            if not token.endswith('.') and token not in result_markers]

In [None]:
# Store each game's move list in a new Moves column.
df['Moves'] = df['pgn'].map(extract_move)

## Filtering the data, and creating basic features

In [None]:
# Count the distinct values in Site and Round, then report them.
site_unique_count = df['Site'].nunique()
round_unique_count = df['Round'].nunique()
print(f"Number of unique values in 'Site': {site_unique_count}")
print(f"Number of unique values in 'Round': {round_unique_count}")

Both of these columns have only one unique value making them kind of redundant, so we can drop these features

In [None]:
# columns= already names the axis, so the extra axis=1 was redundant.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
df = df.drop(columns=['Site', 'Round'])

In [None]:
# Compare the two per-player result columns with the overall Result.
df.loc[:, ['white_result', 'black_result', 'Result']]

As we can see, Result already tells us who won (or whether it was a draw). <br> So we don't need to keep both of these columns, as they are largely redundant. Wherever one column says win, we drop that value and keep the other column's value, which creates result_type.

In [None]:
# result_type keeps the non-'win' half of the result pair: white_result by
# default, black_result on the rows where White won.
idx = df.index[df['white_result'] == 'win']
df['result_type'] = df['white_result']
# A single .loc assignment replaces the chained df['result_type'][idx] = ...,
# which raises SettingWithCopyWarning and does not write through to df under
# pandas Copy-on-Write. It also avoids the temporary integer 0 sentinel.
df.loc[idx, 'result_type'] = df.loc[idx, 'black_result']
df = df.drop(columns=['white_result', 'black_result'])

In [None]:
# Turn the PGN score into the winner's colour: '0-1' -> 'Black',
# '1-0' -> 'White', anything else -> 'Draw'.
result_labels = {'0-1': 'Black', '1-0': 'White'}
# Values that are already labels pass through unchanged, so re-running this
# cell no longer collapses every row to 'Draw'.
final_labels = ('Black', 'White', 'Draw')
df['Result'] = df['Result'].apply(
    lambda x: x if x in final_labels else result_labels.get(x, 'Draw'))

In [None]:
# White's rating minus Black's rating.
df['rating_difference'] = df['white_rating'].sub(df['black_rating'])

In [None]:
# Horizontal bar chart of how many games use each rule set.
sns.countplot(y=df['rules'])

People got bored playing Chess, so they created variants like chess960, threecheck, crazyhouse and kingofthehill. As fun as these variants are, we will train our model for predicting plain old chess.<br>
After filtering the data, the rules column will only have one unique value, so we can drop that too.

In [None]:
# Keep only standard chess games, then drop the now-constant rules column.
# Chaining .drop() returns a fresh frame, which avoids the
# SettingWithCopyWarning that an in-place drop on a filtered frame can emit.
df = df[df['rules'] == 'chess'].drop(columns='rules')

I have already extracted all the features I wanted from PGN column in this [notebook](https://www.kaggle.com/adityajha1504/extracting-features-from-pgn/edit/run/69682662), so I will be dropping it.<br> I will also be dropping the FEN column. FEN is a standard notation for describing a particular board position of a chess game. The purpose of FEN is to provide all the necessary information to restart a game from a particular position.

In [None]:
# Rebind rather than drop in place: df was produced by a boolean filter
# earlier, and in-place edits of such a frame can emit SettingWithCopyWarning.
df = df.drop(columns=['fen', 'pgn'])

## Saving the new csv

The final csv created by us looks like this:-

In [None]:
# First two rows of the frame about to be saved.
df.head(n=2)

In [None]:
# Write the cleaned frame to disk, leaving out the row index.
df.to_csv(path_or_buf='../working/df_clean.csv', index=False)