In [1]:
# Import dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Create engine
# The password is percent-encoded before it is placed in the URL so that characters
# such as "@", ":", "/" or "%" inside it are not misread as URL delimiters.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/lichess_data"
engine = create_engine(db_string)

# Open a connection that the following cells use for reading and dropping the table
conn = engine.connect()

In [3]:
# Read the whole chess_data table over the open connection into a DataFrame, then preview it
chess_df = pd.read_sql_table(table_name="chess_data", con=conn)
chess_df.head()

Unnamed: 0,id,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
0,0,2017-08-31 20:06:40,13,white,bourgris,1500,a-00,1191,309,D10,Slav Defense: Exchange Variation,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4 Nc3 Ba5 Bf4
1,1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,Nimzowitsch Defense: Kennedy Variation,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2,2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game: Leonardis Variation,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3,3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game: Zukertort Variation,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4,4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...


In [4]:
# Drop unneeded ID column
# errors="ignore" keeps this cell re-runnable: the table written back at the end of the
# notebook no longer contains an "id" column, so a plain drop would raise KeyError the
# next time the notebook is run from the top.
chess_df = chess_df.drop(columns="id", errors="ignore")
chess_df.head()

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
0,2017-08-31 20:06:40,13,white,bourgris,1500,a-00,1191,309,D10,Slav Defense: Exchange Variation,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4 Nc3 Ba5 Bf4
1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,Nimzowitsch Defense: Kennedy Variation,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game: Leonardis Variation,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game: Zukertort Variation,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...


### Remove rows without a winner

In [5]:
# Check value counts of winner column to see which result labels occur and how often
chess_df["winner"].value_counts()

white    507871
black    472378
draw      39166
*           214
Name: winner, dtype: int64

In [6]:
# Remove * rows (games whose winner is recorded as "*")
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column and cell assignments made later in the notebook do not trigger
# SettingWithCopyWarning. Index labels are deliberately left as they were.
chess_df = chess_df.loc[chess_df["winner"] != "*"].copy()

# Check value counts again
chess_df["winner"].value_counts()

white    507871
black    472378
draw      39166
Name: winner, dtype: int64

In [7]:
# Preview the first rows of the frame after the "*" rows were removed
chess_df.head()

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
0,2017-08-31 20:06:40,13,white,bourgris,1500,a-00,1191,309,D10,Slav Defense: Exchange Variation,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4 Nc3 Ba5 Bf4
1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,Nimzowitsch Defense: Kennedy Variation,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game: Leonardis Variation,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game: Zukertort Variation,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...


### Simplify opening names

In [8]:
# Copy the ECO code column into its own single-column frame so it can be transformed
opening_df = chess_df[["opening_eco"]].copy()
opening_df.head()

Unnamed: 0,opening_eco
0,D10
1,B00
2,C20
3,D02
4,C41


In [9]:
# Load the hand-made lookup CSV; its first column becomes the key and the
# "ECO_title" column supplies the simplified opening name
opening_dict = pd.read_csv("../datasets/clean/Chess_Openings.csv", index_col=0)["ECO_title"].to_dict()

# Swap every ECO code found in the lookup for its simplified name
opening_df = opening_df.replace(to_replace={"opening_eco": opening_dict})

# Overwrite the detailed opening names in chess_df with the simplified ones
chess_df["opening_name"] = opening_df["opening_eco"]
chess_df.head()

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
0,2017-08-31 20:06:40,13,white,bourgris,1500,a-00,1191,309,D10,Queen's Gambit,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4 Nc3 Ba5 Bf4
1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,King's Pawn Opening,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor's Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...


### Replace ratings over 3k

In [10]:
# Show the games where white's rating is above 3000
chess_df.loc[chess_df["white_rating"].gt(3000)]

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
500602,2016-07-01 22:24:02,101,white,BahadirOzen,3006,AstanehChess,2545,461,A00,Polish Opening,d3 c5 Nf3 e6 g3 Nf6 Bg2 d5 O-O Bd6 Nbd2 O...
677731,2016-07-15 02:35:03,81,white,UnVieuxMonsieur,3097,hatembenarfa,2604,493,A01,Nimzovich-Larsen Attack,b3 a6 Bb2 b5 e3 Bb7 Nf3 Nf6 Be2 c5 O-O Nc...


In [11]:
# Cap white ratings at 3000 and recompute the rating difference for the capped games.
# Rows are selected with a boolean mask rather than hard-coded index labels, so the
# cell keeps targeting the right games if the table's row order or contents change.
RATING_CAP = 3000
white_over_cap = chess_df["white_rating"] > RATING_CAP

# Replace rating
chess_df.loc[white_over_cap, "white_rating"] = RATING_CAP

# Fix Rating difference (white minus black, the same sign convention the column already uses)
chess_df.loc[white_over_cap, "rating_difference"] = (
    chess_df.loc[white_over_cap, "white_rating"]
    - chess_df.loc[white_over_cap, "black_rating"]
)

In [12]:
# Show the games where black's rating is above 3000
chess_df.loc[chess_df["black_rating"].gt(3000)]

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
195222,2016-07-15 02:13:20,85,white,hatembenarfa,2620,UnVieuxMonsieur,3080,-460,A01,Nimzovich-Larsen Attack,b3 g6 Bb2 f6 Nf3 Bg7 e4 Nh6 d4 Nf7 e5 O-O...
690032,2016-07-15 02:22:53,45,black,hatembenarfa,2627,UnVieuxMonsieur,3067,-440,E90,King's Indian Defense,Nf3 d6 d4 g6 c4 Bg7 Nc3 Nf6 e4 O-O e5 dxe...


In [13]:
# Cap black ratings at 3000 and recompute the rating difference for the capped games.
# Rows are selected with a boolean mask rather than hard-coded index labels, so the
# cell keeps targeting the right games if the table's row order or contents change.
RATING_CAP = 3000
black_over_cap = chess_df["black_rating"] > RATING_CAP

# Replace rating
chess_df.loc[black_over_cap, "black_rating"] = RATING_CAP

# Fix Rating difference (white minus black, the same sign convention the column already uses)
chess_df.loc[black_over_cap, "rating_difference"] = (
    chess_df.loc[black_over_cap, "white_rating"]
    - chess_df.loc[black_over_cap, "black_rating"]
)

### Push back to SQL

In [14]:
# Drop original chess_data table
# IF EXISTS lets this cell be re-run safely when the table is already gone, e.g. when
# retrying after the write-back in the next cell failed part-way.
# NOTE(review): passing a plain SQL string to Connection.execute is deprecated in
# SQLAlchemy 1.4 and rejected in 2.0 - wrap it in sqlalchemy.text() when upgrading.
conn.execute("DROP TABLE IF EXISTS chess_data")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2f5d0123548>

In [15]:
# Push the cleaned table back into PostgreSQL as chess_data.
# if_exists="replace" makes the write self-contained: it succeeds whether or not the
# old table was dropped beforehand, instead of failing if chess_data still exists.
# chunksize writes the rows in batches rather than all at once.
# index=False leaves the DataFrame index out of the table.
chess_df.to_sql(
    name="chess_data",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=10_000,
)