In [1]:
import datetime as dt
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Load the three raw input files into DataFrames
raw_games_path = "../datasets/raw/games.csv"
raw_lichess_path = "../datasets/raw/chess_games.csv"
raw_titles_path = "../datasets/raw/chess_titles.csv"

games_df = pd.read_csv(raw_games_path)
lichess_df = pd.read_csv(raw_lichess_path)
titles_df = pd.read_csv(raw_titles_path)

In [3]:
# games_df cleaning
# Columns that the cleaned table does not keep
unused_game_columns = ["rated", "last_move_at", "victory_status", "increment_code", "opening_ply"]

# Remove unused columns, exact duplicate rows, and rows with any missing value
games_df = games_df.drop(columns=unused_game_columns).drop_duplicates().dropna()

# Overwrite "id" with a sequential counter and promote it to the index
games_df = games_df.assign(id=range(len(games_df))).set_index("id")

# Characters stripped from the move text
annotation_chars = re.compile('[!@#$?+]')

# Derived / normalised columns:
#   rating_difference - white rating minus black rating
#   created_at        - floor-divided by 1000, then parsed as seconds since the epoch
#   moves             - move text with the characters above removed
games_df = games_df.assign(
    rating_difference=games_df["white_rating"] - games_df["black_rating"],
    created_at=pd.to_datetime(games_df["created_at"] // 1000, unit="s"),
    moves=games_df["moves"].map(lambda move_text: annotation_chars.sub('', move_text)),
)

# Final column layout of the cleaned table
final_game_columns = [
    "created_at",
    "turns",
    "winner",
    "white_id",
    "white_rating",
    "black_id",
    "black_rating",
    "rating_difference",
    "opening_eco",
    "opening_name",
    "moves",
]
games_df = games_df[final_game_columns]

# Display the cleaned frame as a final check
games_df

Unnamed: 0_level_0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2017-08-31 20:06:40,13,white,bourgris,1500,a-00,1191,309,D10,Slav Defense: Exchange Variation,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4 Nc3 Ba5 Bf4
1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,Nimzowitsch Defense: Kennedy Variation,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game: Leonardis Variation,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game: Zukertort Variation,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...
...,...,...,...,...,...,...,...,...,...,...,...
19624,2017-07-11 16:35:14,24,white,belcolt,1691,jamboger,1220,471,A80,Dutch Defense,d4 f5 e3 e6 Nf3 Nf6 Nc3 b6 Be2 Bb7 O-O Be7 Ne5...
19625,2017-07-10 14:48:09,82,black,jamboger,1233,farrukhasomiddinov,1196,37,A41,Queen's Pawn,d4 d6 Bf4 e5 Bg3 Nf6 e3 exd4 exd4 d5 c3 Bd6 Bd...
19626,2017-07-10 14:44:37,35,white,jamboger,1219,schaaksmurf3,1286,-67,D00,Queen's Pawn Game: Mason Attack,d4 d5 Bf4 Nc6 e3 Nf6 c3 e6 Nf3 Be7 Bd3 O-O Nbd...
19627,2017-07-10 14:15:27,109,white,marcodisogno,1360,jamboger,1227,133,B07,Pirc Defense,e4 d6 d4 Nf6 e5 dxe5 dxe5 Qxd1 Kxd1 Nd5 c4 Nb6...


In [4]:
# lichess_df cleaning
# Drop unneeded columns, then duplicate rows and rows with any missing value
lichess_df = (
    lichess_df
    .drop(columns=["Event", "TimeControl", "WhiteRatingDiff", "BlackRatingDiff", "Termination"])
    .drop_duplicates()
    .dropna()
)

# Randomly sample up to 1 million rows (seeded, so the sample is reproducible).
# Capping at len() keeps the cell working when fewer rows survive the cleaning above.
lichess_df = lichess_df.sample(min(1000000, len(lichess_df)), random_state=42)

# Create a sequential ID and set it as the index
lichess_df = lichess_df.assign(id=range(len(lichess_df))).set_index("id")

# Merge date and time columns, convert to datetime, then drop the original columns.
# Vectorised string concatenation replaces the row-wise ' '.join aggregation.
lichess_df["created_at"] = pd.to_datetime(lichess_df["UTCDate"] + " " + lichess_df["UTCTime"])
lichess_df = lichess_df.drop(columns=["UTCDate", "UTCTime"])

# Create rating difference column (white minus black)
lichess_df["rating_difference"] = lichess_df["WhiteElo"] - lichess_df["BlackElo"]

# Change result column from score notation to the winner's colour.
# regex=False makes these literal substring replacements on every pandas version.
for score, outcome in (("1-0", "white"), ("1/2-1/2", "draw"), ("0-1", "black")):
    lichess_df["Result"] = lichess_df["Result"].str.replace(score, outcome, regex=False)

# Cleaning moves column
# The patterns are applied in this order; raw strings keep the backslashes
# from being treated as (invalid) Python string escapes.
move_cleanup_patterns = (
    r'\{ \[%eval #?-?\d{1,3}\.?\d{0,2}\] \}',  # stockfish evals
    r' \d{1,3}\.{3} ',                         # ellipsis markers such as " 1... "
    r'[!@#$?+]',                               # special characters
    r'\d{1,3}\. ',                             # turn numbering such as "1. "
)
cleaned_moves = lichess_df["AN"]
for pattern in move_cleanup_patterns:
    cleaned_moves = cleaned_moves.str.replace(pattern, '', regex=True)

# Remove scores at end of moves (literal matches)
for score in ('1-0', '0-1', '1/2-1/2'):
    cleaned_moves = cleaned_moves.str.replace(score, '', regex=False)
lichess_df["AN"] = cleaned_moves

# creating turns column: number of whitespace-separated tokens left in the move text
lichess_df["turns"] = lichess_df["AN"].str.count(r'\S{1,10}')

# Rename columns to match games_df
lichess_df = lichess_df.rename(columns={"White": "white_id",
                                        "Black": "black_id",
                                        "WhiteElo": "white_rating",
                                        "BlackElo": "black_rating",
                                        "Result": "winner",
                                        "ECO": "opening_eco",
                                        "Opening": "opening_name",
                                        "AN": "moves"})

# Reorder columns to match games_df
lichess_df = lichess_df[["created_at",
                         "turns",
                         "winner",
                         "white_id",
                         "white_rating",
                         "black_id",
                         "black_rating",
                         "rating_difference",
                         "opening_eco",
                         "opening_name",
                         "moves"]]

# Final check
lichess_df.head()

Unnamed: 0_level_0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2016-07-24 12:40:27,67,white,LenyaMangal,1446,frayouf,1466,-20,B01,Scandinavian Defense: Mieses-Kotroc Variation,e4 d5 exd5 Qxd5 Nc3 Qe6 Be2 Nc6 Nf3 Qg6 d3 e5 ...
1,2016-07-11 15:20:32,42,black,bhbrianwong,1869,pie1peg,2075,-206,A04,Zukertort Opening: Pirc Invitation,Nf3 d6 g3 c6 Bg2 e5 O-O g6 d3 Bg7 Nbd2 Nf6 e4 ...
2,2016-07-18 16:28:46,77,white,PawnSean,1550,raflalink,1678,-128,B00,Caro-Kann Defense: Hillbilly Attack,e4 c6 Bc4 d5 exd5 cxd5 Bb3 Bf5 d3 e6 a3 Nf6 Nc...
3,2016-07-18 05:49:38,55,white,nerowolf,1704,pkeres12,1822,-118,A43,Benoni Defense: Old Benoni,d4 c5 d5 d6 c4 e5 dxe6 fxe6 e3 Nf6 Nf3 Be7 Be2...
4,2016-07-10 07:10:00,97,white,regisluiz,1813,ROMC,1953,-140,C00,French Defense: Knight Variation,e4 e6 Nf3 d5 exd5 exd5 d4 Nf6 Nc3 Bd6 Bg5 O-O ...


In [5]:
# titles_df cleaning
# Use the ELO_rating column as the index of the titles table
titles_df = titles_df.set_index(keys="ELO_rating")

# Preview the first five rows
titles_df.head(n=5)

Unnamed: 0_level_0,title
ELO_rating,Unnamed: 1_level_1
0,Novice
1,Novice
2,Novice
3,Novice
4,Novice


In [6]:
# Write each cleaned frame to its CSV file (index included)
clean_csv_targets = (
    (games_df, "../datasets/clean/games.csv"),
    (lichess_df, "../datasets/clean/chess_games.csv"),
    (titles_df, "../datasets/clean/chess_titles.csv"),
)
for clean_frame, clean_path in clean_csv_targets:
    clean_frame.to_csv(clean_path)

In [7]:
# Create engine
# Percent-encode the password before placing it in the connection URL so that
# reserved URL characters in it (e.g. "@", "/", ":", "%") are not misread as
# part of the URL structure. safe='' encodes every reserved character.
# NOTE: db_password is expected to hold the raw, un-encoded password.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/lichess_data"
engine = create_engine(db_string)

In [8]:
# games_df, lichess_df, and titles_df into pgadmin
# if_exists="replace" lets the cell be re-run: the default ("fail") raises as
# soon as the tables exist from an earlier run. NOTE: this drops and recreates
# the three tables below on every run.
# chunksize writes rows in batches rather than all at once, which matters for
# the large lichess frame.
SQL_CHUNK_ROWS = 50000

sql_targets = (
    ("games", games_df),
    ("lichess_games_data", lichess_df),
    ("chess_titles", titles_df),
)
for table_name, frame in sql_targets:
    frame.to_sql(name=table_name, con=engine, if_exists="replace", chunksize=SQL_CHUNK_ROWS)