In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from config import db_password
from sqlalchemy import create_engine
import psycopg2

In [2]:
# Build the SQLAlchemy engine for the local lichess_data Postgres database
# and open the connection that the table reads below (and later cells) use.
# NOTE(review): db_password is interpolated into the URL verbatim; a password
# containing URL-reserved characters (e.g. "@" or "/") would presumably need
# URL-encoding first - confirm against the value stored in config.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/lichess_data"
engine = create_engine(db_string)
conn = engine.connect()

# Load the three source tables into DataFrames, in the order listed.
chess_df, white_users_df, black_users_df = (
    pd.read_sql_table(table_name, conn)
    for table_name in ("chess_data", "white_users", "black_users")
)

### I. Modify chess_data table to only include rows with players of the same title

In [3]:
# Read the chess_titles table indexed by ELO_rating and keep only its
# "title" column as a plain {ELO_rating: title} dictionary.
titles_df = pd.read_sql_table("chess_titles", conn, index_col="ELO_rating")
titles_dict = titles_df["title"].to_dict()

In [4]:
# Assemble the players frame: both rating columns plus a copy of each,
# which a later cell converts from rating numbers to title names.
rating_columns = ["white_rating", "black_rating"]
players_df = chess_df[rating_columns].assign(
    white_title=chess_df["white_rating"],
    black_title=chess_df["black_rating"],
)
players_df.head()

Unnamed: 0,white_rating,black_rating,white_title,black_title
0,1500,1191,1500,1191
1,1322,1261,1322,1261
2,1496,1500,1496,1500
3,1439,1454,1439,1454
4,1523,1469,1523,1469


In [5]:
# Translate the rating numbers held in both title columns into title names
# in a single replace pass; each column is looked up in titles_dict, and
# values that are not keys of titles_dict are left as they are.
title_lookup = {"white_title": titles_dict, "black_title": titles_dict}
players_df = players_df.replace(title_lookup)
players_df.head()

Unnamed: 0,white_rating,black_rating,white_title,black_title
0,1500,1191,Amateur (Class C),Novice
1,1322,1261,Amateur (Class D),Amateur (Class D)
2,1496,1500,Amateur (Class C),Amateur (Class C)
3,1439,1454,Amateur (Class C),Amateur (Class C)
4,1523,1469,Amateur (Class C),Amateur (Class C)


In [6]:
# Simplify the white player's title: split at the first space only, so the
# leading word (e.g. "Amateur") lands in column 0 and any remainder
# (e.g. "(Class C)") in column 1. The split result is already a DataFrame,
# so no empty placeholder frame is created beforehand.
white_title = players_df["white_title"].str.split(" ", n=1, expand=True)
white_title.head()

Unnamed: 0,0,1
0,Amateur,(Class C)
1,Amateur,(Class D)
2,Amateur,(Class C)
3,Amateur,(Class C)
4,Amateur,(Class C)


In [7]:
# Simplify the black player's title: split at the first space only, so the
# leading word (e.g. "Novice") lands in column 0 and any remainder in
# column 1. The split result is already a DataFrame, so no empty
# placeholder frame is created beforehand.
black_title = players_df["black_title"].str.split(" ", n=1, expand=True)
black_title.head()

Unnamed: 0,0,1
0,Novice,
1,Amateur,(Class D)
2,Amateur,(Class C)
3,Amateur,(Class C)
4,Amateur,(Class C)


In [8]:
# Attach the leading word of each player's title to the games table;
# the values are aligned on the row index shared with chess_df.
chess_df = chess_df.assign(
    player_titles=white_title[0],
    player_titles2=black_title[0],
)

In [9]:
# Keep only the games in which both players carry the same simplified title.
same_title = chess_df["player_titles"].eq(chess_df["player_titles2"])
chess_df = chess_df[same_title]

# Number of games remaining after the filter
len(chess_df)

858276

In [10]:
# After the filter both title columns hold the same value on every row,
# so the second one is redundant and is removed.
chess_df = chess_df.drop(columns=["player_titles2"])
chess_df.head()

Unnamed: 0,created_at,turns,winner,white_id,white_rating,black_id,black_rating,rating_difference,opening_eco,opening_name,moves,player_titles
1,2017-08-30 21:53:20,16,black,a-00,1322,skinnerua,1261,61,B00,King's Pawn Opening,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,Amateur
2,2017-08-30 21:53:20,61,white,ischia,1496,a-00,1500,-4,C20,King's Pawn Game,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,Amateur
3,2017-08-30 16:20:00,61,white,daniamurashov,1439,adivanov2009,1454,-15,D02,Queen's Pawn Game,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,Amateur
4,2017-08-29 18:06:40,95,white,nik221107,1523,adivanov2009,1469,54,C41,Philidor's Defense,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,Amateur
6,2017-09-01 01:40:00,33,white,capa_jr,1520,daniel_likes_chess,1423,97,D00,Queen's Pawn Game,d4 d5 e4 dxe4 Nc3 Nf6 f3 exf3 Nxf3 Nc6 Bb5 a6 ...,Amateur


In [11]:
# Write the filtered games table to disk for use in Tableau.
# to_csv is called with its defaults, so the row index is written as well.
chess_tableau_path = "../datasets/tableau/chess_data_tableau.csv"
chess_df.to_csv(chess_tableau_path)

### II. Merge user tables to get master users table

In [12]:
# Give both user tables the same column names so they can be stacked.
white_users_df = white_users_df.rename(
    columns={"white_id": "player_id", "white_rating": "player_rating"}
)
black_users_df = black_users_df.rename(
    columns={"black_id": "player_id", "black_rating": "player_rating"}
)

# Stack the two tables into one. ignore_index=True renumbers the rows
# 0..n-1; without it each table keeps its own 0-based labels, so the
# combined frame would carry duplicate index labels (ambiguous for any
# label-based lookup, and written out as-is when the frame is exported).
users_df = pd.concat([white_users_df, black_users_df], ignore_index=True)
users_df.head()

Unnamed: 0,player_id,player_rating,created_at
0,--jim--,986,2014-07-17 12:01:19
1,-adam-,1767,2016-07-27 08:34:48
2,-ArtanS-,1870,2016-07-31 10:07:13
3,-chessnoob-,1720,2016-07-31 17:46:30
4,-johnnyC-,1329,2016-07-29 19:04:28


In [13]:
# Order rows from oldest to newest created_at so that, for a repeated
# player_id, the most recent row comes last.
users_df = users_df.sort_values("created_at")

In [14]:
# Row count before de-duplication
users_df.shape[0]

188743

In [15]:
# De-duplicate on player_id, retaining the last occurrence of each id,
# which is the most recent row because the frame is sorted by created_at.
users_df = users_df.drop_duplicates(subset=["player_id"], keep="last")

In [16]:
# Row count after de-duplication
users_df.shape[0]

111508

In [17]:
# Seed a player_title column with a copy of player_rating; the numbers are
# swapped for title names in the following cell.
users_df = users_df.assign(player_title=users_df["player_rating"])

In [18]:
# Look up each value of player_title in titles_dict and substitute the
# matching title name; only the player_title column is affected.
user_title_lookup = {"player_title": titles_dict}
users_df = users_df.replace(user_title_lookup)
users_df.head()

Unnamed: 0,player_id,player_rating,created_at,player_title
43041,julito,899,2013-08-17 20:33:53,Novice
23640,drunkninja,1219,2013-08-17 20:37:48,Amateur (Class D)
32905,goldenfork,1381,2013-08-19 16:38:07,Amateur (Class D)
9337,bardamu31,800,2013-08-19 17:23:45,Novice
29309,flyfish,1183,2013-08-19 20:56:29,Novice


In [19]:
# Write the de-duplicated users table to disk for use in Tableau.
# to_csv is called with its defaults, so the row index is written as well.
users_tableau_path = "../datasets/tableau/chess_users_tableau.csv"
users_df.to_csv(users_tableau_path)