In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

In [4]:
full_data = pd.read_csv("data/chess_games.csv")
print('Column Values: ', full_data.columns.values)

Column Values:  ['game_id' 'rated' 'turns' 'victory_status' 'winner' 'time_increment'
 'white_id' 'white_rating' 'black_id' 'black_rating' 'moves'
 'opening_code' 'opening_moves' 'opening_fullname' 'opening_shortname'
 'opening_response' 'opening_variation']


In [5]:
full_data.shape

(20058, 17)

In [9]:
da = pd.read_csv("data/traindata.csv")

In [10]:
da.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,y
0,8.71,1.13,2.05,2.95,5.93,5.1,77.55
1,9.0,6.8,7.78,2.44,9.81,1.0,100.91
2,7.59,7.91,3.09,8.06,9.14,5.3,137.18
3,2.26,5.19,6.86,7.55,6.68,3.82,110.85
4,6.3,4.38,6.99,9.77,4.72,8.85,152.37


In [6]:
useful_data = full_data[["rated", "turns", "white_rating", "black_rating", "opening_moves", "winner"]]
useful_data = useful_data.replace('White', 0)
useful_data = useful_data.replace('Black', 1)
useful_data = useful_data.replace('Draw', 0.5)
useful_data = useful_data.astype(int)
useful_data.head()

Unnamed: 0,rated,turns,white_rating,black_rating,opening_moves,winner
0,0,13,1500,1191,5,0
1,1,16,1322,1261,4,1
2,1,61,1496,1500,3,0
3,1,61,1439,1454,3,0
4,1,95,1523,1469,5,0


In [7]:
len(useful_data)

20058

In [8]:
pd.DataFrame(useful_data).to_csv("data/chess_useful_data.csv")

In [58]:
useful_data.shape

(20058, 6)

In [13]:
def divide_data_random(data):
    # print(len(data))
    group_1 = round((20/100)* len(data)) -1
    group_2 = round((50/100) * len(data)) -1
    # print(group_1)
    # print(group_2)
    # group_3 = round((30/100) * len(data)) -1
    g1 = data.iloc[:group_1]
    g2 = data.iloc[group_1:group_1+group_2]
    g3 = data.iloc[group_1+group_2:]
    return g1,g2,g3


In [15]:
Y = useful_data['winner']
X = useful_data.drop(['winner'],axis=1)
# divide data in tljo test and train, one centralized set for the test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
  test_size=0.34, random_state=10)

In [57]:
len(X_train)

13238

In [16]:
c1_x, c2_x, c3_x = divide_data_random(X_train)
c1_l, c2_l, c3_l = divide_data_random(Y_train)

In [49]:
def find_best_max_depth(x,y, x_test, y_test):
    best_acc = 0
    best_i = 0
    for i in range(1,201,5):
        clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth=i)
        clf = clf.fit(x,y)
        y_pred = clf.predict(x_test)
        if accuracy_score(y_test, y_pred) > best_acc:
            best_acc = accuracy_score(y_test, y_pred)
            best_i = i
    return best_i

In [51]:
def find_best_min_sample_leaf(x,y, x_test, y_test, max_depth):
    best_acc = 0
    best_i = 0
    for i in range(1,201,5):
        clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth=max_depth, min_samples_leaf=i)
        clf = clf.fit(x,y)
        y_pred = clf.predict(x_test)
        if accuracy_score(y_test, y_pred) > best_acc:
            best_acc = accuracy_score(y_test, y_pred)
            best_i = i
    return best_i

In [None]:
def make_models(X, y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.34, random_state=10)
    c1_x, c2_x, c3_x = divide_data_random(X_train)
    c1_l, c2_l, c3_l = divide_data_random(Y_train)
    b_d_central = find_best_max_depth(X_train, Y_train, X_test,Y_test)
    b_d_1 = find_best_max_depth(c1_x, c1_l, X_test,Y_test)
    b_d_2 = find_best_max_depth(c2_x, c2_l, X_test,Y_test)
    b_d_3 = find_best_max_depth(c3_x, c3_l, X_test,Y_test)
    b_m_central = find_best_min_sample_leaf(X_train, Y_train,X_test,Y_test,b_d_central)
    b_m_1 = find_best_min_sample_leaf(c1_x, c1_l,X_test,Y_test,b_d_1)
    b_m_2 = find_best_min_sample_leaf(c2_x, c2_l,X_test,Y_test,b_d_2)
    b_m_3 = find_best_min_sample_leaf(c3_x, c3_l,X_test,Y_test,b_d_3)
    clf_central = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth=b_d_central, min_samples_leaf=b_m_central)
    clf_1 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_1, min_samples_leaf=b_m_1)
    clf_2 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_2, min_samples_leaf=b_m_2)
    clf_3 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_3, min_samples_leaf=b_m_3)
    clf_central = clf_central.fit(X_train, Y_train)
    clf_1 = clf_1.fit(c1_x, c1_l)
    clf_2 = clf_2.fit(c2_x, c2_l)
    clf_3 = clf_3.fit(c3_x, c3_l)
    return [clf_central,clf_1,clf_2,clf_3]

In [52]:
b_d_central = find_best_max_depth(X_train, Y_train, X_test,Y_test)
b_d_1 = find_best_max_depth(c1_x, c1_l, X_test,Y_test)
b_d_2 = find_best_max_depth(c2_x, c2_l, X_test,Y_test)
b_d_3 = find_best_max_depth(c3_x, c3_l, X_test,Y_test)
b_m_central = find_best_min_sample_leaf(X_train, Y_train,X_test,Y_test,b_d_central)
b_m_1 = find_best_min_sample_leaf(c1_x, c1_l,X_test,Y_test,b_d_1)
b_m_2 = find_best_min_sample_leaf(c2_x, c2_l,X_test,Y_test,b_d_2)
b_m_3 = find_best_min_sample_leaf(c3_x, c3_l,X_test,Y_test,b_d_3)
clf_central = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth=b_d_central, min_samples_leaf=b_m_central)
clf_1 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_1, min_samples_leaf=b_m_1)
clf_2 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_2, min_samples_leaf=b_m_2)
clf_3 = tree.DecisionTreeClassifier(criterion = 'entropy',max_depth=b_d_3, min_samples_leaf=b_m_3)

141
0.61158357771261
6
0.582258064516129
156
0.6233137829912023
11
0.5828445747800587


In [53]:
clf_central = clf_central.fit(X_train, Y_train)
clf_1 = clf_1.fit(c1_x, c1_l)
clf_2 = clf_2.fit(c2_x, c2_l)
clf_3 = clf_3.fit(c3_x, c3_l)

In [None]:
def predict_and_perfromance(models, X_test):
    Yp_central = models[0].predict(X_test)
    Yp_1 = models[1].predict(X_test)
    Yp_2 = models[2].predict(X_test)
    Yp_3 = models[3].predict(X_test)

In [54]:
Yp_central = clf_central.predict(X_test)
Yp_1 = clf_1.predict(X_test)
Yp_2 = clf_2.predict(X_test)
Yp_3 = clf_3.predict(X_test)

In [55]:
acc_central = accuracy_score(Y_test, Yp_central)
acc_1 = accuracy_score(Y_test, Yp_1)
acc_2 = accuracy_score(Y_test, Yp_2)
acc_3 = accuracy_score(Y_test, Yp_3)

In [56]:
print("central acc", acc_central)
print("model 1 acc", acc_1)
print("model 2 acc", acc_2)
print("model 3 acc", acc_3)

central acc 0.6398826979472141
model 1 acc 0.5907624633431086
model 2 acc 0.6318181818181818
model 3 acc 0.6156891495601173
