In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

import scikitplot as skplt

In [81]:
#Veriyi temizle ve filitrele
def clear_data(df, sample_size = 0, number_of_most_played_games = 200):
    #Veriyi Temizle ve Duzenle

    if sample_size > 0:
        df = df.sample(n = sample_size)# Test ederken boyutunu kucult

    df = df[['id', 'opening_name', 'white_rating', 'black_rating', 'winner']]#kullanacagin kolonlari al
    df = df.set_index('id')


    most_played_games = df['opening_name'].value_counts().sort_values().tail(number_of_most_played_games).keys()
    df = df.loc[df['opening_name'].isin(most_played_games)]#Cok az ornegi olan satirlari ele.

    df['opening_name'] = df['opening_name'].str.strip().str.replace(r'(:.\|)|(:.)', '', regex = True)#Acilista onemsiz varyasyonlarini gormezden gel. Regular Expression kullanarak filitreledim
    df['opening_name'] = df['opening_name'].str.wrap(15)#Okunurlugu artir. Yoksa grafiklerde labellar tasiyor.



    #df = df[df['rated'] == True]
    df = df[abs(df['white_rating'] - df['black_rating']) < 100]#Elo'lar arasindaki farkin cok olmasi istenilen bir durum degil

    lowestElo, highestElo = df['white_rating'].min(), df['white_rating'].max() 

    lowestElo = int(lowestElo / 100) * 100
    highestElo = int(highestElo / 100) * 100 + 100

     #Elo araliklari icin yeni kolon olusturdum.
    df['rating_range'] = np.nan

    for elo in range(lowestElo, highestElo, 100):
        df['rating_range'] = np.where(df['white_rating'].between(elo, elo + 100), f'{elo}-{elo + 100}', df['rating_range'])

    #print(df['rating_range'].unique())


    return df

In [82]:
#Label'lara karsilik numerik deger ata
def encode_labels(df):
    labelencoder = LabelEncoder()

    dfc = df.copy()
    dfc['opening_name'] = labelencoder.fit_transform(df['opening_name'])
    dfc['rating_range'] = labelencoder.fit_transform(df['rating_range'])
    dfc['winner'] = labelencoder.fit_transform(df['winner'])



    return dfc


In [83]:
def split_train_and_test(df, test_size = 0.25):#Dorte birini test verisi olarak al
    x = df[['opening_name', 'white_rating', 'black_rating', 'rating_range']]
    y = df['winner']
    return train_test_split(x, y, test_size = test_size, random_state = 42) #Veriyi test ve train olarak ikiye ayir.


In [84]:
def train(model, x_train, x_test, y_train, y_test):
    model = model.fit(x_train, y_train)
    return model.predict(x_test)

In [85]:

#Train ve Validation Hatasi    
def calc_metrics(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)

    predictions = model.predict(x_train)
    train_error = mean_squared_error(y_train, predictions)#Train Hatasi

    predictions = model.predict(x_test)
    validation_error = mean_squared_error(y_test, predictions)#Validation Hatasi
    
    return train_error, validation_error

In [86]:
def get_metrics(y_actual, y_pred, print_confussion_matrix = False):
    #white, black, draw olmak uzere uc cesit label var
    #confusion matrix 3x3 olacak
    labels = np.unique(y_actual)

    cm = confusion_matrix(y_actual, y_pred, labels = labels)
    fp = cm.sum(axis=0) - np.diag(cm)#false positive
    fn = cm.sum(axis=1) - np.diag(cm)#false negative
    tp = np.diag(cm)#true positive
    tn = cm.sum() - (fp + fn + tp)#true negative

    if(print_confussion_matrix):
        print("----Confusion Matrix----\n")
        print(pd.DataFrame(cm, index = labels, columns = labels))
        print('\n')

    acc = (tp + tn) / (tp + fp + tn + fn)#Dogruluk degerini hesapla
    recall = (tp + tn) / (tp + fp + tn + fn)
    specifity = tn / (tn + fp)
    f_score = 2 * tp / (2 * tp + fp + fn)

    return acc, recall, specifity, f_score


In [87]:
df = pd.read_csv("../datasets/games.csv")
df = clear_data(df)
df = encode_labels(df)

In [94]:
def print_as_table(ks, params):
    for i in range(len(ks)):
        k = ks[i]
        param = params[i]
        print(f"k={k}- accuracy={param[0]}, recall={param[1]}, specifity={param[2]}, f_score={param[3]}")

In [77]:
dfc = df.copy()


white_params, black_params, draw_params = [], [], []
ks = []

for k in range(1, 10):#farkli k degerlerine gore olc
    ks.append(k)
    model = KNeighborsClassifier(n_neighbors=k)
    x_train, x_test, y_train, y_test = split_train_and_test(dfc)

    y_pred = train(model, x_train, x_test, y_train, y_test)

    acc, recall, specifity, f_score = get_metrics(y_test, y_pred)
    err = 1 - acc

    black_params.append((acc[0], recall[0], specifity[0], f_score[0]))
    draw_params.append((acc[1], recall[1], specifity[1], f_score[1]))
    white_params.append((acc[2], recall[2], specifity[2], f_score[2]))


In [95]:
print("\nBlacks")
print_as_table(ks, black_params)
print("\nDraws")
print_as_table(ks, draw_params)
print("\nWhites")
print_as_table(ks, white_params)


Blacks
k=1- accuracy=0.5303126994256541, recall=0.5303126994256541, specifity=0.5617848970251716, f_score=0.480225988700565
k=2- accuracy=0.4894703254626675, recall=0.4894703254626675, specifity=0.30892448512585813, f_score=0.5540691192865106
k=3- accuracy=0.5169112954690491, recall=0.5169112954690491, specifity=0.540045766590389, f_score=0.47173761339846476
k=4- accuracy=0.5079770261646458, recall=0.5079770261646458, specifity=0.41876430205949655, f_score=0.5272838749233599
k=5- accuracy=0.5143586470963625, recall=0.5143586470963625, specifity=0.5297482837528604, f_score=0.47408431237042153
k=6- accuracy=0.5067007019783025, recall=0.5067007019783025, specifity=0.43592677345537756, f_score=0.5165728580362726
k=7- accuracy=0.5303126994256541, recall=0.5303126994256541, specifity=0.5320366132723112, f_score=0.4986376021798365
k=8- accuracy=0.5098915124441609, recall=0.5098915124441609, specifity=0.4405034324942792, f_score=0.518796992481203
k=9- accuracy=0.5277600510529674, recall=0.527