In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

In [156]:
# Clean and filter the raw games table.
def clear_data(df, sample_size = 0, number_of_most_played_games = 200):
    """Select, filter and bucket the games table.

    Steps, in order:
      1. If ``sample_size`` is positive, draw that many random rows (handy for
         quick experiments on a smaller frame).
      2. Keep only the columns ``id``, ``opening_name``, ``white_rating``,
         ``black_rating`` and ``winner``, and use ``id`` as the index.
      3. Keep only rows whose opening name is among the
         ``number_of_most_played_games`` most frequent ones (drops openings
         with very few examples). This happens *before* the names are cleaned.
      4. Strip the opening names, apply the regex clean-up, and wrap them at
         15 characters so plot labels do not overflow.
      5. Keep only games where the two ratings differ by less than 100.
      6. Add ``rating_range``: the 100-point bucket of ``white_rating`` written
         as ``"<low>-<low + 100>"`` (a rating of exactly 1500 lands in
         ``"1500-1600"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw games; must contain the five columns listed in step 2.
    sample_size : int, default 0
        Number of rows to sample first; ``0`` (or less) keeps every row.
    number_of_most_played_games : int, default 200
        How many of the most frequent opening names to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``id``; the caller's frame is not modified.
        An input that ends up with no rows yields an empty frame that still
        has the ``rating_range`` column.
    """
    if sample_size > 0:
        df = df.sample(n = sample_size)  # shrink the data while testing

    df = df[['id', 'opening_name', 'white_rating', 'black_rating', 'winner']].set_index('id')

    most_played_games = df['opening_name'].value_counts().sort_values().tail(number_of_most_played_games).index
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the previous one (avoids SettingWithCopyWarning).
    df = df.loc[df['opening_name'].isin(most_played_games)].copy()

    # NOTE(review): the pattern removes a colon plus the single character after
    # it (and a '|' directly following that, if present). The original comment
    # says the goal is to ignore minor opening variations -- confirm that this
    # pattern does what was intended.
    df['opening_name'] = df['opening_name'].str.strip().str.replace(r'(:.\|)|(:.)', '', regex = True)
    df['opening_name'] = df['opening_name'].str.wrap(15)  # keep plot labels readable

    # A large rating gap between the players is not wanted in this analysis.
    close_match = (df['white_rating'] - df['black_rating']).abs() < 100
    df = df.loc[close_match].copy()

    # Floor each white rating to its hundred. Rows with a missing rating cannot
    # reach this point: their rating gap is NaN, which fails the "< 100" test.
    bucket_low = (df['white_rating'] // 100 * 100).astype(int)
    df['rating_range'] = [f'{low}-{low + 100}' for low in bucket_low]

    return df

In [157]:
# Replace each categorical label with a numeric code.
def encode_labels(df):
    """Return a copy of ``df`` with its label columns integer-encoded.

    The columns ``opening_name``, ``rating_range`` and ``winner`` are each
    passed through ``LabelEncoder.fit_transform``. A single encoder instance
    is reused and re-fitted for every column, so the fitted mappings are not
    kept or returned. The input frame is left untouched.
    """
    encoder = LabelEncoder()
    encoded = df.copy()

    for column in ('opening_name', 'rating_range', 'winner'):
        # Read from the original frame, write into the copy.
        encoded[column] = encoder.fit_transform(df[column])

    return encoded


In [158]:
def split_train_and_test(df, test_size = 0.25):
    """Split the frame into train and test parts.

    Features are the columns ``opening_name``, ``white_rating``,
    ``black_rating`` and ``rating_range``; the target is ``winner``.
    ``test_size`` is forwarded to ``train_test_split`` (the default holds out
    a quarter of the rows). ``random_state`` is fixed at 42.

    Returns whatever ``train_test_split`` returns for ``(features, target)``;
    callers unpack it as ``x_train, x_test, y_train, y_test``.
    """
    feature_columns = ['opening_name', 'white_rating', 'black_rating', 'rating_range']
    features = df[feature_columns]
    target = df['winner']
    return train_test_split(features, target, test_size = test_size, random_state = 42)


In [159]:
def train(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and return its predictions for ``x_test``.

    ``predict`` is called on the object returned by ``model.fit``.
    ``y_test`` is accepted but not used.
    """
    return model.fit(x_train, y_train).predict(x_test)

In [160]:

# Train and validation error
def calc_metrics(x_train, y_train, x_test, y_test, model):
    """Fit ``model`` and return ``(train_error, validation_error)``.

    Each error is the misclassification rate: the fraction of samples whose
    predicted label differs from the true label, a value in ``[0, 1]``.

    The targets are class labels, so a squared-error metric is not used here.
    Squared error on label codes would depend on the arbitrary numeric code of
    each class (confusing code 0 with code 2 would count four times as much as
    confusing code 0 with code 1).

    Parameters
    ----------
    x_train, y_train : training features and labels; the model is (re)fitted
        on these.
    x_test, y_test : held-out features and labels used for the validation
        error.
    model : object exposing ``fit(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple[float, float]
        ``(train_error, validation_error)``.
    """
    model.fit(x_train, y_train)

    def misclassification_rate(features, expected):
        predicted = np.asarray(model.predict(features))
        return float(np.mean(predicted != np.asarray(expected)))

    train_error = misclassification_rate(x_train, y_train)  # error on seen data
    validation_error = misclassification_rate(x_test, y_test)  # error on held-out data

    return train_error, validation_error

In [161]:
def get_accuracy(y_actual, y_pred, print_confussion_matrix = False):
    """Return the one-vs-rest accuracy of every class as an array.

    A confusion matrix is built over the sorted set of labels that occur in
    ``y_actual`` or ``y_pred``. Using both inputs matters: if the label set
    came from ``y_actual`` alone, a prediction of a class that never occurs in
    ``y_actual`` would fall outside the matrix and that sample would not be
    counted as an error. When every predicted label also occurs in
    ``y_actual`` the label set is simply the unique actual labels.

    For each class, ``(tp + tn) / (tp + fp + tn + fn)`` is computed, where the
    class is treated as "positive" and all other classes as "negative".

    Parameters
    ----------
    y_actual : array-like of true labels.
    y_pred : array-like of predicted labels, same length as ``y_actual``.
    print_confussion_matrix : bool, default False
        When true, print the confusion matrix as a labelled table.

    Returns
    -------
    numpy.ndarray
        One accuracy per label, in sorted label order. The notebook's data has
        three outcomes (white, black, draw), giving three entries.
    """
    labels = np.union1d(y_actual, y_pred)

    cm = confusion_matrix(y_actual, y_pred, labels = labels)
    tp = np.diag(cm)  # true positives: diagonal
    fp = cm.sum(axis=0) - tp  # false positives: rest of each column
    fn = cm.sum(axis=1) - tp  # false negatives: rest of each row
    tn = cm.sum() - (fp + fn + tp)  # true negatives: everything else

    if print_confussion_matrix:
        print("----Confusion Matrix----\n")
        print(pd.DataFrame(cm, index = labels, columns = labels))
        print('\n')

    return (tp + tn) / (tp + fp + tn + fn)


In [162]:
def _plot_by_k(frame, title):
    """Draw one line per column of ``frame`` against its index and show the figure."""
    frame.plot()
    plt.title(title)
    plt.show()


def train_and_graphs(df, max_k = 50):
    """Evaluate a k-nearest-neighbours classifier for k = 1 .. max_k - 1 and plot the results.

    For every k the classifier is trained through ``train``, per-class
    accuracies are taken from ``get_accuracy`` and the train/validation errors
    from ``calc_metrics``. The function then prints the k with the highest
    accuracy for each class and shows three figures: accuracy per class,
    error (1 - accuracy) per class, and train vs. validation error.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded games; must contain the columns ``split_train_and_test`` reads.
    max_k : int, default 50
        Exclusive upper bound of the k values tried.

    Returns
    -------
    None
    """
    # The split uses a fixed random_state and does not depend on k, so it is
    # computed once instead of once per iteration.
    x_train, x_test, y_train, y_test = split_train_and_test(df)

    ks = []
    white_winning_accuracies, black_winning_accuracies, draw_accuracies = [], [], []
    train_errors, validation_errors = [], []

    for k in range(1, max_k):  # measure for different k values
        ks.append(k)
        model = KNeighborsClassifier(n_neighbors=k)

        y_pred = train(model, x_train, x_test, y_train, y_test)
        acc = get_accuracy(y_test, y_pred)

        # Positions 0/1/2 are reported as black/draw/white -- presumably the
        # sorted order of the encoded winner labels; verify against the encoder.
        black_winning_accuracies.append(acc[0])
        draw_accuracies.append(acc[1])
        white_winning_accuracies.append(acc[2])

        # calc_metrics fits the model again on the same training data.
        train_error, validation_error = calc_metrics(x_train, y_train, x_test, y_test, model)
        train_errors.append(train_error)
        validation_errors.append(validation_error)

    acc_df = pd.DataFrame({'k': ks, 'black': black_winning_accuracies, 'draw' : draw_accuracies, 'white' : white_winning_accuracies})
    acc_df = acc_df.set_index(['k'])

    best_ks = acc_df.idxmax(axis = 0)  # best k per class

    print("En K Degerleri")
    print(best_ks)

    _plot_by_k(acc_df, "K Degerine Gore Basari Grafigi")  # accuracy per k

    # Per-class error is the complement of the per-class accuracy.
    err_df = 1 - acc_df
    _plot_by_k(err_df, "K Degerine Gore Kayip Grafigi")  # error per k

    tv_df = pd.DataFrame({'k': ks, 'Validation Errors': validation_errors, 'Train Errors': train_errors})
    tv_df = tv_df.set_index(['k'])
    _plot_by_k(tv_df, "Train ve Validation Hata Grafigi")  # train vs. validation error

In [172]:
# Load the raw games, clean/filter them, then integer-encode the label columns.
df = (
    pd.read_csv("../datasets/games.csv")
    .pipe(clear_data)
    .pipe(encode_labels)
)


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/games.csv'

In [170]:
# Run the k sweep on the encoded games frame and show the resulting plots.
train_and_graphs(df)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/games.csv'