In [5]:
#generate ratings for user by tag
#also run bootstrap analysis proving structure in tag ratings

import numpy as np
import pandas as pd
import os
import csv
import math
import matplotlib.pyplot as plt
import random
random.seed(100)

In [6]:
# Per-attempt log; the cells below read its user_hash, tactics_problem_id and is_passed columns.
chess_data_path = "/w/225/1/chess/tactics/glicko_user_tactics_problem.csv_00"
# Per-problem tag table; its columns from position 4 onward are used as the tag names.
problem_data_path = "/w/225/1/chess/tactics/tag_data_updated.csv"

In [7]:
class Problem:
    """A tactics problem: its id, its tag fields and a running Elo rating.

    `line` is one row of the problem tag table. Everything from position 4
    onward is kept as the problem's tag fields.
    """

    def __init__(self, line):
        self.problem_id = line["tactics_problem_id"]
        self.tags = line[4:]  # tag fields, sliced off the end of the row
        # Every problem starts at 1720 with no attempts counted yet.
        self.rating, self.n = 1720, 0


In [8]:
# Lookup tables keyed by id:
#   problemDict: tactics_problem_id -> Problem
#   playerDict:  user_hash -> Player
problemDict = {}
playerDict = {}

In [9]:
# Read the problem tag table and build one Problem per row, stored in
# problemDict under its tactics_problem_id. Each Problem carries its tag
# fields plus a rating and an attempt counter.
f = pd.read_csv(problem_data_path)

# Tag names are the column headers from position 4 onward.
tag_list = f.columns.tolist()[4:]
most_common_tags = [
    "Back Rank", "Basic Checkmates", "Decoy / Deflection", "Defense",
    "Discovered Attack", "Mate in 1", "Zwischenzug", "Mate in 2",
    "Endgame Tactics", "Double", "Trapped Piece", "Simplification",
    "Fork / Double Attack", "Hanging Piece", "Mate in 3+", "Mating Net",
    "Pin", "Remove the Defender", "Sacrifice", "Vulnerable King",
]

# Positions (within tag_list) of the tags that get their own rating.
indices = list(map(tag_list.index, most_common_tags))

print(tag_list)
print(indices)

for _, problem_row in f.iterrows():
    problemDict[problem_row["tactics_problem_id"]] = Problem(problem_row)


['Attacking f7/f2', 'Attacking Castled King', 'Back Rank', 'Basic Checkmates', 'Exchange Sacrifice', 'Mate in 3+', 'Mating Net', 'Queen Sacrifice', 'Sacrifice', 'Vulnerable King', 'Clearance Sacrifice', 'Decoy / Deflection', 'Mate in 2', 'Discovered Attack', 'Fork / Double Attack', 'Hanging Piece', 'Double Check', 'Pin', 'Smothered Mate', 'Remove the Defender', 'Interference', 'Mate in 1', 'NULL', 'Trapped Piece', 'Pawn Promotion', 'Skewer', 'X-Ray Attack', 'Overloading', 'Simplification', 'Defense', 'Zwischenzug', 'Endgame Tactics', 'Stalemate', 'Perpetual Check', 'Desperado', 'Zugzwang', 'En passant', 'Windmill', 'Underpromotion', 'Double', 'Opposition']
[2, 3, 11, 29, 13, 21, 30, 12, 31, 39, 23, 28, 14, 15, 5, 6, 17, 19, 8, 9]


In [10]:
class Player:
    """A user's overall Elo rating plus one rating and counter per tag.

    `elo_tags` and `n_tags` have one slot per entry of the module-level
    `tag_list`, addressed by tag position.
    """

    def __init__(self):
        tag_count = len(tag_list)
        # Overall rating and number of attempts processed.
        self.rating = 1720
        self.n = 0
        # Per-tag rating (starting at 1720) and per-tag update count.
        self.elo_tags = [1720 for _ in range(tag_count)]
        self.n_tags = [0 for _ in range(tag_count)]

In [11]:
def calculateElo(rating1, rating2, s1, k=32):
    """Return the updated Elo rating of side 1 after one game against side 2.

    Parameters:
        rating1: current rating of the side being updated.
        rating2: current rating of the opponent.
        s1: actual score of side 1 (callers in this notebook pass 1 or 0).
        k: K-factor scaling the size of the update; defaults to 32.

    Returns:
        rating1 + k * (s1 - expected score of side 1).
    """
    r1 = 10 ** (rating1 / 400)
    r2 = 10 ** (rating2 / 400)
    # Expected score of side 1 under the logistic Elo model.
    e1 = r1 / (r1 + r2)
    return rating1 + k * (s1 - e1)

In [99]:
#iterate through each row of chess data
#for every line, iterate through tags and update ELO for each tag/user
#then update elo for user/problem
#don't forget to update n as well
# Load the per-attempt log. colnames holds its column names; it is not read
# by any other cell visible in this notebook.
f = pd.read_csv(chess_data_path)
colnames = f.columns.tolist()


In [None]:
# Number of problems loaded into problemDict.
len(problemDict)

54806

In [None]:
# Replay every attempt in file order. For each attempt:
#   1. pick ONE of the problem's tracked tags at random and update the user's
#      Elo for that tag against the problem's current rating;
#   2. update the user's overall Elo and the problem's Elo against each other.
# Attempts on problems missing from problemDict are reported once and skipped.
problems_that_dont_exist = set()
# Loop-invariant: the tag positions that get their own rating.
tracked_tag_positions = set(indices)

for index, row in f.iterrows():
    if index % 100000 == 0:
        print(index)
    user_id = row["user_hash"]
    problem_id = row["tactics_problem_id"]
    if user_id not in playerDict:
        playerDict[user_id] = Player()
    if problem_id not in problemDict:
        if problem_id not in problems_that_dont_exist:
            print("error, problem", problem_id, "doesn't exist?")
            problems_that_dont_exist.add(problem_id)
        continue

    user = playerDict[user_id]
    problem = problemDict[problem_id]
    passed = int(row["is_passed"])

    # Collect the positions of the truthy tag fields. Iterating by position
    # avoids integer-key lookups on a label-indexed Series, which pandas has
    # deprecated.
    flagged_positions = [j for j, flag in enumerate(problem.tags) if flag]

    tags_to_consider = list(set(flagged_positions) & tracked_tag_positions)
    if len(tags_to_consider) > 0:  # select a random tag to update for each problem
        i = random.choice(tags_to_consider)
        user.elo_tags[i] = calculateElo(user.elo_tags[i], problem.rating, passed)
        user.n_tags[i] += 1

    # Compute both new ratings from the pre-update values before assigning.
    newRatingUser = calculateElo(user.rating, problem.rating, passed)
    newRatingProblem = calculateElo(problem.rating, user.rating, int(not passed))
    user.rating = newRatingUser
    problem.rating = newRatingProblem
    user.n += 1
    problem.n += 1
            
            

0
error, problem 25934 doesn't exist?
error, problem 30206 doesn't exist?
100000
error, problem 29379 doesn't exist?
200000
error, problem 27077 doesn't exist?
error, problem 30114 doesn't exist?
error, problem 27296 doesn't exist?
error, problem 29150 doesn't exist?
error, problem 30604 doesn't exist?
error, problem 32612 doesn't exist?
300000
error, problem 35483 doesn't exist?
error, problem 35521 doesn't exist?
400000
error, problem 29993 doesn't exist?
error, problem 34949 doesn't exist?
error, problem 31266 doesn't exist?
error, problem 33171 doesn't exist?
500000
error, problem 36649 doesn't exist?
error, problem 32281 doesn't exist?
error, problem 26831 doesn't exist?
600000
error, problem 33505 doesn't exist?
700000
error, problem 32070 doesn't exist?
error, problem 36192 doesn't exist?
error, problem 36222 doesn't exist?
error, problem 36320 doesn't exist?
800000
error, problem 31511 doesn't exist?
900000
error, problem 34072 doesn't exist?
error, problem 34074 doesn't exist?

In [102]:
# Header row for the per-user CSV: id, attempt count, overall Elo, then one
# Elo column per tag followed by one "<tag>_n" update-count column per tag.
row1 = ["user_hash", "n", "elo"] + tag_list + [tag + "_n" for tag in tag_list]

In [103]:
# Write one row per player: id, attempt count, overall Elo, per-tag Elo values
# and per-tag update counts (column order matches row1).
# The context manager guarantees the file is flushed and closed; previously
# the handle was rebound to the csv writer and never closed.
with open("user_elo_for_each_tag_randomized.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(row1)
    for player, p in playerDict.items():
        writer.writerow([player, p.n, p.rating] + p.elo_tags + p.n_tags)

    

In [107]:
# One [problem id, final rating] pair per problem, in problemDict order.
problem_info = [[prob.problem_id, prob.rating] for prob in problemDict.values()]

In [110]:
# Persist the final problem ratings. The handle is deliberately not named `f`,
# so the attempt-log DataFrame bound to `f` survives this cell.
with open("problem_final_ratings.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["tactics_problem_id", "problemRating"])
    writer.writerows(problem_info)



In [12]:
# Attempt log used for the bootstrap analysis (restricted to top users, going
# by the filename). The bootstrap loop below reads user_hash,
# tactics_problem_id, problemRating and is_passed from it.
updated_problem_path = "chess_data_top_users.csv"

# NOTE: this rebinds `f`, replacing the attempt log loaded earlier.
f = pd.read_csv(updated_problem_path)
colnames = f.columns.tolist()


In [13]:
#for each bootstrap, each problem needs to have random tag
#each user needs to keep track of their real rating for each tag as well as a bunch of bootstrap ratings
#iterate through problem class assigning main tag and 1000 random tags
#then iterate through each attempt and update user tag elo for each bootstrap
#run through every problem and get list of tags
#save list of tags in a problem object, accessible by id through problemDict
#also keep rating, n


problemDict2 = {}                # tactics_problem_id -> Problem2
problemsWithoutTags = set()      # ids of problems with none of the tracked tags
numBootstraps = 1000             # number of randomised tag assignments per problem


class Problem2:
    """Tag assignment of one problem for the real run and every bootstrap run.

    `tags` is empty when the problem has none of the tracked tags (the id is
    then recorded in `problemsWithoutTags`). Otherwise `tags[0]` is one of the
    problem's own tracked tag positions, chosen at random, and `tags[1:]`
    holds `numBootstraps` positions drawn at random from all tracked tags,
    regardless of which tags the problem really has.
    """

    def __init__(self, line):
        # Positions of the truthy tag fields. Iterating by position avoids
        # integer-key lookups on a label-indexed Series, which pandas has
        # deprecated.
        flagged_positions = [j for j, flag in enumerate(line[4:]) if flag]
        tags_to_consider = list(set(flagged_positions) & set(indices))
        if len(tags_to_consider) == 0:
            self.tags = []
            problemsWithoutTags.add(line["tactics_problem_id"])
        else:
            self.tags = [random.choice(tags_to_consider)] + random.choices(indices, k=numBootstraps)



# Re-read the problem tag table and build one Problem2 per row.
f2 = pd.read_csv(problem_data_path)

# Tag names are the column headers from position 4 onward.
tag_list = f2.columns.tolist()[4:]
most_common_tags = [
    "Back Rank", "Basic Checkmates", "Decoy / Deflection", "Defense",
    "Discovered Attack", "Mate in 1", "Zwischenzug", "Mate in 2",
    "Endgame Tactics", "Double", "Trapped Piece", "Simplification",
    "Fork / Double Attack", "Hanging Piece", "Mate in 3+", "Mating Net",
    "Pin", "Remove the Defender", "Sacrifice", "Vulnerable King",
]
# Positions (within tag_list) of the tracked tags.
indices = list(map(tag_list.index, most_common_tags))

print(tag_list)
print(indices)

for _, problem_row in f2.iterrows():
    problemDict2[problem_row["tactics_problem_id"]] = Problem2(problem_row)



['Attacking f7/f2', 'Attacking Castled King', 'Back Rank', 'Basic Checkmates', 'Exchange Sacrifice', 'Mate in 3+', 'Mating Net', 'Queen Sacrifice', 'Sacrifice', 'Vulnerable King', 'Clearance Sacrifice', 'Decoy / Deflection', 'Mate in 2', 'Discovered Attack', 'Fork / Double Attack', 'Hanging Piece', 'Double Check', 'Pin', 'Smothered Mate', 'Remove the Defender', 'Interference', 'Mate in 1', 'NULL', 'Trapped Piece', 'Pawn Promotion', 'Skewer', 'X-Ray Attack', 'Overloading', 'Simplification', 'Defense', 'Zwischenzug', 'Endgame Tactics', 'Stalemate', 'Perpetual Check', 'Desperado', 'Zugzwang', 'En passant', 'Windmill', 'Underpromotion', 'Double', 'Opposition']
[2, 3, 11, 29, 13, 21, 30, 12, 31, 39, 23, 28, 14, 15, 5, 6, 17, 19, 8, 9]


In [122]:
class Player2:
    """Per-bootstrap, per-tag Elo ratings and update counts for one user.

    Parameters:
        num_tags: number of tag slots per bootstrap; defaults to
            len(tag_list).
        num_bootstraps: number of bootstrap rows; defaults to numBootstraps.

    Each bootstrap gets its own independent list. Building the table with
    `[[...]] * numBootstraps` would repeat a reference to one inner list, so
    an update to any bootstrap would show up in all of them.
    """

    def __init__(self, num_tags=None, num_bootstraps=None):
        if num_tags is None:
            num_tags = len(tag_list)
        if num_bootstraps is None:
            num_bootstraps = numBootstraps
        self.elo_tags = [[1720] * num_tags for _ in range(num_bootstraps)]
        self.n = 0
        self.n_tags = [[0] * num_tags for _ in range(num_bootstraps)]

In [14]:
class Player3:
    """Sparse per-run tag ratings for one user.

    `bootstraps` holds numBootstraps + 1 dicts mapping tag position -> Elo.
    Slot 0 is used for the real tag assignment and the remaining slots for
    the randomised ones. Entries are created lazily by the rating loop.
    """

    def __init__(self):
        self.bootstraps = [{} for _ in range(numBootstraps + 1)]
        self.n = 0  # number of attempts processed for this user

In [None]:
# Replay every attempt once per run (the real tag assignment plus every
# bootstrap), updating the user's Elo for whichever tag the problem carries in
# that run. Problem ratings are taken from the file, not updated here.
playerDict2 = {}
unknown_problems = set()  # ids seen in the attempt log but absent from problemDict2

for index, row in f.iterrows():
    if index % 100000 == 10000:
        print(index)

    playerID = row["user_hash"]
    if playerID not in playerDict2:
        p = Player3()
        playerDict2[playerID] = p
    else:
        p = playerDict2[playerID]

    problemID = row["tactics_problem_id"]
    if problemID in problemsWithoutTags:
        continue
    problem = problemDict2.get(problemID)
    if problem is None:
        # Same policy as the first rating loop: report once, then skip,
        # instead of dying with a KeyError part-way through a long run.
        if problemID not in unknown_problems:
            print("error, problem", problemID, "doesn't exist?")
            unknown_problems.add(problemID)
        continue

    # NOTE(review): int() drops any fractional part of problemRating — confirm
    # that this truncation is intended.
    pRating = int(row["problemRating"])
    passed = int(row["is_passed"])

    for bootstrap, user_dict in enumerate(p.bootstraps):
        ind = problem.tags[bootstrap]
        # A tag not seen before in this run starts at 1720.
        userRating = user_dict.get(ind, 1720)
        user_dict[ind] = calculateElo(userRating, pRating, passed)
    p.n += 1
        

10000
110000
210000
310000
410000
510000
610000
710000
810000
910000
1010000
1110000
1210000
1310000
1410000
1510000
1610000
1710000
1810000
1910000
2010000
2110000
2210000
2310000
2410000
2510000
2610000
2710000
2810000
2910000
3010000
3110000
3210000
3310000
3410000
3510000
3610000
3710000
3810000
3910000
4010000
4110000
4210000
4310000
4410000
4510000
4610000
4710000
4810000
4910000
5010000
5110000
5210000
5310000
5410000
5510000
5610000
5710000
5810000
5910000
6010000
6110000
6210000
6310000
6410000
6510000
6610000
6710000
6810000
6910000
7010000
7110000
7210000
7310000
7410000
7510000
7610000
7710000
7810000
7910000
8010000
8110000
8210000
8310000
8410000
8510000
8610000
8710000
8810000
8910000
9010000
9110000
9210000
9310000
9410000
9510000
9610000
9710000
9810000
9910000
10010000
10110000
10210000
10310000
10410000
10510000
10610000
10710000
10810000
10910000
11010000
11110000
11210000
11310000
11410000
11510000
11610000
11710000
11810000
11910000
12010000
12110000
12210000
1231

In [170]:
#OK, now for each player we need to calculate the variance 
#for each bootstrap + the original


# Accumulators filled by the next cell: one variance per player for the real
# run, and one variance per (player, bootstrap) pair for the bootstrap runs.
var_actual = []
var_bootstraps = []
# Displays the number of players.
len(playerDict2)

55912

In [172]:
# Variance of each player's per-tag ratings: once for the real run (slot 0)
# and once for every bootstrap run (slots 1..).
# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of every value.
var_actual = []
var_bootstraps = []

z = 0
for p in playerDict2.values():
    z += 1
    if z % 1000 == 10:
        print(z)

    var_actual.append(np.var(list(p.bootstraps[0].values())))
    var_bootstraps.extend(
        np.var(list(bootstrap.values())) for bootstrap in p.bootstraps[1:]
    )
        
    


10
1010
2010
3010
4010
5010
6010
7010
8010
9010
10010
11010
12010
13010
14010
15010
16010
17010
18010
19010
20010
21010
22010
23010
24010
25010
26010
27010
28010
29010
30010
31010
32010
33010
34010
35010
36010
37010
38010
39010
40010
41010
42010
43010
44010
45010
46010
47010
48010
49010
50010
51010
52010
53010
54010
55010


In [173]:
# Mean per-player variance of tag ratings: real run first, then bootstraps.
print(np.mean(var_actual))
print(np.mean(var_bootstraps))

5734.030232035342
3051.3221384618773


In [174]:
# One row per variance; the second column is 0 for the real run and 1 for a
# bootstrap run. The handle is deliberately not named `f`, so the attempt-log
# DataFrame bound to `f` survives this cell.
with open("variance_bootstrap.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["variance", "bootstrap"])
    writer.writerows([variance, 0] for variance in var_actual)
    writer.writerows([variance, 1] for variance in var_bootstraps)


In [175]:
#indices = [tag_list.index(tag) for tag in most_common_tags]
# Reverse lookup: position in tag_list -> tag name, for the tracked tags only.
index_to_tag = {tag_list.index(name): name for name in most_common_tags}
    


In [177]:
# Check that the mapping's keys line up with `indices`.
print(index_to_tag)
print(indices)

{2: 'Back Rank', 3: 'Basic Checkmates', 11: 'Decoy / Deflection', 29: 'Defense', 13: 'Discovered Attack', 21: 'Mate in 1', 30: 'Zwischenzug', 12: 'Mate in 2', 31: 'Endgame Tactics', 39: 'Double', 23: 'Trapped Piece', 28: 'Simplification', 14: 'Fork / Double Attack', 15: 'Hanging Piece', 5: 'Mate in 3+', 6: 'Mating Net', 17: 'Pin', 19: 'Remove the Defender', 8: 'Sacrifice', 9: 'Vulnerable King'}
[2, 3, 11, 29, 13, 21, 30, 12, 31, 39, 23, 28, 14, 15, 5, 6, 17, 19, 8, 9]


In [179]:
#how to build the inputs for the correlation matrices
#map tag indices to tag names (index_to_tag) for the column headers
#go through every user and add one row of their actual per-tag ratings to the first matrix
#go through every user and, for every bootstrap, add one row of per-tag ratings to a second matrix

def dict_no_fail(d, key, fail_value):
    """Return d[key] when key is present in d, otherwise fail_value."""
    return d[key] if key in d else fail_value

    
# Build one row of tracked-tag ratings per player for the real run, and one
# row per (player, bootstrap) for the bootstrap runs. Tags a player never
# touched in a run are filled in with 1720.
actual_lists = []
bootstrap_lists = []
z = 0
for p in playerDict2.values():
    z += 1
    if z % 1000 == 10:
        print(z)
    real_run = p.bootstraps[0]
    shuffled_runs = p.bootstraps[1:]
    actual_lists.append([dict_no_fail(real_run, ind, 1720) for ind in indices])
    bootstrap_lists.extend(
        [dict_no_fail(run, ind, 1720) for ind in indices] for run in shuffled_runs
    )

print("converting")
matrix_of_all_users_actual = np.array(actual_lists)
print("converting 2")
matrix_of_all_users_bootstraps = np.array(bootstrap_lists)

        
    

57010
58010
59010
60010
61010
62010
63010
64010
65010
66010
67010
68010
69010
70010
71010
72010
73010
74010
75010
76010
77010
78010
79010
80010
81010
82010
83010
84010
85010
86010
87010
88010
89010
90010
91010
92010
93010
94010
95010
96010
97010
98010
99010
100010
101010
102010
103010
104010
105010
106010
107010
108010
109010
110010
111010
converting
converting 2


In [180]:
# Dump both rating tables with tag names as the header row. The handle is
# deliberately not named `f`, so the attempt-log DataFrame bound to `f`
# survives this cell.
tag_header = [index_to_tag[ind] for ind in indices]

with open("user_tag_elos_for_cor_matrix.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(tag_header)
    writer.writerows(actual_lists)

with open("bootstrap_tag_elos_for_cor_matrix.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(tag_header)
    writer.writerows(bootstrap_lists)

In [19]:
#calculate the mean for each bootstrap
# For each run (slot 0 = real tags, slots 1.. = bootstraps), compute the
# variance of every player's per-tag ratings and average it over all players.
# Every player has the same number of slots, so any one player gives the count.
p1 = next(iter(playerDict2.values()))
mean_variances = []

print(len(p1.bootstraps))

for i in range(len(p1.bootstraps)):
    if i % 10 == 2:
        print(i)
    per_player_variance = [
        np.var(list(player.bootstraps[i].values()))
        for player in playerDict2.values()
    ]
    mean_variances.append(np.mean(per_player_variance))
    
        

1001
2
12
22
32
42
52
62
72
82
92
102
112
122
132
142
152
162
172
182
192
202
212
222
232
242
252
262
272
282
292
302
312
322
332
342
352
362
372
382
392
402
412
422
432
442
452
462
472
482
492
502
512
522
532
542
552
562
572
582
592
602
612
622
632
642
652
662
672
722
732
742
752
762
772
782
792
802
812
822
832
842
852
862
872
882
892
902
912
922
932
942
952
962
972
982
992


In [21]:
# First data row is the real run (is_bootstrap = 0); every following row is a
# bootstrap run (is_bootstrap = 1). The handle is deliberately not named `f`,
# so the attempt-log DataFrame bound to `f` survives this cell.
with open("variances_of_each_bootstrap.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["Variance", "is_bootstrap"])
    writer.writerow([mean_variances[0], 0])
    writer.writerows([item, 1] for item in mean_variances[1:])
