# In this notebook we load the extracted position-comment pairs for

* Analysis
* Visualisation
* Cleaning
* Transcription error correction

In [71]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [73]:
path = 'videos'
banter_videos = []

import os
count = 0
for subdir, dirs, files in os.walk(path):
    for file in files:
        filepath = subdir + os.sep + file
        if filepath.endswith(".mp4") and ('banter' in filepath or 'Banter' in filepath):
            banter_videos.append(filepath)
            count+=1
print(f"""{count} banter blitz videos.""")

1631 banter blitz videos.


In [74]:
import pandas as pd

def video_2_data_path(video_path):
    path_parts = video_path.split('/')
    path_parts[-1] = 'fen_comment_data.csv'
    data_path = '/'.join(path_parts)
    return data_path
    
def load_data_df(data_path):
    df = pd.read_csv(data_path)
    return df

def data_exists(data_path):
    return os.path.exists(data_path)

In [75]:
banter_data = [video_2_data_path(video_path) for video_path in banter_videos 
               if data_exists(video_2_data_path(video_path))]
len(banter_data)

875

In [76]:
banter_dfs = [load_data_df(data_path) for data_path in banter_data]
banter_dfs = [df for df in banter_dfs if len(df)>0]
len(banter_dfs)

704

In [77]:
sum([len(df) for df in banter_dfs])

342888

In [70]:
df = pd.concat(banter_dfs)
df.head()

Unnamed: 0.1,Unnamed: 0,game_number,fen,color,moves,frame_number,next_frame_number,comment
0,0,0,rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR,white,['g1f3'],170,790,"Hello everybody, welcome to round 2 of Hunter..."
1,1,0,rnbqkbnr/pppppppp/8/8/8/5N2/PPPPPPPP/RNBQKB1R,black,"['d7d5', 'g2g3']",790,870,"match. I didn't really watch his first match,..."
2,2,0,rnbqkbnr/ppp1pppp/8/3p4/8/5NP1/PPPPPP1P/RNBQKB1R,black,['b8c6'],870,930,"match. I didn't really watch his first match,..."
3,3,0,r1bqkbnr/ppp1pppp/2n5/3p4/8/5NP1/PPPPPP1P/RNBQ...,white,"['d2d4', 'c8f5']",930,1220,"match. I didn't really watch his first match,..."
4,4,0,r2qkbnr/ppp1pppp/2n5/3p1b2/3P4/5NP1/PPP1PP1P/R...,white,"['f1g2', 'e7e6']",1220,1420,"much time, so I just, this is the only line I..."


In [14]:
df = df.dropna()

In [15]:
len(df.comment.unique())

104060

In [16]:
len(df.dropna())

163203

In [17]:
len(' '.join(df.comment.unique()).split())

2986801

In [19]:
text = ' '.join(df.comment.unique())

In [20]:
words = text.split()
len(set(words))

44775

In [21]:
for punctuation in [',','.','!','?',';',':','  ','  ','  ']:
    text = text.replace(punctuation,' ')

In [22]:
words = text.split()
len(set(words))

28683

In [24]:
vocab = list(set(words))

counted_vocab = sorted([(words.count(word),word) for word in vocab])[::-1]

counted_vocab[:100]

[(133968, 'I'),
 (86962, 'to'),
 (75311, 'the'),
 (58264, 'a'),
 (52147, 'is'),
 (39077, 'this'),
 (37982, 'and'),
 (32457, "I'm"),
 (31818, 'of'),
 (30850, 'it'),
 (30242, 'that'),
 (28140, 'play'),
 (27100, 'not'),
 (27034, 'just'),
 (25746, 'on'),
 (25484, 'So'),
 (24567, 'for'),
 (24306, 'have'),
 (23836, 'here'),
 (22951, 'Okay'),
 (22454, 'you'),
 (22345, 'be'),
 (21952, 'go'),
 (21497, 'think'),
 (21274, 'in'),
 (21107, 'but'),
 (19557, 'take'),
 (19512, 'my'),
 (19123, 'can'),
 (18142, 'so'),
 (17477, 'like'),
 (16724, 'going'),
 (16472, "it's"),
 (16369, 'with'),
 (16350, "don't"),
 (16013, 'now'),
 (15964, 'And'),
 (15365, 'he'),
 (14616, 'Yeah'),
 (14468, 'was'),
 (14224, 'do'),
 (13550, "let's"),
 (13191, 'very'),
 (12882, 'if'),
 (12635, 'some'),
 (12580, "Let's"),
 (11979, 'me'),
 (11812, 'good'),
 (11724, 'know'),
 (11596, 'Queen'),
 (11194, 'should'),
 (10772, 'then'),
 (10613, 'we'),
 (10565, 'bishop'),
 (10464, 'what'),
 (10009, 'knight'),
 (9607, 'really'),
 (9520, '

In [26]:
counted_vocab[-400:]

[(1, 'Audimong'),
 (1, "Audibon's"),
 (1, 'Aubergain'),
 (1, 'Attention'),
 (1, 'Aton'),
 (1, 'Asum'),
 (1, 'Asulu'),
 (1, 'Astrovsky'),
 (1, 'Astrits'),
 (1, 'Astralhands'),
 (1, 'Astana'),
 (1, 'Assution'),
 (1, 'Aspity'),
 (1, 'Asparo'),
 (1, 'Askerrick'),
 (1, 'Ashelagraph'),
 (1, 'Ases'),
 (1, 'Arya'),
 (1, 'Artificial'),
 (1, 'Arthouse'),
 (1, 'Arrowfold'),
 (1, 'Arrimine'),
 (1, 'Arpoles'),
 (1, 'Arota'),
 (1, 'Arno'),
 (1, 'Arnand'),
 (1, 'Armourney'),
 (1, 'Armonia'),
 (1, 'Arkady'),
 (1, 'Arie'),
 (1, 'Argentinites'),
 (1, "Argard's"),
 (1, 'Areto'),
 (1, 'Arash'),
 (1, 'Aras'),
 (1, 'Apply'),
 (1, "Apple's"),
 (1, "Appetite's"),
 (1, 'Appel'),
 (1, 'Appeals'),
 (1, 'Antonio'),
 (1, 'Antbotvinnik'),
 (1, 'Answer'),
 (1, 'Anon'),
 (1, 'Annoin'),
 (1, 'Annergetic'),
 (1, 'Anitian'),
 (1, 'Anisholo'),
 (1, "Anish's"),
 (1, 'Anders'),
 (1, 'Ancelotti'),
 (1, 'Anbania'),
 (1, 'Anatolik'),
 (1, 'Analyzing'),
 (1, 'Anagons'),
 (1, 'Ampli'),
 (1, 'Amplening'),
 (1, 'Amphiruja'),
 (1,

In [27]:
counter = {}
for (count,word) in counted_vocab:
    counter[word] = count

In [30]:
def min_count(comment):
    for punctuation in [',','.','!','?',';',':','  ','  ','  ']:
        comment = comment.replace(punctuation,' ')
    counts = [counter[word] for word in comment.split()]
    if counts == []:
        return 1000000
    return min(counts)

df['rarest_word'] = df.comment.apply(min_count)
            

In [33]:
df.sort_values(by="rarest_word").comment.values

array([" Hello everybody, welcome to round 2 of Hunter Blitz Cup. I'm facing Russian Grandmaster  Grigory Oparre. He's usually quite strong online blitz, so I'm looking forward to a tough  match. I didn't really watch his first match, but I think he just won quite easily. Ok, so ",
       " Diwana.  It's a bit better.  Something tells me we've learned worse positions.  Yeah.  Yeah.  Yeah, I think that's.  This guy goes here.  Yeah, that's a nice, nice little square.  The night.  Seriously considering a vision to move.  Why wouldn't you?  I am pretty greedy.  This is what happens when I either come in sorry with Yester for like two weeks straight.  Like, oh my god, I'm.  Clearly, this is all Yester's fault.  Oh, it's.  Everything. ",
       ' So D4 here allows 95 also interesting, but D4 I like.  And if Q and F6 Bb G5, I will win immediately.  OK, this is cooperative move.  So I want D5, OK, D5 Bb G4, H3 also possible, my plan, OK, F6.  And now so I have space advantage and nice positio

In [39]:
df[df['rarest_word']>10]

Unnamed: 0.1,Unnamed: 0,game_number,fen,color,moves,frame_number,next_frame_number,comment,rarest_word
1,1,0,rnbqkbnr/pppppppp/8/8/8/5N2/PPPPPPPP/RNBQKB1R,black,"['d7d5', 'g2g3']",790,870,"match. I didn't really watch his first match,...",316
2,2,0,rnbqkbnr/ppp1pppp/8/3p4/8/5NP1/PPPPPP1P/RNBQKB1R,black,['b8c6'],870,930,"match. I didn't really watch his first match,...",316
3,3,0,r1bqkbnr/ppp1pppp/2n5/3p4/8/5NP1/PPPPPP1P/RNBQ...,white,"['d2d4', 'c8f5']",930,1220,"match. I didn't really watch his first match,...",138
4,4,0,r2qkbnr/ppp1pppp/2n5/3p1b2/3P4/5NP1/PPP1PP1P/R...,white,"['f1g2', 'e7e6']",1220,1420,"much time, so I just, this is the only line I...",138
5,5,0,r2qkbnr/ppp2ppp/2n1p3/3p1b2/3P4/5NP1/PPP1PPBP/...,white,"['f3h4', 'f5g4']",1420,2060,with this. I just made my board quite big. Ok...,225
...,...,...,...,...,...,...,...,...,...
126,126,7,8/3n4/4kp2/R2p1P2/6PB/4PK2/8/1r6,black,['e6d6'],142480,142560,What's his name? I'm teasing. I know he's cal...,12
127,127,7,8/3n4/3k1p2/R2p1P2/6PB/4PK2/8/1r6,white,"['a5a6', 'b1b6']",142560,142710,What's his name? I'm teasing. I know he's cal...,12
128,128,7,8/3n4/Rr1k1p2/3p1P2/6PB/4PK2/8/8,white,"['a6b6', 'd7b6']",142710,142800,Why do I keep messing rook B6? How about this...,42
146,146,8,5r1k/p1p3pp/1b1q1N2/4N3/2QP4/4B2P/Pr4P1/5RK1,black,['f8f6'],154460,154770,"This has not gone well. 9.6, Ruket 6.9.7, 3....",20


In [54]:
116+106+104+116+120+104+93+86+14+24+107

990