# Text mining project

## Dota 2 chat toxicity

### Sampling public matches

In [3]:
from ratelimit import limits, sleep_and_retry
import requests

ONE_MINUTE = 60 # Length of the rate-limit window, in seconds
# Check API limit of 60 calls per minute before making request
@sleep_and_retry
@limits(calls=60, period=ONE_MINUTE)
def call_api(url, params, timeout=30):
    """Send a GET request and return the response if it succeeded.

    Args:
        url: Endpoint to query.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the sampling loop indefinitely.

    Returns:
        The response object of a call that answered with status code 200.

    Raises:
        Exception: If the server answers with any other status code.
    """
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception('API response: {}'.format(response.status_code))
    return response



matches = []  # (match_id, avg_mmr) tuples in first-seen order
_seen_matches = set()  # same tuples as `matches`, kept for constant-time duplicate checks

min_id = 6628404859  # starting value for the 'less_than_match_id' bound
url = "https://api.opendota.com/api/publicMatches"

for i in range(120):

    # NOTE: the increment itself grows with i (0, 5000, 10000, ...), so the
    # bound moves up faster and faster instead of in fixed 5000-id steps.
    min_id += i * 5000
    params = {'less_than_match_id': min_id}

    # call_api raises if the API does not answer with status 200
    response = call_api(url=url, params=params)
    data = response.json()

    for match in data:
        try:
            match_id = match['match_id']
            avg_mmr = match['avg_mmr']
        except (TypeError, KeyError):
            # Entry is not a match record: skip it instead of falling through
            # with the values of the previous entry (or with no values at all).
            continue

        # avg_mmr is None is indicative of a pro match, which we exclude.
        if avg_mmr is None:
            continue

        # Only keep matches that no previous sample already returned
        entry = (match_id, avg_mmr)
        if entry not in _seen_matches:
            _seen_matches.add(entry)
            matches.append(entry)

In [24]:
# Keep only the matches that come with an average MMR (drop avg_mmr == None).
public_matches = [
    (match_id, rating)
    for match_id, rating in matches
    if rating is not None
]

# The API quota per month is limited, so the result can be written to a txt
# file (one tuple per line) and reloaded later instead of sampling again.

#with open('public_matches.txt', 'w') as f:
#    for line in public_matches:
#        f.write(f"{line}\n")

#### Run the cell below to read the saved matches from the txt file

In [2]:
# Rebuild the list of (match_id, avg_mmr) tuples stored in public_matches.txt
public_matches = []

# Open the file in reading mode
with open("public_matches.txt", "r") as file:
    # Read the lines of the file into a list
    lines = file.readlines()

# Each line holds one tuple in its printed form, e.g. "(6629454411, 3012)"
for line in lines:
    # Strip the leading and trailing whitespace (including the newline)
    line = line.strip()
    if not line:
        # A blank line carries no match; int('') would raise a ValueError
        continue
    # Drop the parentheses and split on the comma; int() tolerates the
    # whitespace left around each field, so "(1,2)" and "(1, 2)" both parse
    values = [int(x) for x in line.strip("()").split(",")]
    # Add the tuple to the list
    public_matches.append(tuple(values))


### Get chat logs from public matches

In [3]:
# Number of (match_id, avg_mmr) tuples available for the chat-log requests
len(public_matches)

6678

#### Split data into batches

In [59]:
def chunks(xs, n):
    """Lazily split ``xs`` into consecutive slices of at most ``n`` items.

    A chunk size below 1 is treated as 1. The last slice is shorter when
    ``len(xs)`` is not a multiple of the chunk size.
    """
    size = max(1, n)
    starts = range(0, len(xs), size)
    return (xs[start:start + size] for start in starts)

batches = list(chunks(public_matches, 60)) # 112 batches of up to 60 elements (queries) each


NameError: name 'public_matches' is not defined

In [4]:
# Number of batches produced by chunks(public_matches, 60)
len(batches)

112

#### The chat dataframe we want to load everything into

In [5]:
from ratelimit import limits, sleep_and_retry
import requests
ONE_MINUTE = 60 # Length of the rate-limit window, in seconds
# Check API limit of 60 calls per minute before making request
@sleep_and_retry
@limits(calls=60, period=ONE_MINUTE)
def call_api_match(url, timeout=30, backoff=60):
    """Fetch one match from the API, returning ``None`` when the call fails.

    Args:
        url: Full URL of the match endpoint to query.
        timeout: Seconds to wait for the server before the request counts as failed.
        backoff: Seconds to pause after a failed call before returning.

    Returns:
        The response object when the API answers with status code 200,
        otherwise ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A dropped connection or timeout is treated like a bad status code,
        # so a single bad request does not abort the caller's whole loop.
        response = None

    if response is None or response.status_code != 200:
        time.sleep(backoff)
        return None
    return response

## parsed matches
import pandas as pd
import time

# Empty frame whose columns follow the structure of the modified api call:
# the chat fields plus the 'id' and 'avg_mmr' of the match they belong to.
chat_dataframe = pd.DataFrame(
    columns=[
        'time',
        'type',
        'key',
        'slot',
        'player_slot',
        'id',
        'avg_mmr',
    ],
)

#### Use batches to handle all the requests

In [72]:
def call_batch(batch):
    """Fetch the chat log of every match in ``batch`` and stack them in one frame.

    Matches whose request failed (``call_api_match`` returned ``None``) and
    matches without chat entries are skipped.

    Args:
        batch: Iterable of ``(match_id, avg_mmr)`` pairs.

    Returns:
        A DataFrame with the columns 'time', 'type', 'key', 'slot',
        'player_slot', 'id', 'avg_mmr' first, followed by any further fields
        present in the chat entries. It is empty when no match had chat data.
    """
    columns = ['time', 'type', 'key', 'slot', 'player_slot', 'id', 'avg_mmr']
    # The empty template goes first so the column order above is preserved.
    frames = [pd.DataFrame(columns=columns)]

    for match_id, mmr in batch:
        url = "https://api.opendota.com/api/matches/{}".format(match_id)

        response = call_api_match(url=url)
        if response is None:
            continue

        data = response.json()
        # We only care for matches with chat messages; .get() also covers a
        # payload that has no 'chat' key at all.
        chat_data = data.get('chat') if isinstance(data, dict) else None
        if not chat_data:
            continue

        df = pd.DataFrame(chat_data)
        df['id'] = match_id
        df['avg_mmr'] = mmr
        frames.append(df)

    # One concatenation at the end instead of re-copying the frame per match.
    return pd.concat(frames)

In [None]:
def call_subbatch(batches, verbose=False):
    """Run ``call_batch`` on every batch and stack the results in one frame.

    A batch that raises an exception is skipped so the remaining batches still
    run; the failure is reported as a warning instead of being dropped silently.

    Args:
        batches: Iterable of batches, each passed unchanged to ``call_batch``.
        verbose: When true, print one status line per batch.

    Returns:
        A DataFrame with the columns 'time', 'type', 'key', 'slot',
        'player_slot', 'id', 'avg_mmr' first, holding the rows of every batch
        that succeeded.
    """
    import warnings

    columns = ['time', 'type', 'key', 'slot', 'player_slot', 'id', 'avg_mmr']
    # The empty template goes first so the column order above is preserved.
    frames = [pd.DataFrame(columns=columns)]

    for n, batch in enumerate(batches):
        try:
            frames.append(call_batch(batch))
            status_string = "success"
        except Exception as exc:
            # `Exception` rather than a bare except: a keyboard/kernel
            # interrupt must still be able to stop a long-running scrape.
            status_string = "fail"
            warnings.warn("Batch {n} failed: {exc!r}".format(n=n, exc=exc))

        if verbose:
            print("Batch {n} finished with status: {status_string} ".format(n=n, status_string=status_string))

    return pd.concat(frames)

#### Create sub-batches, then send the API requests one sub-batch at a time to avoid running into errors

In [None]:
## Create subbatches
# batch0 .. batch4 each take 20 consecutive batches, in order
batch0, batch1, batch2, batch3, batch4 = (
    batches[start:start + 20] for start in range(0, 100, 20)
)
# batch5 takes everything from index 100 onwards
batch5 = batches[100:]

In [12]:

## Call the subbatches
# Chat logs for batches[:20]; the cell that reads the CSVs expects batch0.csv,
# so uncomment the last line to write it.
df_batch0 = call_subbatch(batch0)
#df_batch0.to_csv('batch0.csv')

In [None]:
# Chat logs for batches[20:40]; uncomment the last line to write batch1.csv
df_batch1 = call_subbatch(batch1)
#df_batch1.to_csv('batch1.csv')

In [None]:
# Chat logs for batches[40:60]; uncomment the last line to write batch2.csv
df_batch2 = call_subbatch(batch2)
#df_batch2.to_csv('batch2.csv')

In [None]:
# Chat logs for batches[60:80]; uncomment the last line to write batch3.csv
df_batch3 = call_subbatch(batch3)
#df_batch3.to_csv('batch3.csv')

In [None]:
# Chat logs for batches[80:100]; uncomment the last line to write batch4.csv
df_batch4 = call_subbatch(batch4)
#df_batch4.to_csv('batch4.csv')

In [None]:
# Chat logs for batches[100:]; uncomment the last line to write batch5.csv
df_batch5 = call_subbatch(batch5)
#df_batch5.to_csv('batch5.csv')

### Read the csvs

In [8]:
import pandas as pd
# Load the six per-sub-batch CSV exports, in order.
# NOTE(review): the displayed frame further down has 'Unnamed: 0*' columns,
# which looks like the row index being saved and re-read as data — confirm.
df0, df1, df2, df3, df4, df5 = (
    pd.read_csv(path)
    for path in (
        'batch0.csv',
        'batch1.csv',
        'batch2.csv',
        'batch3.csv',
        'batch4.csv',
        'batch5.csv',
    )
)

# Stack them into a single frame
df_list = [df0, df1, df2, df3, df4, df5]
dota_df = pd.concat(df_list)
#dota_df.to_csv('dota_df.csv')

In [13]:
# Load the combined export and keep only the rows whose type is "chat"
dota2_df = pd.read_csv('dota_df.csv')
dota_chats = dota2_df.loc[dota2_df['type'].eq("chat")]

### The pandas dataframe with the retrieved chat messages from the API

In [14]:
# Display the chat rows (truncated by pandas)
dota_chats

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,type,key,slot,player_slot,id,avg_mmr,unit
51,51,51,807,chat,GANEN SU WEBADA,9,132,6629454411,3012,
52,52,52,820,chat,GRACIAS,9,132,6629454411,3012,
53,53,53,920,chat,TREMNENDA COMPRA CUENTAS MI AM,7,130,6629454411,3012,
54,54,54,924,chat,1K DE MRD SIN ITEMS,7,130,6629454411,3012,
67,67,67,1263,chat,xdd,0,0,6629454411,3012,
...,...,...,...,...,...,...,...,...,...,...
27078,311,17,2287,chat,pango aegis?,5,128,6662919204,2082,
27079,312,18,2287,chat,wtf,5,128,6662919204,2082,
27085,318,24,2755,chat,report PA please,5,128,6662919204,2082,
27086,319,25,2775,chat,он не пендос так что нет,4,4,6662919204,2082,


## Getting more match ids

The data retrieval above resulted in 4720 chat messages. We will now try to increase the amount of gathered data.

We do this iteratively, collecting the additional match ids batch by batch.

In [51]:
# https://api.opendota.com/api/publicMatches displayed 100 matches when visited;
# their ids ranged from 6953922109 (lowest) to 6953951809 (highest).
MIN_LATEST, MAX_LATEST = 6953922109, 6953951809

# Id span of that listing (29700); each sample lowers the id bound by this amount
DIFF = MAX_LATEST - MIN_LATEST
# Sampling starts from the highest id that was displayed
MIN_ID = MAX_LATEST

# Number of API calls in each match retrieval batch
N_API = 60

from ratelimit import limits, sleep_and_retry
import requests
import time

ONE_MINUTE = 60 # Length of the rate-limit window, in seconds
# Check API limit of 60 calls per minute before making request
@sleep_and_retry
@limits(calls=60, period=ONE_MINUTE)
def call_api(url, params, timeout=30, backoff=60):
    """Send a GET request, returning ``None`` when the call fails.

    Args:
        url: Endpoint to query.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before the request counts as failed.
        backoff: Seconds to pause after a failed call before returning.

    Returns:
        The response object when the API answers with status code 200,
        otherwise ``None``.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # A dropped connection or timeout is treated like a bad status code,
        # so the caller can simply skip this call and carry on.
        response = None

    if response is None or response.status_code != 200:
        time.sleep(backoff)
        return None
    return response

def get_matches(current_min_id):
    """Page backwards through the public-match listing and collect match ids.

    Makes ``N_API`` calls. After every answered call the id bound is lowered
    by ``DIFF``; a failed call (``call_api`` returned ``None``) leaves the
    bound unchanged, so the next iteration asks for the same window again.

    Args:
        current_min_id: Value sent as 'less_than_match_id' for the first call.

    Returns:
        ``[public_matches, current_min_id]``: the unique ``(match_id, avg_mmr)``
        tuples whose MMR is not ``None`` (in first-seen order) and the id
        bound to continue from.
    """
    url = "https://api.opendota.com/api/publicMatches"
    public_matches = []
    seen = set()  # constant-time duplicate check instead of scanning the list

    for _ in range(N_API):
        params = {'less_than_match_id': current_min_id}

        # get response, continue if error calling api
        response = call_api(url=url, params=params)
        if response is None:
            continue

        for match in response.json():
            try:
                entry = (match['match_id'], match['avg_mmr'])
            except (TypeError, KeyError):
                # Not a match record: skip it rather than reusing the values
                # of the previous entry (or hitting undefined names).
                continue

            # Matches with avg_mmr == None are removed; duplicates are ignored
            if entry[1] is None or entry in seen:
                continue
            seen.add(entry)
            public_matches.append(entry)

        current_min_id = current_min_id - DIFF

    return [public_matches, current_min_id]

### Calling batches.
We still have to adhere to the API limit of 50,000 calls per month, so we can only look into so many matches. This means we stop retrieving match ids once we have gathered more than 49,800 of them.

In [52]:
batch_size = 100
batch_matches = []

for i in range(batch_size):
    # One retrieval round, continuing from the current id bound
    result = get_matches(MIN_ID)
    batch_matches += [result[0]]
    # The second element is the lowered bound for the next round
    MIN_ID = result[1]

    # Flatten everything gathered so far and drop duplicates
    concatenated_batch = []
    for sublist in batch_matches:
        concatenated_batch.extend(sublist)
    cb = list(set(concatenated_batch))

    # Stop as soon as more than 49800 unique matches are available
    if not len(cb) <= 49800:
        break

# Final flattened, de-duplicated list of matches
concatenated_batch = []
for sublist in batch_matches:
    concatenated_batch.extend(sublist)
pub_matches = list(set(concatenated_batch))

In [57]:
print(len(pub_matches)) # more than 50k unique matches were collected
# Keep the first 49800 so the per-match requests stay below the monthly API limit
pub_matches_subset = pub_matches[:49800]
# Save match ids to a txt file.
#with open('public_matches_subset.txt', 'w') as f:
#    for line in pub_matches_subset:
#        f.write(f"{line}\n")


51754


### With an increased number of matches we now do the call for each match again
The code below is copied from the earlier test run.

In [61]:
# Split the subset into batches of up to 60 matches and show how many there are
sub_batches = list(chunks(pub_matches_subset, 60))
len(sub_batches) 

830

In [62]:
from ratelimit import limits, sleep_and_retry
import requests
ONE_MINUTE = 60 # Length of the rate-limit window, in seconds
# Check API limit of 60 calls per minute before making request
@sleep_and_retry
@limits(calls=60, period=ONE_MINUTE)
def call_api_match(url, timeout=30, backoff=60):
    """Fetch one match from the API, returning ``None`` when the call fails.

    This repeats the definition given earlier in the notebook so the cell can
    be run on its own.

    Args:
        url: Full URL of the match endpoint to query.
        timeout: Seconds to wait for the server before the request counts as failed.
        backoff: Seconds to pause after a failed call before returning.

    Returns:
        The response object when the API answers with status code 200,
        otherwise ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A dropped connection or timeout is treated like a bad status code,
        # so a single bad request does not abort the caller's whole loop.
        response = None

    if response is None or response.status_code != 200:
        time.sleep(backoff)
        return None
    return response

## parsed matches
import pandas as pd
import time

# Empty frame whose columns follow the structure of the modified api call:
# the chat fields plus the 'id' and 'avg_mmr' of the match they belong to.
pub_chat_dataframe = pd.DataFrame(
    columns=[
        'time',
        'type',
        'key',
        'slot',
        'player_slot',
        'id',
        'avg_mmr',
    ],
)

In [66]:
## Create subbatches
# Each name receives 20 consecutive batches, in order. The first group is
# numbered 00-04, every later group x1-x5, and pbatch81 starts at index 800.
(
    pbatch00, pbatch01, pbatch02, pbatch03, pbatch04,
    pbatch11, pbatch12, pbatch13, pbatch14, pbatch15,
    pbatch21, pbatch22, pbatch23, pbatch24, pbatch25,
    pbatch31, pbatch32, pbatch33, pbatch34, pbatch35,
    pbatch41, pbatch42, pbatch43, pbatch44, pbatch45,
    pbatch51, pbatch52, pbatch53, pbatch54, pbatch55,
    pbatch61, pbatch62, pbatch63, pbatch64, pbatch65,
    pbatch71, pbatch72, pbatch73, pbatch74, pbatch75,
    pbatch81,
) = (sub_batches[start:start + 20] for start in range(0, 820, 20))

# The last group takes whatever is left from index 820 onwards
pbatch82 = sub_batches[820:]

### Call sub batches again

#### 0

In [None]:
# Chat logs for sub_batches[:20]; uncomment the last line to save them as CSV
df_pbatch00 = call_subbatch(pbatch00)
#df_pbatch00.to_csv('data/pbatch00.csv')

In [None]:
# Chat logs for sub_batches[20:40]; uncomment the last line to save them as CSV
df_pbatch01 = call_subbatch(pbatch01)
#df_pbatch01.to_csv('data/pbatch01.csv')

In [None]:
# Chat logs for sub_batches[40:60]; uncomment the last line to save them as CSV
df_pbatch02 = call_subbatch(pbatch02)
#df_pbatch02.to_csv('data/pbatch02.csv')

In [None]:
# Chat logs for sub_batches[60:80]; uncomment the last line to save them as CSV
df_pbatch03 = call_subbatch(pbatch03)
#df_pbatch03.to_csv('data/pbatch03.csv')

In [None]:
# Chat logs for sub_batches[80:100]; uncomment the last line to save them as CSV
df_pbatch04 = call_subbatch(pbatch04)
#df_pbatch04.to_csv('data/pbatch04.csv')

#### 1

In [80]:
# Chat logs for sub_batches[100:120], saved to data/pbatch11.csv
df_pbatch11 = call_subbatch(pbatch11)
df_pbatch11.to_csv('data/pbatch11.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [81]:
# Chat logs for sub_batches[120:140], saved to data/pbatch12.csv
df_pbatch12 = call_subbatch(pbatch12)
df_pbatch12.to_csv('data/pbatch12.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [82]:
# Chat logs for sub_batches[140:160], saved to data/pbatch13.csv
df_pbatch13 = call_subbatch(pbatch13)
df_pbatch13.to_csv('data/pbatch13.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [83]:
# Chat logs for sub_batches[160:180], saved to data/pbatch14.csv
df_pbatch14 = call_subbatch(pbatch14)
df_pbatch14.to_csv('data/pbatch14.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [84]:
# Chat logs for sub_batches[180:200], saved to data/pbatch15.csv
df_pbatch15 = call_subbatch(pbatch15)
df_pbatch15.to_csv('data/pbatch15.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 2

In [85]:
# Chat logs for sub_batches[200:220], saved to data/pbatch21.csv
df_pbatch21 = call_subbatch(pbatch21)
df_pbatch21.to_csv('data/pbatch21.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [86]:
# Chat logs for sub_batches[220:240], saved to data/pbatch22.csv
df_pbatch22 = call_subbatch(pbatch22)
df_pbatch22.to_csv('data/pbatch22.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [87]:
# Chat logs for sub_batches[240:260], saved to data/pbatch23.csv
df_pbatch23 = call_subbatch(pbatch23)
df_pbatch23.to_csv('data/pbatch23.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [88]:
# Chat logs for sub_batches[260:280], saved to data/pbatch24.csv
df_pbatch24 = call_subbatch(pbatch24)
df_pbatch24.to_csv('data/pbatch24.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [89]:
# Chat logs for sub_batches[280:300], saved to data/pbatch25.csv
df_pbatch25 = call_subbatch(pbatch25)
df_pbatch25.to_csv('data/pbatch25.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 3

In [90]:
# Chat logs for sub_batches[300:320], saved to data/pbatch31.csv
df_pbatch31 = call_subbatch(pbatch31)
df_pbatch31.to_csv('data/pbatch31.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [91]:
# Chat logs for sub_batches[320:340], saved to data/pbatch32.csv
df_pbatch32 = call_subbatch(pbatch32)
df_pbatch32.to_csv('data/pbatch32.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [92]:
# Chat logs for sub_batches[340:360], saved to data/pbatch33.csv
df_pbatch33 = call_subbatch(pbatch33)
df_pbatch33.to_csv('data/pbatch33.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [93]:
# Chat logs for sub_batches[360:380], saved to data/pbatch34.csv.
# In the recorded run below, batch 15 finished with status "fail", so its
# matches are not part of the saved file.
df_pbatch34 = call_subbatch(pbatch34)
df_pbatch34.to_csv('data/pbatch34.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: fail 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [94]:
# Chat logs for sub_batches[380:400], saved to data/pbatch35.csv
df_pbatch35 = call_subbatch(pbatch35)
df_pbatch35.to_csv('data/pbatch35.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 4

In [95]:
# Chat logs for sub_batches[400:420], saved to data/pbatch41.csv.
# In the recorded run below, batch 18 finished with status "fail", so its
# matches are not part of the saved file.
df_pbatch41 = call_subbatch(pbatch41)
df_pbatch41.to_csv('data/pbatch41.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: fail 
Batch 19 finished with status: success 


In [96]:
# Chat logs for sub_batches[420:440], saved to data/pbatch42.csv
df_pbatch42 = call_subbatch(pbatch42)
df_pbatch42.to_csv('data/pbatch42.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [97]:
# Chat logs for sub_batches[440:460], saved to data/pbatch43.csv
df_pbatch43 = call_subbatch(pbatch43)
df_pbatch43.to_csv('data/pbatch43.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [98]:
# Chat logs for sub_batches[460:480], saved to data/pbatch44.csv
df_pbatch44 = call_subbatch(pbatch44)
df_pbatch44.to_csv('data/pbatch44.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [99]:
# Chat logs for sub_batches[480:500], saved to data/pbatch45.csv
df_pbatch45 = call_subbatch(pbatch45)
df_pbatch45.to_csv('data/pbatch45.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 5

In [100]:
# Chat logs for sub_batches[500:520], saved to data/pbatch51.csv
df_pbatch51 = call_subbatch(pbatch51)
df_pbatch51.to_csv('data/pbatch51.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [101]:
# Chat logs for sub_batches[520:540], saved to data/pbatch52.csv
df_pbatch52 = call_subbatch(pbatch52)
df_pbatch52.to_csv('data/pbatch52.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [102]:
# Run sub-batch 53 through call_subbatch, then save the returned frame as CSV.
df_pbatch53 = call_subbatch(pbatch53)
df_pbatch53.to_csv(path_or_buf='data/pbatch53.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [103]:
# Run sub-batch 54 through call_subbatch, then save the returned frame as CSV.
df_pbatch54 = call_subbatch(pbatch54)
df_pbatch54.to_csv(path_or_buf='data/pbatch54.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [104]:
# Run sub-batch 55 through call_subbatch, then save the returned frame as CSV.
df_pbatch55 = call_subbatch(pbatch55)
df_pbatch55.to_csv(path_or_buf='data/pbatch55.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 6

In [105]:
# Run sub-batch 61 through call_subbatch, then save the returned frame as CSV.
df_pbatch61 = call_subbatch(pbatch61)
df_pbatch61.to_csv(path_or_buf='data/pbatch61.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [106]:
# Run sub-batch 62 through call_subbatch, then save the returned frame as CSV.
df_pbatch62 = call_subbatch(pbatch62)
df_pbatch62.to_csv(path_or_buf='data/pbatch62.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [107]:
# Run sub-batch 63 through call_subbatch, then save the returned frame as CSV.
df_pbatch63 = call_subbatch(pbatch63)
df_pbatch63.to_csv(path_or_buf='data/pbatch63.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [108]:
# Run sub-batch 64 through call_subbatch, then save the returned frame as CSV.
df_pbatch64 = call_subbatch(pbatch64)
df_pbatch64.to_csv(path_or_buf='data/pbatch64.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [109]:
# Run sub-batch 65 through call_subbatch, then save the returned frame as CSV.
df_pbatch65 = call_subbatch(pbatch65)
df_pbatch65.to_csv(path_or_buf='data/pbatch65.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### 7

In [110]:
# Run sub-batch 71 through call_subbatch, then save the returned frame as CSV.
df_pbatch71 = call_subbatch(pbatch71)
df_pbatch71.to_csv(path_or_buf='data/pbatch71.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [111]:
# Run sub-batch 72 through call_subbatch, then save the returned frame as CSV.
df_pbatch72 = call_subbatch(pbatch72)
df_pbatch72.to_csv(path_or_buf='data/pbatch72.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [112]:
# Run sub-batch 73 through call_subbatch, then save the returned frame as CSV.
df_pbatch73 = call_subbatch(pbatch73)
df_pbatch73.to_csv(path_or_buf='data/pbatch73.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


In [113]:
# Run sub-batch 74 through call_subbatch, then save the returned frame as CSV.
df_pbatch74 = call_subbatch(pbatch74)
df_pbatch74.to_csv(path_or_buf='data/pbatch74.csv')

Batch 0 finished with status: success 
Batch 1 finished with status: success 
Batch 2 finished with status: success 
Batch 3 finished with status: success 
Batch 4 finished with status: success 
Batch 5 finished with status: success 
Batch 6 finished with status: success 
Batch 7 finished with status: success 
Batch 8 finished with status: success 
Batch 9 finished with status: success 
Batch 10 finished with status: success 
Batch 11 finished with status: success 
Batch 12 finished with status: success 
Batch 13 finished with status: success 
Batch 14 finished with status: success 
Batch 15 finished with status: success 
Batch 16 finished with status: success 
Batch 17 finished with status: success 
Batch 18 finished with status: success 
Batch 19 finished with status: success 


#### Can't run any more queries because the API limit has been exceeded.

In [None]:
# Sub-batches 75, 81 and 82 are intentionally left commented out: the API
# limit was exceeded before they could be queried, so no CSV is written for them here.
#df_pbatch75 = call_subbatch(pbatch75)
#df_pbatch75.to_csv('data/pbatch75.csv')

#df_pbatch81 = call_subbatch(pbatch81)
#df_pbatch81.to_csv('data/pbatch81.csv')

#df_pbatch82 = call_subbatch(pbatch82)
#df_pbatch82.to_csv('data/pbatch82.csv')

### Read the larger query of matches

In [118]:
import os

import pandas as pd

# Tags of the sub-batch exports to load from data/pbatch<tag>.csv, in the
# order they are concatenated. Sub-batch 75 and the 8x sub-batches are absent:
# their fetch cells are commented out in this notebook.
PBATCH_TAGS = (
    '00', '01', '02', '03', '04',
    '11', '12', '13', '14', '15',
    '21', '22', '23', '24', '25',
    '31', '32', '33', '34', '35',
    '41', '42', '43', '44', '45',
    '51', '52', '53', '54', '55',
    '61', '62', '63', '64', '65',
    '71', '72', '73', '74',
)

# Read every export once, keyed by its tag (dicts keep insertion order, so
# the order of PBATCH_TAGS is the order of the frames).
pbatch_frames = {tag: pd.read_csv(f'data/pbatch{tag}.csv') for tag in PBATCH_TAGS}

# Keep the historical per-batch names (df00 ... df74) bound at notebook level.
# NOTE(review): retained only in case cells outside this section still
# reference them; drop this line once that is confirmed not to be the case.
globals().update({f'df{tag}': frame for tag, frame in pbatch_frames.items()})

# Frames in load order, ready for concatenation.
df_pbatch_list = list(pbatch_frames.values())

# The following cell reads 'dota_pbatch_df.csv'. Build it only when it is
# missing, so a fresh run works while an existing export is never rewritten.
if not os.path.exists('dota_pbatch_df.csv'):
    dota_pbatch_df = pd.concat(df_pbatch_list)
    dota_pbatch_df.to_csv('dota_pbatch_df.csv')


In [123]:
# Load dota_pbatch_df.csv and keep only the rows whose type is "chat".
dota2_pbatch_df = pd.read_csv('dota_pbatch_df.csv')
dota_chatframe = dota2_pbatch_df[dota2_pbatch_df['type'].eq("chat")]
dota_chatframe


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,type,key,slot,player_slot,id,avg_mmr,unit
74,74,74,2330,chat,feels ggood to carry this tilted animals,5,128,6946080308,6862,
208,208,133,1230,chat,dsq,8,131,6933815002,5281,
225,225,4,765,chat,笑死,9,132,6934942217,4570,
231,231,10,1559,chat,想不开？,5,128,6934942217,4570,
239,239,2,611,chat,火女？？、,7,130,6945694208,3824,
...,...,...,...,...,...,...,...,...,...,...
36948,859,11,1709,chat,gg,0,0,6944091304,3477,
36949,860,12,1714,chat,GG,0,0,6944091304,3477,
36950,861,13,1718,chat,GG,2,2,6944091304,3477,
36951,862,14,1721,chat,gg,2,2,6944091304,3477,


## Checking if we've sampled the same matches

In [125]:
# Load dota_df.csv and keep only the rows whose type is "chat".
dota2_df = pd.read_csv('dota_df.csv')
dota_chats = dota2_df[dota2_df['type'].eq("chat")]
dota_chats

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,type,key,slot,player_slot,id,avg_mmr,unit
51,51,51,807,chat,GANEN SU WEBADA,9,132,6629454411,3012,
52,52,52,820,chat,GRACIAS,9,132,6629454411,3012,
53,53,53,920,chat,TREMNENDA COMPRA CUENTAS MI AM,7,130,6629454411,3012,
54,54,54,924,chat,1K DE MRD SIN ITEMS,7,130,6629454411,3012,
67,67,67,1263,chat,xdd,0,0,6629454411,3012,
...,...,...,...,...,...,...,...,...,...,...
27078,311,17,2287,chat,pango aegis?,5,128,6662919204,2082,
27079,312,18,2287,chat,wtf,5,128,6662919204,2082,
27085,318,24,2755,chat,report PA please,5,128,6662919204,2082,
27086,319,25,2775,chat,он не пендос так что нет,4,4,6662919204,2082,


No match ids are shared between the two samples, so the two data frames can be concatenated without duplicating matches.

In [129]:
# Match ids that appear in the chat rows of each sample.
a = set(dota_chats['id'])
b = set(dota_chatframe['id'])

# Show the ids present in both samples; an empty set means no match was
# sampled twice. Without this expression the cell displays nothing.
a & b


set()

In [130]:
# Stack dota2_df and dota2_pbatch_df row-wise and save the result to disk.
# NOTE(review): the frame's index is written as a column too, which shows up
# as an extra "Unnamed: 0.x" column when the file is read back; consider
# index=False once cells that consume combined_df.csv have been checked.
combine_list = [dota2_df, dota2_pbatch_df]
combined_df = pd.concat(objs=combine_list)
combined_df.to_csv(path_or_buf='combined_df.csv')


### Read the combined data again

In [3]:
import pandas as pd

# Reload combined_df.csv from disk and keep only the rows whose type is "chat".
dota_combined_df = pd.read_csv('combined_df.csv')
dota_combined_df_chats = dota_combined_df[dota_combined_df['type'].eq("chat")]
dota_combined_df_chats

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,time,type,key,slot,player_slot,id,avg_mmr,unit
51,51,51,51,807,chat,GANEN SU WEBADA,9,132,6629454411,3012,
52,52,52,52,820,chat,GRACIAS,9,132,6629454411,3012,
53,53,53,53,920,chat,TREMNENDA COMPRA CUENTAS MI AM,7,130,6629454411,3012,
54,54,54,54,924,chat,1K DE MRD SIN ITEMS,7,130,6629454411,3012,
67,67,67,67,1263,chat,xdd,0,0,6629454411,3012,
...,...,...,...,...,...,...,...,...,...,...,...
64038,36948,859,11,1709,chat,gg,0,0,6944091304,3477,
64039,36949,860,12,1714,chat,GG,0,0,6944091304,3477,
64040,36950,861,13,1718,chat,GG,2,2,6944091304,3477,
64041,36951,862,14,1721,chat,gg,2,2,6944091304,3477,


## Retrieved chat statistics

In [4]:
# Number of distinct match ids that have at least one chat row.
chat_match_ids = set(dota_combined_df_chats["id"])
len(chat_match_ids)

900