In [1]:
# Imports
from IPython.display import clear_output
import requests
import os
import json
import pandas as pd
import time
import ast
import nltk
from nltk.corpus import stopwords
import re

from config import riot_api_key

In [2]:
# Joining together chat messages if they occurred within 10 seconds of each other. This is only
# needed for the Google sentiment analysis, which considers context.
directory = 'chatLogs'
# Largest gap between two consecutive messages that still merges them into one entry.
# NOTE(review): the "10 seconds" figure comes from the comment above; the unit of
# 'gameTime' is not visible in this notebook - confirm.
maxGap = 10000000
gameID = []
chats = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order (and therefore
# the seeded sampling done later) reproducible across machines.
for filename in sorted(os.listdir(directory)):
    # Skip stray non-log entries (same ".json" check the API download cell uses).
    if not filename.endswith(".json"):
        continue
    # Context manager so the file handle is closed right after parsing.
    with open(os.path.join(directory, filename)) as logFile:
        data = json.load(logFile)
    # The game id is whatever follows "Game " in the file name, minus the ".json" suffix.
    gameID.append(filename.split("Game ")[1][:-5])
    texts = data['text']
    messageList = []
    for idx, entry in enumerate(texts):
        # A message that follows its predecessor closely extends the current merged entry;
        # otherwise it starts a new one.
        if idx > 0 and (entry['gameTime'] - texts[idx - 1]['gameTime']) < maxGap:
            messageList[-1] += " " + entry['chat']
        else:
            messageList.append(entry['chat'])
    chats.append(messageList)

In [3]:
# Building the raw dataframe: one row per game holding its id, its merged chat messages,
# and a flat list of every whitespace-separated word from those messages.
gameDfRaw = pd.DataFrame({'gameid': gameID, 'chats': chats})
gameDfRaw['chatsflat'] = gameDfRaw['chats'].apply(
    lambda lines: [token for line in lines for token in line.split()]
)

In [4]:
# Building the stopword list: the nltk English stopwords plus a few chat abbreviations
# ("u", "ur", "im") that nltk does not include, with every character other than
# 0-9 and a-z stripped from each entry.
stops = [
    re.sub("[^0-9a-z]+", "", word)
    for word in stopwords.words('english') + ["u", "ur", "im"]
]

In [5]:
# Collecting all preprocessing steps into one function to prepare for machine learning.
# If I am able to get more data, I will probably replace these steps with a pipeline and not use pandas.
def preprocessing(flat, stop_words=None):
    """Normalize a flat list of chat words into cleaned tokens.

    Each word is lower-cased, stripped of every character outside 0-9 and a-z,
    expanded if it is one of the known game initialisms ("gg", "mb", "gj", "lol"),
    and dropped if it is a stopword. Words that are empty after stripping are dropped.

    Parameters
    ----------
    flat : iterable of str
        The words of one game's chat, in order.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stops`` list, so existing
        calls such as ``Series.map(preprocessing)`` behave as before.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    gameInitialisms = {"gg":"good game","mb":"my bad","gj":"good job","lol":"laugh out loud"}
    if stop_words is None:
        stop_words = stops
    # A set gives constant-time membership checks instead of scanning a list per token.
    stopSet = set(stop_words)
    cleaned = []
    for token in flat:
        normalized = re.sub("[^0-9a-z]+", "", token.lower())
        expanded = gameInitialisms.get(normalized, normalized)
        # split() breaks expanded initialisms into separate words and yields nothing
        # for a token that became empty, so no separate empty-string filter is needed.
        cleaned.extend(word for word in expanded.split() if word not in stopSet)
    return cleaned

In [6]:
# Running every game's flat word list through preprocessing. Encoding the text is left to
# the model notebooks so that different encoding methods can be tried there later.
gameDfRaw['chatsclean'] = gameDfRaw['chatsflat'].apply(preprocessing)

In [7]:
# Uncomment to query Riot Games API for match results.

# directory = 'chatLogs'
# for filename in os.listdir(directory):
#     if filename[-5:] == ".json":
#         clear_output()
#         match = filename.split("Game ")[1][:-5]
#         print(match)
#         r = requests.get(f"https://na1.api.riotgames.com/lol/match/v4/matches/{match}?api_key={riot_api_key}")
#         print(r.status_code)
#         json.dump(r.json(),open(f"API/{match}.json","w"))
#         time.sleep(3)

In [8]:
# Extracting match results from the response jsons. Note that custom matches do not return
# participant identities; such responses (and any response in which the summoner cannot be
# found) are skipped with a printed notice instead of crashing or reusing a stale id.
directory = 'API'
summonerName = "Shawner"
gameVerification = []
gameResult = []
# Sorted so the files are processed in a stable order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):
    # Only the saved API responses are relevant; ignore any other directory entries.
    if not filename.endswith(".json"):
        continue
    # Context manager so the file handle is closed right after parsing.
    with open(os.path.join(directory, filename)) as responseFile:
        result = json.load(responseFile)
    # Reset for every file: otherwise a match without the summoner would silently reuse
    # the participant id found in the previous file and record the wrong player's result.
    myid = None
    for participantIdentity in result.get('participantIdentities', []):
        if participantIdentity.get('player', {}).get('summonerName') == summonerName:
            myid = participantIdentity['participantId']
    if myid is None:
        print(f"Skipping {filename}: no participant identity found for {summonerName}")
        continue
    for participant in result.get('participants', []):
        if participant['participantId'] == myid:
            # File names are "<match id>.json"; drop the extension to recover the id.
            gameVerification.append(filename[:-5])
            gameResult.append(participant['stats']['win'])

In [9]:
# Putting the match outcomes in their own dataframe and joining them onto the chat data
# by game id.
resultDf = pd.DataFrame({'gameid': gameVerification, 'result': gameResult})
gameDfRes = pd.merge(gameDfRaw, resultDf, on="gameid")

In [10]:
# Final processing: turning the "True/False" results into "1/0" for the ML models, then
# keeping only the matches that still have at least one cleaned word.
gameDfRes['result'] = [won * 1 for won in gameDfRes['result']]
gameDfClean = gameDfRes.loc[gameDfRes['chatsclean'].map(len).gt(0)]

In [11]:
# Writing out the cleaned data set, classes still unbalanced, for the sentiment analysis.
gameDfClean.to_csv(path_or_buf="gameDfClean.csv")

In [12]:
# Balancing the classes by downsampling whichever class is larger to the size of the
# smaller one. The sample size is derived from the data instead of being hard-coded, so
# the classes stay balanced (and sampling cannot fail) when the data set changes.
winDf = gameDfClean[gameDfClean['result']==1]
lossDf = gameDfClean[gameDfClean['result']==0]
classSize = min(len(winDf), len(lossDf))
# The smaller class is kept untouched (original row order); only the larger one is sampled,
# with a fixed seed so the selection is reproducible.
balancedParts = [
    part if len(part) == classSize else part.sample(n=classSize, random_state=0)
    for part in (winDf, lossDf)
]
gameDf = pd.concat(balancedParts, axis=0)

In [13]:
# Writing out the class-balanced data set used by the machine learning notebooks.
gameDf.to_csv(path_or_buf="gameDf.csv")