In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import ast

In [2]:
# Read the data set from its CSV file into a DataFrame.
GAME_CSV_PATH = 'gameDf.csv'
gameDf = pd.read_csv(GAME_CSV_PATH)

In [3]:
# The CSV stores the list of chat messages as a single string; parse each cell
# back into the Python list it represents.
def parseChatList(raw):
    """Return the Python list encoded by ``raw``.

    Values that are already lists are returned unchanged, so re-running this
    cell on an already-converted column is a no-op instead of a ValueError
    from ``ast.literal_eval``. Every other value is passed to
    ``ast.literal_eval`` exactly as before.
    """
    if isinstance(raw, list):
        return raw
    return ast.literal_eval(raw)

gameDf['chatsclean'] = gameDf['chatsclean'].apply(parseChatList)

In [4]:
# BoW encoding. The text has already been processed into token lists, so both
# the preprocessing and the tokenizing step simply hand their input through.
def passThrough(tokens):
    """Return the input unchanged.

    A named function is used instead of a lambda so the vectorizer can be
    pickled.
    """
    return tokens

vectorizer = CountVectorizer(
    tokenizer=passThrough,
    preprocessor=passThrough,
    # token_pattern is ignored once a tokenizer is supplied; setting it to None
    # avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    lowercase=False,
)

In [5]:
# Build the feature matrix and label vector, then hold out a test split for validation.
# NOTE(review): the vocabulary is learned from every row before the split, so
# tokens that occur only in held-out rows still receive a column.
chatDocs = gameDf['chatsclean'].tolist()
wordCounts = vectorizer.fit_transform(chatDocs)
X = wordCounts.toarray()  # dense array of the token counts
y = gameDf['result'].to_numpy()
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=42)

In [6]:
# Define the hyper-parameter search space and run an exhaustive grid search
# over it on the training split.
# NOTE(review): the forest is created without a random_state, so repeated runs
# may select different parameters.
paramGrid = dict(
    n_estimators=[1 << exp for exp in range(6, 11)],     # 64, 128, ..., 1024
    max_features=["sqrt", "log2"],
    min_samples_leaf=[1 << exp for exp in range(1, 5)],  # 2, 4, 8, 16
)
baseForest = RandomForestClassifier(class_weight="balanced")
grid = GridSearchCV(baseForest, paramGrid, verbose=0)
grid.fit(XTrain, yTrain)

GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
             param_grid={'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [2, 4, 8, 16],
                         'n_estimators': [64, 128, 256, 512, 1024]})

In [7]:
# GridSearchCV refits the best parameter combination on the full training data
# it was given (refit=True is the default), so best_estimator_ is already
# trained on (XTrain, yTrain). Fitting it again is redundant and, because the
# forest is unseeded, would replace it with a different random forest.
# Display the selected model.
grid.best_estimator_

RandomForestClassifier(class_weight='balanced', max_features='sqrt',
                       min_samples_leaf=2, n_estimators=256)

In [8]:
# Printing all metrics of interest: fraction of "win" predictions, train/test accuracy, and f1-score.
bestModel = grid.best_estimator_
yTestPred = bestModel.predict(XTest)  # predict once and reuse below

# Mean of the predicted labels; equals the fraction of "win" predictions
# assuming labels are 0/1 with 1 = win (TODO confirm encoding of 'result').
print(np.mean(yTestPred))
print(bestModel.score(XTrain, yTrain))
print(bestModel.score(XTest, yTest))
print(classification_report(yTest, yTestPred))

0.574468085106383
0.7357142857142858
0.5957446808510638
              precision    recall  f1-score   support

           0       0.55      0.52      0.54        42
           1       0.63      0.65      0.64        52

    accuracy                           0.60        94
   macro avg       0.59      0.59      0.59        94
weighted avg       0.59      0.60      0.59        94

