In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import sklearn, numpy
import itertools
import json_helper_functions as jshf
import csv
import sys, os, re, time, math, logging
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, ShuffleSplit
from sklearn.metrics import f1_score, precision_recall_curve, regression, average_precision_score, precision_score, recall_score, confusion_matrix
import pickle
## classifiers
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#### GLOBAL VARIABLES
# switch for the (untested) smoke-test snippets that follow the definitions below
TEST = False
# data format version expected in the header row of every CSV file
global_version = "8e"
# directory that is scanned for CSV files
csvpath="/../dota/csv/"
# session-wide dict collecting all classification results; an existing one is
# kept so that re-running this cell does not throw earlier results away
if 'resultdict' not in globals():
	resultdict = {}
### logging ###
# module logger: DEBUG level, with a NullHandler attached
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.NullHandler())

In [None]:
def importCsvFile(filepath, filename, append_target_names = False, data_version = global_version):
	"""Read one CSV data file into a float feature matrix and a boolean target vector.

	File layout as consumed by this function:
	  row 1: <number of columns incl. target>, <number of samples>, <version>, <target column name>
	  row 2: column names
	  row 3: column types (skipped, every feature is parsed as float64)
	  rest : one sample per row; the target is True iff its cell is the string 'True'

	Parameters
	----------
	filepath : str
		Directory prefix; it is concatenated with ``filename`` as-is, so it must
		end with a path separator.
	filename : str
		Name of the CSV file.
	append_target_names : bool
		If True, feature names and target names are returned as well.
	data_version : str
		Version the file header must carry.

	Returns
	-------
	(data, target, version), or
	(data, target, feature_names, target_names, version) if ``append_target_names``.

	Raises
	------
	AssertionError
		If the version in the header differs from ``data_version``.
	IndexError
		If the file holds more sample rows than its header declares.

	If the file holds fewer sample rows than declared, the arrays are cut down
	to the rows actually read instead of returning uninitialised memory.
	"""
	with open(filepath+filename,'r', newline='') as csvfile:
		contents = csv.reader(csvfile)
		header = next(contents)
		n_features = int(header[0]) - 1 # -1 because of the target/evaluation value
		n_samples = int(header[1])
		version = header[2]
		assert version == data_version, "file has version {}, but the wanted version is {}!".format(version, data_version)
		target_feature_name = header[3]
		tfeatures = next(contents)
		next(contents) # type row: not currently used, everything is a float
		target_feature_column = tfeatures.index(target_feature_name)
		tfeatures.pop(target_feature_column)
		data = numpy.empty((n_samples, n_features), dtype=numpy.float64)
		# plain `bool`: the `numpy.bool` alias no longer exists in current NumPy
		target = numpy.empty((n_samples,), dtype=bool)
		rows_read = 0
		for ir in contents:
			if not ir:
				# csv.reader yields [] for blank lines (e.g. a trailing newline)
				continue
			if rows_read >= n_samples:
				raise IndexError("file {} contains more sample rows than the {} declared in its header".format(filename, n_samples))
			# take target value away first
			target_val = ir.pop(target_feature_column)
			target[rows_read] = (target_val == 'True')
			# then take the rest of the features
			data[rows_read] = numpy.asarray(ir, dtype=numpy.float64)
			rows_read += 1
	if rows_read < n_samples:
		# numpy.empty() leaves the unread rows uninitialised; drop them
		data = data[:rows_read]
		target = target[:rows_read]
	if append_target_names:
		feature_names = numpy.array(tfeatures)
		target_names = numpy.array(['Dead', 'Alive'])
		return data, target, feature_names, target_names, version
	return data, target, version

def checkFileVersion(filepath, wanted_version = global_version):
	"""Tell whether the CSV file at ``filepath`` declares ``wanted_version``.

	The version is the third cell of the first row. Files that are empty, whose
	first row has fewer than three cells, or that cannot be decoded/parsed as
	CSV text are reported as not matching (False) instead of raising, because
	this check is run on arbitrary directory entries. Errors from opening the
	file (e.g. a missing file) still propagate.
	"""
	try:
		with open(filepath,'r', newline='') as csvfile:
			header = next(csv.reader(csvfile), None)
	except (UnicodeDecodeError, csv.Error):
		return False
	if header is None or len(header) < 3:
		return False
	return header[2] == wanted_version

if TEST:
	# smoke test: load a single file (with names) and dump the resulting tuple
	print(importCsvFile(csvpath, "2155844500-8.csv", append_target_names=True, data_version="8"))

In [None]:
# create cache (kept if it already exists, so re-running the cell does not empty it)
if 'csv_import_cache' not in globals():
	csv_import_cache = {}
# import CSVs
def csvImport(filepath, filename_regex="[0-9]*[-][0-9]*[a-z]*[.]csv$", return_X_y=False, wanted_version=global_version, force_reload=True, ignore_features=None):
	"""Load every matching CSV file of one data version from a directory.

	Parameters
	----------
	filepath : str
		Directory to scan; it is concatenated with the file names as-is, so it
		must end with a path separator.
	filename_regex : str
		Only file names matched by ``re.match`` with this pattern are read.
	return_X_y : bool
		If True, return the tuple ``(data, target)`` instead of a Bunch.
	wanted_version : str
		Only files whose header declares this version are read.
	force_reload : bool
		If False and data of ``wanted_version`` is in ``csv_import_cache``, the
		cached arrays are used instead of reading the directory again.
	ignore_features : container of str, optional
		Feature names whose columns are dropped from the result (the cache
		always keeps the full feature set).

	Returns
	-------
	``(data, target)`` or ``Bunch(data, target, target_names, feature_names)``.

	Raises
	------
	FileNotFoundError
		If no file matching the regex and the version is found.

	Every file is read exactly once: feature/target names are taken from the
	first matching file and its samples are not added a second time. Files are
	processed in sorted name order so the sample order is reproducible.
	"""
	cached = None if force_reload else csv_import_cache.get(wanted_version)
	if cached:
		data, target, target_names, feature_names, number_of_files = cached
		print("fetched data of {} files from cache".format(number_of_files))
	else:
		data_parts = []
		target_parts = []
		feature_names = None
		target_names = None
		for elem in sorted(os.listdir(filepath)):
			if not re.match(filename_regex, elem):
				logger.debug("filename regex did not match %s", elem)
				continue
			if not os.path.isfile(filepath+elem) or not checkFileVersion(filepath+elem, wanted_version):
				continue
			logger.info("reading file %s", elem)
			if feature_names is None:
				# the first usable file also provides the metadata
				logger.debug("getting metadata from file %s", elem)
				newdata, newtarget, feature_names, target_names, file_version = importCsvFile(filepath, elem, data_version=wanted_version, append_target_names=True)
			else:
				newdata, newtarget, file_version = importCsvFile(filepath, elem, data_version=wanted_version)
			data_parts.append(newdata)
			target_parts.append(newtarget)
		if not data_parts:
			raise FileNotFoundError("no file of version {} found in the directory {}".format(wanted_version,filepath))
		# one concatenation instead of a numpy.append per file (which copies everything each time)
		data = numpy.concatenate(data_parts, axis=0)
		target = numpy.concatenate(target_parts, axis=0)
		number_of_files = len(data_parts)
		csv_import_cache.update({wanted_version:(data, target, target_names, feature_names, number_of_files)})
	if ignore_features is not None:
		# boolean column mask: True for every feature that is kept
		keep = numpy.array([name not in ignore_features for name in feature_names], dtype=bool)
		data = data[:, keep]
		feature_names = feature_names[keep]
	if return_X_y:
		return (data, target)
	print("loaded {} files".format(number_of_files))
	return Bunch(data=data, target=target, target_names=target_names, feature_names=feature_names)
				# ["name", "maxHealth", "curHealth", "posX", "posY", "gold", "xp", "level", "team", "lastDamage"])

if TEST:
	# smoke test: load (or fetch from cache) the "8e" data and split the Bunch into its two arrays
	dota_alldata = csvImport(csvpath, wanted_version="8e", force_reload=False)
	dota_data, dota_target = (dota_alldata.get(key) for key in ("data", "target"))

In [None]:
def printDataProperties(alldata):
	"""Print how many True and how many False targets ``alldata`` contains.

	``alldata`` is any mapping with a ``"target"`` entry (e.g. the Bunch
	returned by csvImport). The counts are taken from that entry, not from
	any notebook-global variable.
	"""
	target_only = alldata.get("target")
	n_true = sum(1 for t in target_only if t)
	n_false = sum(1 for t in target_only if not t)
	print("Data set properties: {} True elements, {} False elements".format(n_true, n_false))

def printTrainTestSetShape(X, y):
	"""Print the shape of ``X`` and the count of truthy vs. falsy labels in ``y``."""
	truthy = sum(1 for label in y if label)
	falsy = sum(1 for label in y if not label)
	ratio = str(truthy)+":"+str(falsy)
	print("data set shape: {}\nTrue/False ratio:{}".format(X.shape, ratio))

if TEST:
	# smoke test: load the "3k" data, print its properties and a 30 % / 70 % train/test split
	alldata = csvImport(csvpath, wanted_version="3k", force_reload=False)
	printDataProperties(alldata)
	data_only = alldata.get("data")
	target_only = alldata.get("target")
	X_training, X_test, y_training, y_test = train_test_split(data_only, target_only, train_size=0.3, random_state=42)
	printTrainTestSetShape(X_training, y_training)
	printTrainTestSetShape(X_test, y_test)
	# labels of the data set loaded in this cell (not of the one loaded by another cell)
	print(numpy.unique(target_only))

In [None]:
def trainclf(inpclf, inpdata, collect_infos=False):
	"""Fit ``inpclf`` on ``inpdata`` and print wall-clock start/end times.

	``inpdata`` is the pair ``(X_training, y_training)``. The value returned by
	``inpclf.fit`` is handed back; with ``collect_infos=True`` it is returned
	together with a dict holding the elapsed seconds under "training time".
	"""
	features, labels = inpdata
	t_begin = time.time()
	print("Training classifier {}, start time: {}".format(str(inpclf),time.ctime()))
	fitted = inpclf.fit(features, labels)
	t_done = time.time()
	print("end:",time.ctime())
	if not collect_infos:
		return fitted
	return fitted, {"training time":t_done-t_begin}
if TEST:
	# NOTE(review): this line originally passed `clf`, which no earlier cell defines
	# (it only exists once the classifier loop further down has been run). A fresh
	# decision tree is used so the cell runs on a clean kernel - confirm the intended classifier.
	dcfitclf = trainclf(DecisionTreeClassifier(), (X_training, y_training))
	knnfitclf = trainclf(KNeighborsClassifier(3), (X_training, y_training))
	#rffitclf=trainclf(RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), (X_training, y_training))
	abfitclf=trainclf(AdaBoostClassifier(), (X_training, y_training))
	dtfitclf=trainclf(DecisionTreeClassifier(), (X_training, y_training))
	mlpfitclf = trainclf(MLPClassifier(max_iter=200, hidden_layer_sizes=(20,100,50,10)), (X_training, y_training))

In [None]:
# copied from http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix `cm` and draw it as an annotated image on the
    current matplotlib figure.

    With `normalize=True` every row is divided by its row sum before printing
    and plotting. `classes` supplies the tick labels of both axes.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, numpy.newaxis]
        cm = cm.astype('float') / row_totals
    print("Normalized confusion matrix" if normalize else 'Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = numpy.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # cell annotations: white text on cells above half of the maximum, black otherwise
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# copied from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
						n_jobs=1, train_sizes=numpy.linspace(.1, 1.0, 5)):
	"""Draw training and cross-validation score curves for ``estimator``.

	Opens a new figure, runs ``learning_curve`` with the given ``cv``,
	``n_jobs`` and ``train_sizes``, and plots the mean score per training size
	with a band of +/- one standard deviation (red: training, green:
	cross-validation). ``ylim``, if given, is unpacked into ``plt.ylim``.
	Returns the ``plt`` module so the caller can save or show the figure.
	"""
	plt.figure()
	plt.title(title)
	if ylim is not None:
		plt.ylim(*ylim)
	plt.xlabel("Training examples")
	plt.ylabel("Score")
	train_sizes, train_scores, test_scores = learning_curve(
		estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
	# (mean, std, colour, legend label) per curve, in drawing order
	curves = (
		(numpy.mean(train_scores, axis=1), numpy.std(train_scores, axis=1), "r", "Training score"),
		(numpy.mean(test_scores, axis=1), numpy.std(test_scores, axis=1), "g", "Cross-validation score"),
	)
	plt.grid()
	for mean, std, colour, _label in curves:
		plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
	#plt.tight_layout()
	for mean, _std, colour, label in curves:
		plt.plot(train_sizes, mean, 'o-', color=colour, label=label)
	plt.legend(loc="best")
	return plt

In [None]:
def _positive_class_scores(inpclf, X_test, y_pred):
	"""Return a continuous score for the True class, or ``y_pred`` if the classifier offers none."""
	classes = list(getattr(inpclf, "classes_", []))
	if hasattr(inpclf, "predict_proba") and True in classes:
		return inpclf.predict_proba(X_test)[:, classes.index(True)]
	if hasattr(inpclf, "decision_function") and len(classes) == 2 and classes[1] == True:
		# for binary problems the decision function scores classes_[1]
		return inpclf.decision_function(X_test)
	return y_pred

def scoreclf(inpclf, inpdata, collect_infos = False, class_names=None, clfname=None):
	"""Score a fitted classifier on test data and plot its confusion matrix and PR curve.

	Parameters
	----------
	inpclf : fitted classifier providing ``score`` and ``predict``
	inpdata : tuple ``(X_test, y_test)``
	collect_infos : bool
		If True, a dict with test size, per-class F1 scores and the
		true/false positive/negative counts is appended to the return value.
	class_names : sequence, optional
		Tick labels for the confusion matrix; defaults to the sorted labels
		that occur in ``y_test`` or the predictions.
	clfname : str, optional
		If given, the confusion matrix is saved as ``clfname + "-confusionmatrix.png"``.

	Returns
	-------
	``(score, precision, recall, thresholds)`` plus ``infos`` if ``collect_infos``.
	``score`` is whatever ``inpclf.score`` returns; the precision/recall curve
	is computed from class probabilities or decision values when the
	classifier provides them, and from the hard predictions otherwise.
	"""
	X_test, y_test = inpdata
	fitstarttime = time.ctime()
	print("scoring start:",fitstarttime)
	fitclfscore = inpclf.score(X_test, y_test)
	y_pred = inpclf.predict(X_test)
	clf_score = f1_score(y_test, y_pred, average=None, labels=[True, False])
	fitendtime = time.ctime()
	print("scoring end:",fitendtime)
	# numpy.std also accepts a plain Python float, unlike calling .std() on the score
	print("{}: score {}, deviation {}".format(str(inpclf), clf_score, numpy.std(fitclfscore)*2))
	# confusion counts via boolean masks (prediction first, truth second)
	true_mask = numpy.asarray(y_test, dtype=bool)
	pred_mask = numpy.asarray(y_pred, dtype=bool)
	truepositive = int(numpy.sum(pred_mask & true_mask))
	truenegative = int(numpy.sum(~pred_mask & ~true_mask))
	falsepositive = int(numpy.sum(pred_mask & ~true_mask))
	falsenegative = int(numpy.sum(~pred_mask & true_mask))
	conf_matr = confusion_matrix(y_test, y_pred)
	if class_names is None:
		# same label set and order that confusion_matrix uses when no labels are given
		seen_labels = numpy.unique(numpy.concatenate((numpy.asarray(y_test), numpy.asarray(y_pred))))
		class_names = [str(label) for label in seen_labels]
	plt.figure()
	plot_confusion_matrix(conf_matr, classes=class_names, normalize=True, title='Confusion matrix, normalized')
	plt.tight_layout(rect=(0,0,0.9,0.9))
	if clfname is not None:
		plt.savefig(clfname+"-confusionmatrix.png",format="png")
	plt.show()
	precision_sc = precision_score(y_test, y_pred)
	recall_sc = recall_score(y_test, y_pred)
	print("precision: {}, recall: {}".format(precision_sc, recall_sc))
	# a precision-recall curve needs continuous scores; hard 0/1 predictions
	# collapse it to a single operating point
	y_score = _positive_class_scores(inpclf, X_test, y_pred)
	average_precision = average_precision_score(y_test, y_score)
	prec, rec, thresh = precision_recall_curve(y_test, y_score)
	plt.step(rec, prec, color='b', alpha=0.2, where='post')
	plt.fill_between(rec, prec, step='post', alpha=0.2, color='b')
	plt.xlabel('Recall')
	plt.ylabel('Precision')
	plt.ylim([0.0, 1.05])
	plt.xlim([0.0, 1.0])
	plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
	plt.show()
	if collect_infos:
		infos = {"test data size":len(y_test), "F1 score True":clf_score[0],"F1 score False":clf_score[1],
				 "true positive":truepositive, "true negative":truenegative,
				 "false positive":falsepositive, "false negative":falsenegative}
		return fitclfscore, prec, rec, thresh, infos
	return fitclfscore, prec, rec, thresh

if TEST:
	knnscore = scoreclf(knnfitclf, (X_test, y_test))
	# no random-forest scoring here: the call that would create `rffitclf`
	# is commented out in the training test cell, so the name does not exist
	abscore = scoreclf(abfitclf, (X_test, y_test))
	dtscore, dtprec, dtrec, dtthresh = scoreclf(dtfitclf, (X_test, y_test))
	mlpscore = scoreclf(mlpfitclf, (X_test, y_test))

In [None]:
def saveobj(filename, ob):
	"""Pickle ``ob`` into the file ``filename`` (created or overwritten, binary mode)."""
	with open(filename, "wb") as pickle_file:
		pickle.dump(ob, pickle_file)

if TEST:
	# NOTE(review): the original saved `fitclf` and `fitclfscore1`, neither of which
	# is defined by the cells above this one. The decision-tree objects created by the
	# earlier test cells are saved instead - confirm which objects were meant.
	saveobj("fitclf42.pk", dtfitclf)
	saveobj("fitclfscore42.pk", dtscore)

In [None]:
# classifier Read Eval Print Save
def clfREPS(inpclf, inpdata=False, inpdatapath=csvpath, inp_version=global_version, collect_infos=False, overwrite=False, train_test_split_size=0.3, random_split=False, validation=False):
	"""Load data, train ``inpclf``, score it, optionally plot a learning curve, and pickle the result.

	Parameters
	----------
	inpclf : unfitted classifier
	inpdata : Bunch-like mapping with "data", "target", "target_names" and
		``feature_names``; if falsy, the data is loaded via csvImport from
		``inpdatapath`` for ``inp_version`` (using the cache if possible).
	inp_version : str
		Data version, used for loading and as part of the output file names.
	collect_infos : bool
		Kept for compatibility; infos are always collected.
	overwrite : bool
		If False, an existing ``<name>.pickle`` result file is left untouched.
	train_test_split_size : float or int
		Passed to ``train_test_split`` as ``train_size``.
	random_split : bool
		Passed to ``train_test_split`` as ``shuffle``.
	validation : bool
		If True, a learning curve (100 shuffle splits, 20 % test) is plotted and saved.

	Returns
	-------
	``(fitclf, fitclfscore, infos)``; the pair ``(fitclf, infos)`` is also
	pickled to ``"<Classifier>-<version>_<n samples>.pickle"``.
	"""
	if collect_infos:
		print("collect_infos is now always on!")
	if not inpdata:
		# load input data
		inpdata = csvImport(inpdatapath, wanted_version=inp_version, force_reload=False)
	dota_data = inpdata.get("data")
	dota_target = inpdata.get("target")
	n_true = sum(1 for t in dota_target if t)
	n_false = sum(1 for t in dota_target if not t)
	logger.debug("True: %d\nFalse: %d", n_true, n_false)
	# a data set without any False sample gets an infinite ratio instead of a ZeroDivisionError
	true_false_ratio = n_true/n_false if n_false else float("inf")
	infos={"True/False ratio":true_false_ratio,"data version":inp_version, "feature_names":inpdata.feature_names}
	X_training, X_test, y_training, y_test = train_test_split(dota_data, dota_target, train_size=train_test_split_size, #random_state=42,
							 shuffle=random_split)
	printTrainTestSetShape(X_training, y_training)
	printTrainTestSetShape(X_test, y_test)
	infos.update({"training size":len(X_training), "test size":len(X_test)})
	# train classifier
	fitclf, newinfos = trainclf(inpclf, (X_training, y_training), collect_infos=True)
	infos.update(newinfos)
	# split "Name(params)" once at the first parenthesis so that nested
	# estimators inside the parameter list stay intact
	clfclassname, _paren, clfparams = str(fitclf).partition('(')
	if clfparams.endswith(')'):
		clfparams = clfparams[:-1]
	# evaluate, e.g. calculate f1 scores
	clffilename = "{}-{}_{}".format(clfclassname, inp_version, len(dota_data))
	fitclfscore, prec, rec, thresh, newinfos = scoreclf(fitclf, (X_test, y_test), collect_infos=True, class_names=inpdata["target_names"], clfname=clffilename)
	if validation:
		cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
		validation_plt = plot_learning_curve(inpclf, clffilename, dota_data, dota_target, ylim=(0.6,1.01), cv=cv, n_jobs=5)
		validation_plt.savefig(clffilename+"validation.png", format="png")
		validation_plt.show()
	infos.update(newinfos)
	# save classifier and score
	properties = re.sub(r'[\n\t ]','',clfparams)
	infos.update({"classifier":clfclassname, "clf params":properties})
	# check and write the SAME path, otherwise `overwrite=False` never protects anything
	picklepath = clffilename+".pickle"
	if not os.path.isfile(picklepath) or overwrite:
		with open(picklepath, "wb") as outp:
			pickle.dump((fitclf, infos), outp)
	return fitclf, fitclfscore, infos

In [None]:
#define classifiers to use for next cell
# one default-configured instance per class, in this order
classifiers = [make_clf() for make_clf in (
	KNeighborsClassifier,
	AdaBoostClassifier,
	DecisionTreeClassifier,
	GradientBoostingClassifier,
	RandomForestClassifier,
)]

In [None]:
# run this to get all the nice output, just remember to set the correct csvpath in the first cell
# data version used for loading and as suffix of the resultdict keys
inp_version = global_version
inpd = csvImport(csvpath, wanted_version=inp_version, force_reload=False)
for clf in classifiers:
	# train/score every classifier on the same loaded data (90 % training, shuffled)
	fitclf, fitclfscore, infos = clfREPS(clf, inpd, inp_version=inp_version, train_test_split_size=0.9, random_split=True)
	resultdict.update({infos["classifier"]+inp_version:(fitclf, fitclfscore, infos)})
# list the decision tree's feature importances next to the feature names
for elem in zip(resultdict['DecisionTreeClassifier'+inp_version][2]['feature_names'],resultdict['DecisionTreeClassifier'+inp_version][0].feature_importances_):
	print(elem)

In [None]:
# superlambda to replace tick values with state point names
def repltick(inp):
	"""Replacement callback for ``re.sub``: map a matched tick value to its state point name.

	``inp`` is a match object. The matched text "150", "180", "210" or "240"
	becomes "p1", "p2", "p3" or "p4". Any other matched text (including the
	empty match at the end of a string) is returned unchanged, so the callback
	always returns a string and unknown numeric suffixes are not silently
	removed from the name.
	"""
	tick_names = {"150": "p1", "180": "p2", "210": "p3", "240": "p4"}
	orig = inp.group(0)
	if orig is None:
		return ""
	return tick_names.get(orig, orig)

In [None]:
#create feature importance plots
# (label, importance) pairs of the decision tree, ascending by importance;
# tick suffixes in the feature names are replaced via repltick
elist = sorted(
	zip(
		(re.sub("[0-9]*$", repltick, fname) for fname in resultdict['DecisionTreeClassifier8e'][2]['feature_names']),
		resultdict['DecisionTreeClassifier8e'][0].feature_importances_,
	),
	key=lambda pair: pair[1],
)
names, data = zip(*elist)
fig, ax = plt.subplots(figsize=(8,8))
ax.set(xlabel='feature importance', ylabel='feature')
ax.grid(True, axis='x')
fig.tight_layout(rect=(0.2,0,0.95,1))
ax.barh(names, data)
#fig.savefig("dt-feature-importnce-plot.png", format='png')

In [None]:
# (label, importance) pairs of the random forest, ascending by importance;
# tick suffixes in the feature names are replaced via repltick
elist = sorted(
	zip(
		(re.sub("[0-9]*$", repltick, fname) for fname in resultdict['RandomForestClassifier8e'][2]['feature_names']),
		resultdict['RandomForestClassifier8e'][0].feature_importances_,
	),
	key=lambda pair: pair[1],
)#, reverse=True)
names, data = zip(*elist)
fig, ax = plt.subplots(figsize=(8,8))
ax.set(xlabel='feature importance', ylabel='feature')
ax.grid(True, axis='x')
fig.tight_layout(rect=(0.2,0,0.95,1))
ax.barh(names, data)
#fig.savefig("rf-feature-importance-plot.png", format='png')

In [None]:
# (label, importance) pairs of the gradient boosting model, ascending by importance;
# tick suffixes in the feature names are replaced via repltick
elist = sorted(
	zip(
		(re.sub("[0-9]*$", repltick, fname) for fname in resultdict['GradientBoostingClassifier8e'][2]['feature_names']),
		resultdict['GradientBoostingClassifier8e'][0].feature_importances_,
	),
	key=lambda pair: pair[1],
)
names, data = zip(*elist)
fig, ax = plt.subplots(figsize=(8,8))
ax.set(xlabel='feature importance', ylabel='feature')
ax.grid(True, axis='x')
fig.tight_layout(rect=(0.2,0,0.95,1))
ax.barh(names, data)
#fig.savefig("gb-feature-importance-plot.png", format='png')