# Importing dataset and libs

In [None]:
import pandas as pd
import numpy as np
import math
from functools import reduce

import matplotlib.pyplot as plt
from matplotlib.pyplot import style
style.use("ggplot")

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Load the replay dataset into a DataFrame; the path is relative to the
# notebook's working directory.
replays = pd.read_csv('../data/starcraft.csv')

In [None]:
# Preview the first rows to eyeball the column names and value ranges.
replays.head()

# Cleaning

In [None]:
# Remove the outlier reporting 1,000,000 total hours of playtime, then discard
# every row that still contains a missing value. Chaining the two steps
# returns a fresh frame instead of mutating a filtered selection in place.
replays = replays[replays['TotalHours'] != 1000000].dropna()

# Sanity check: a bare expression in the middle of a cell is never displayed,
# so print the result explicitly. Expected output is False after dropna().
print('Missing values remaining:', replays.isnull().values.any())

# Summary statistics of the cleaned data (shown as the cell output).
replays.describe()

## LeagueIndex spread

In [None]:
# Bar chart of the number of replays recorded for each LeagueIndex value,
# ordered by league.
leagueCounts = replays.LeagueIndex.value_counts().sort_index()

# One bar per league: evenly spaced positions, labelled with the league index.
labels = leagueCounts.index.tolist()
y_pos = np.arange(leagueCounts.size)

plt.figure(figsize=(16, 8))
plt.bar(y_pos, leagueCounts, alpha=1, align='center')
plt.xticks(y_pos, labels)

plt.title('Leagueindex spread')
plt.ylabel('Amount of replays')


## Combining LeagueIndexes

In [None]:
# Merge league 1 into league 2 and league 7 into league 6 (in place), then
# redraw the distribution for the combined leagues.
for merged_from, merged_into in ((1, 2), (7, 6)):
    in_source_league = replays['LeagueIndex'] == merged_from
    replays.loc[in_source_league, 'LeagueIndex'] = merged_into

leagueCounts = replays.LeagueIndex.value_counts().sort_index()
labels = leagueCounts.index.tolist()
y_pos = np.arange(leagueCounts.size)

plt.figure(figsize=(16, 8))
plt.bar(y_pos, leagueCounts, alpha=1, align='center')
plt.xticks(y_pos, labels)
plt.title('Leagueindex spread combined leagues')
plt.ylabel('Amount of replays')

# Plotting LeagueIndex against other features

In [None]:
# Scatter every feature against LeagueIndex on a shared 5x4 grid of subplots.
#
# Each entry is (column name, y-axis limits). None means the y-axis is left
# to autoscale; the explicit limits are the fixed per-feature ranges this
# notebook has always used.
feature_panels = [
    ('Age', None),
    ('HoursPerWeek', None),
    ('TotalHours', None),
    ('APM', None),
    ('SelectByHotkeys', None),
    ('AssignToHotkeys', (0.0, 0.002)),
    ('UniqueHotkeys', (0.0, 0.0004)),
    ('MinimapAttacks', (0.0, 0.005)),
    ('MinimapRightClicks', (0.0, 0.005)),
    ('NumberOfPACs', (0.0, 0.010)),
    ('GapBetweenPACs', None),
    ('ActionLatency', None),
    ('ActionsInPAC', None),
    ('TotalMapExplored', (0.0, 0.001)),
    ('WorkersMade', (0.0, 0.006)),
    ('UniqueUnitsMade', (0.0, 0.00025)),
    ('ComplexUnitsMade', (0.0, 0.001)),
    ('ComplexAbilityUsed', (0.0, 0.004)),
    ('MaxTimeStamp', None),
]

# The same league ticks on every panel (TotalHours was previously the only
# panel without them).
league_ticks = [2, 3, 4, 5, 6]

plt.figure(figsize=(20, 20))

for panel_number, (feature, y_limits) in enumerate(feature_panels, start=1):
    plt.subplot(5, 4, panel_number)
    plt.title(feature)
    # Limits are set before plotting so the scatter does not autoscale them.
    if y_limits is not None:
        plt.ylim(y_limits)
    plt.xticks(league_ticks)
    plt.scatter(replays['LeagueIndex'], replays[feature])

plt.tight_layout()

# Predicting league index

## Splitting into train and test

In [None]:
# Hold out a fraction of the replays for testing.
# Column 1 is used as the target and every column from index 2 onward as a
# feature -- presumably LeagueIndex and the per-replay statistics; verify
# against the CSV column order.
test_size = 0.2
features = replays.iloc[:, 2:].values
target = replays.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
)

### Normalize/standardize

In [None]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Feature selection

In [None]:
# Fit a linear regression on the training split and print the score it
# reports on the held-out split (R^2 for scikit-learn regressors).
reg = LinearRegression().fit(X_train, y_train)

pred = reg.predict(X_test)
r_square = reg.score(X_test, y_test)
print(r_square)

## Linear Support Vector Machines

In [None]:
# Train a linear-kernel support vector classifier on the training split and
# print its accuracy on the held-out test split.
clf = SVC(random_state=0, tol=1e-5, kernel='linear')
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# accuracy_score takes the ground truth first: (y_true, y_pred).
acc = accuracy_score(y_test, pred)
print(acc)

## KFold Cross Validation

In [None]:
# Evaluate the linear SVM with shuffled K-fold cross-validation.
#
# Feature columns are selected by name; each feature appears exactly once
# (Age used to be included twice in the matrix).
feature_columns = [
    'Age', 'HoursPerWeek', 'TotalHours', 'SelectByHotkeys', 'AssignToHotkeys',
    'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks', 'NumberOfPACs',
    'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored',
    'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilityUsed',
    'MaxTimeStamp', 'APM',
]
X = replays[feature_columns].values
y = replays['LeagueIndex'].values

kf = KFold(n_splits=2, shuffle=True)

accuracy_scores = []
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

    # Create train and test set for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Standardize using statistics from the training fold only, so nothing
    # about the test fold leaks into the scaling
    scaler = preprocessing.StandardScaler().fit(X_train)
    f_train = scaler.transform(X_train)
    f_test = scaler.transform(X_test)

    # Create model
    clf = SVC(random_state=0, tol=1e-5, kernel='linear')
    clf.fit(f_train, y_train)
    pred = clf.predict(f_test)

    # Check accuracy; accuracy_score takes (y_true, y_pred)
    acc = accuracy_score(y_test, pred)
    accuracy_scores.append(acc)

# NOTE: f_train / f_test / y_train / y_test keep the values of the final fold
# after the loop; the KNeighborsClassifier cell reads f_train and f_test.
print(accuracy_scores)
print(sum(accuracy_scores) / len(accuracy_scores))

## KNeighborsClassifier

In [None]:
# Compare k-nearest-neighbour test accuracy for n_neighbors = 20..49.
#
# NOTE: f_train / f_test (standardized features) and y_train / y_test (league
# labels) are the variables left behind by the final fold of the K-fold cell,
# so that cell must be run first.
results = []
labels = []
for i in range(20, 50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(f_train, y_train)
    results.append(knn.score(f_test, y_test))
    labels.append(i)

# One bar per neighbour count, labelled with the value of n_neighbors.
y_pos = np.arange(len(labels))
plt.figure(figsize=(16, 8))
plt.bar(y_pos, results, align='center', alpha=1)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.xticks(y_pos, labels)