# **Import Required Libraries**

---



In [1]:
import pandas as pd
import os
from collections import defaultdict
import statistics
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()

# **Initialisation of Data**

---



In [2]:
# Build a lookup from dataset name (file name up to the first ".") to the
# file's path inside the assignment 2 data folder. Files called
# "description" are keyed as "<folder>_description" so that descriptions
# from different folders do not overwrite one another.
DATA_ROOT = "/course/data/a2/"

# A plain dict is used so that asking for a dataset that was not found
# raises KeyError with the missing name instead of yielding a bogus path.
files = {}
for data_types in os.listdir(DATA_ROOT):
    folder_path = os.path.join(DATA_ROOT, data_types)
    # Skip stray files sitting directly in the data root; only sub-folders
    # hold datasets.
    if not os.path.isdir(folder_path):
        continue
    for file_name in os.listdir(folder_path):
        name = file_name.split(".")[0]
        if name == 'description':
            name = data_types + "_" + name
        files[name] = os.path.join(folder_path, file_name)

# Read and save all raw data
KRmatch_raw = pd.read_csv(files['KRmatch'])
EUmatch_raw = pd.read_csv(files['EUmatch'])
NAmatch_raw = pd.read_csv(files['NAmatch'])

# Load the game description text. The file is read inside a context manager
# so the handle is closed straight away; desc_game holds the file's contents.
with open(files['games_description'], "r") as desc_file:
    desc_game = desc_file.read()

# **Helper Functions**

---



In [3]:
"""
A helper function that extracts the required columns from the data, removes
any rows with missing values, sorts the rows alphabetically by champion and
resets the index. It also derives new columns (KD and AD) from the
existing columns so that the data is easier to use.
"""

def cleaner(dataset):
    """
    Select, clean and enrich the per-match statistics of one dataset.

    Keeps only the columns used by the analysis, drops every row that has a
    missing value in any of them, sorts the rows alphabetically by champion
    and renumbers the index from 0. Two ratio columns are then appended:
    'KD' (kills / deaths) and 'AD' (assists / deaths).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw match data. It must contain every column listed in `columns`
        below, otherwise pandas raises KeyError. The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected columns followed by 'KD' and 'AD'.
    """
    columns = ['champion', 'assists', 'damage_objectives', 'damage_building', 'damage_turrets', 
    'deaths', 'kills', 'time_cc', 'damage_taken', 'vision_score', 'damage_total', 'role', 'gold_earned']

    dataset = (
        dataset[columns]               # Takes necessary columns
        .dropna()                      # Removes rows with missing values
        .sort_values(by='champion')    # Alphabetically sorted by champion
        .reset_index(drop=True)        # Fresh 0..n-1 index, old index discarded
    )

    # A death count of 0 is treated as 1 so the ratios stay finite; this is
    # done on a separate Series so the stored 'deaths' column is unchanged.
    deaths = dataset['deaths'].mask(dataset['deaths'] == 0, 1)

    # Kill-to-Death and Assist-to-Death ratios, computed column-wise rather
    # than row by row.
    dataset['KD'] = dataset['kills'] / deaths
    dataset['AD'] = dataset['assists'] / deaths

    return dataset

In [4]:
"""
A helper function that trains a decision tree classifier to predict the
role of a champion, and returns the average of all the accuracies
found, the model, the accuracy of the last fold evaluated and the confusion matrix for this model
"""

def the_classifier(df_with_roles):
    """
    Evaluate a decision tree on the data with 5-fold cross-validation,
    repeated 10 times.

    Parameters
    ----------
    df_with_roles : pandas.DataFrame
        Feature columns followed by the target as the LAST column; every
        column except the last is used as a feature.

    Returns
    -------
    tuple
        (mean accuracy over all 50 fits,
         the DecisionTreeClassifier as left by the final fit,
         accuracy of the last fold of the last repeat,
         confusion matrix summed over all folds and divided by 10).
        The confusion matrix follows scikit-learn's layout: rows are the
        true labels and columns the predicted labels, both in sorted order.
    """
    # Splitting the data frame into all the columns but the last (the target)
    X = df_with_roles.iloc[:, :-1]
    # The target column from the Data Frame
    y = df_with_roles.iloc[:, -1]

    # Fix the label set up front so every fold yields a confusion matrix of
    # the same shape, even when a test fold happens to contain one class only
    # (a smaller matrix would otherwise broadcast into every cell of the sum).
    labels = np.unique(y)
    conf_matrix = np.zeros((len(labels), len(labels)), dtype=np.int64)

    # Creating a decision tree and performing kfold with 5 splits.
    # NOTE(review): KFold does not shuffle here, so each of the 10 repeats
    # uses the same folds; only the tree's own randomness differs.
    dtree = DecisionTreeClassifier()
    kfold = KFold(n_splits=5, random_state=None)
    accuracy_val = []
    one_acc = None

    # Repeated Kfold 10 times
    for _ in range(10):
        for train_index, test_index in kfold.split(X):
            # kfold.split yields integer positions, so both X and y must be
            # indexed positionally (.iloc); y may carry a non-integer index.
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Performing classification and determining the accuracy
            dtree.fit(X_train, y_train)
            pred_values = dtree.predict(X_test)
            current_accuracy = accuracy_score(y_test, pred_values)
            accuracy_val.append(current_accuracy)
            conf_matrix += confusion_matrix(y_test, pred_values, labels=labels)
        one_acc = current_accuracy

    return sum(accuracy_val)/len(accuracy_val), dtree, one_acc, conf_matrix/10

# **Main Code**

---



## **Data Wrangling**

---



In [5]:
# Clean each region's data and collate it into one global dataset
KRmatch = cleaner(KRmatch_raw)
EUmatch = cleaner(EUmatch_raw)
NAmatch = cleaner(NAmatch_raw)
ALLmatch = pd.concat([KRmatch, EUmatch, NAmatch])

# Drop the raw death count (cleaner() has already used it to build KD and AD)
ALLmatch = ALLmatch.drop("deaths", axis = 1)

# Encoding the role column as 1 (TopLane_Jungle) / 0 (Other).
# Any other role value raises KeyError rather than being silently kept.
roles = {'TopLane_Jungle' : 1, 'Other' : 0}
ALLmatch["role"] = [roles[item] for item in ALLmatch["role"]]

# Grouping the match records by champion: Champs maps each champion name to
# the list of its match rows (with the 'champion' key removed from each row)
Champs = defaultdict(list)
ALLmatch = ALLmatch.to_dict('records')
for row in ALLmatch:
    Champs[row.pop('champion', None)].append(row)

# List to store TopLane/Jungle champions
TopLane_Jungle_Champs = []

# Iterating through the champion dictionary and taking the median of every
# numeric column per champion, in order to create a DataFrame of all the
# champions' median scores. A champion counts as TopLane/Jungle when its
# median role is above 0.
champion_median = {}

for champion in Champs.keys():
    champion_df = pd.DataFrame(Champs[champion])
    champdf_median = champion_df.median(numeric_only=True)
    if champdf_median['role'] > 0:
        TopLane_Jungle_Champs.append(champion)

    champion_median[champion] = champdf_median.to_dict()

# Transposing the data frame so that rows are champions and columns are the
# median statistics (despite the "sd" in the names, these hold medians)
champion_sd = pd.DataFrame(champion_median)
champion_sd_T = champion_sd.T

# Binary role label, taken from the RAW median role with the same rule used
# for TopLane_Jungle_Champs above. Deriving it from the standardized column
# would make a champion's class depend on the dataset-wide mean.
role_bin = [1 if median_role > 0 else 0 for median_role in champion_sd_T['role']]

# Standardizing every column to zero mean and unit standard deviation
standardized_df = (champion_sd_T-champion_sd_T.mean())/champion_sd_T.std()

# Replacing the standardized role with the binary label as the last column
standardized_df = standardized_df.drop('role', axis = 1)

standardized_df['ROLE'] = role_bin

## **Modelling**

---



We calculate the mutual information (MI) between each feature and the TopLane_Jungle label to measure
how strongly they depend on each other, so that features which contribute little to the model can be removed.

Because MI is always non-negative, we need to choose a threshold: a feature is kept if its MI is greater
than or equal to the threshold, and removed if it is below.

To determine what the threshold should be, we try a range of values, calculate the accuracy of the
classification model built with each one, and choose the threshold associated with the highest
accuracy.

In [6]:

"""
Determining which MI threshold will produce the highest accuracy
in the classification model.
"""

# ROLE is the prediction target, so it is kept out of the features being scored
feature_columns = [col for col in standardized_df.columns if col != "ROLE"]
mi_arr = mutual_info_classif(X=standardized_df[feature_columns], y=role_bin, discrete_features=False)

# MI value of every feature, rounded for the table written to Features.csv
MI_VAL = defaultdict(int)
for feature, mi in zip(feature_columns, mi_arr):
    MI_VAL[feature] = round(mi, 5)

k_ten = []
k_one = []
Threshold_used = []

# Initialised before the loop so that the first threshold evaluated always
# becomes the initial best (previously nothing was recorded when the first
# threshold was never beaten, which made the final fit fail with NameError)
best_accuracy = None
BEST_FEATURES = None
Best_Thresh = None

# Testing a range of values for the threshold
for THRESHOLD in np.arange(0,0.02, 0.004):

    #This formatting of the code was taken from the Week 8 Workshop
    # Determining which features' MI values are higher than the THRESHOLD 
    filtered_features = [feature for feature, mi in zip(feature_columns, mi_arr) if mi >= THRESHOLD]

    # A threshold that removes every feature leaves nothing to train on
    if not filtered_features:
        continue

    # the_classifier expects the target as the last column
    filtered_features.append("ROLE")

    # Calling the classification model to determine accuracy
    curr_accuracy, model, one_acc, conf_matrix = the_classifier(standardized_df[filtered_features])

    k_ten.append(curr_accuracy)
    k_one.append(one_acc)
    Threshold_used.append(THRESHOLD)

    # Checking for the best accuracy and performing
    # Feature selection 
    if best_accuracy is None or curr_accuracy > best_accuracy:
        best_accuracy = curr_accuracy
        BEST_FEATURES = filtered_features
        Best_Thresh = THRESHOLD

# One row label per threshold actually evaluated, plus the final model
Models = ["Model " + str(number + 1) for number in range(len(Threshold_used))] + ["Best Model"]

MI_DF = pd.DataFrame.from_dict(MI_VAL, orient='index', columns=["MI Value"])
MI_DF.to_csv('Features.csv', index=True)

# Creating the final model with the best features
score, Model, one_acc, conf_matrix = the_classifier(standardized_df[BEST_FEATURES])

k_ten.append(score)
k_one.append(one_acc)
Threshold_used.append(Best_Thresh)

Accuracy_DF = pd.DataFrame([Threshold_used, k_one, k_ten], index=["Threshold Used", "1 - Fold Accuracy Score", "10 - Fold Accuracy Score" ]).T
Accuracy_DF["Models"] = Models
Accuracy_DF = Accuracy_DF.set_index('Models').round(5)
Accuracy_DF.to_csv('Accuracy.csv', index=True)
score = round(score, 5)

## **Plotting Data**

---



In [7]:
# Graph Decision Tree
fig = plt.figure(figsize=(25,20))
tree.plot_tree(Model)
plt.title("Decision Tree Classifer Model: 10-Fold Classification", size=32)
plt.savefig("ModelTree.png")
plt.close() # Comment this line out to display the graph inline

# Graph Confusion Matrix
# scikit-learn's confusion matrix has the ACTUAL labels on the rows and the
# PREDICTED labels on the columns, both in sorted label order, so the
# negative class (0 = Other) comes before the positive class
# (1 = TopLane_Jungle) on each axis.
conf_matrix_df = pd.DataFrame(conf_matrix, index = ["Actual Negative", "Actual Positive"],
                  columns = ["Predicted Negative", "Predicted Positive"])
plt.figure()
plt.title("Concacted Confusion Matrix: 10-Fold Classification")
sns.heatmap(conf_matrix_df, annot=True)
plt.autoscale()
plt.savefig("confusion_matrix.png")
plt.close() # Comment this line out to display the graph inline

# Graph Density Functions
# The density plots are saved into graphs/, so make sure the folder exists
os.makedirs("graphs", exist_ok=True)

# Every selected feature except the target column
Features = [feat for feat in BEST_FEATURES if feat != "ROLE"]
Only_JGL = standardized_df.loc[standardized_df["ROLE"] > 0]
for feat in Features:
    CURR_DF = Only_JGL[feat]
    plt.figure()
    CURR_DF.plot.density(color='green')
    plt.title('Density plot for ' + feat)
    plt.ylabel("Probabilty Density Estimation")
    plt.xlabel("Standardized Score for " + feat)
    plt.autoscale()
    plt.savefig("graphs/" + feat + "-graph.png")
    plt.close()    # Comment this line out to display the graphs inline