In [1]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('high_diamond_ranked_10min.csv')

In [4]:
df = data.copy()

In [5]:
df.head()

Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [6]:
df = df.drop('gameId', axis=1)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [9]:
trainset['blueWins'].value_counts()

1    3953
0    3950
Name: blueWins, dtype: int64

In [10]:
testset['blueWins'].value_counts()

0    999
1    977
Name: blueWins, dtype: int64

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
def encoding(df, hypothesis):
    # No need for this dataset, no data with 'object' type
    return df

In [13]:
def feature_engineering(df, hypothesis):
    # Keep the dataset as it is
    if hypothesis == 0:
        columns = []
        
    # Hypothesis 1 : The vars like kills and deaths, or levels and xp, or total minions and cs per minute seems too similar,
    # maybe the model would work better without them
    if hypothesis == 1:
        columns = ['blueTotalExperience', 'blueCSPerMin', 'redDeaths', 'redKills', 'redTotalExperience', 'redCSPerMin']
        
    # Hypothesis 2 : We can supress the vars related to monsters, wards and towers
    elif hypothesis == 2:
        columns = ['blueWardsPlaced', 'blueWardsDestroyed', 'blueEliteMonsters', 'blueDragons', 'blueHeralds',
                        'blueTowersDestroyed',
                        'redWardsPlaced', 'redWardsDestroyed', 'redEliteMonsters', 'redDragons', 'redHeralds',
                        'redTowersDestroyed']
        
    # Both hypothesis
    elif hypothesis == 3:
        columns = ['blueWardsPlaced', 'blueWardsDestroyed', 'blueEliteMonsters', 'blueDragons', 'blueHeralds',
                        'blueTowersDestroyed', 'blueDeaths', 'blueTotalExperience', 'blueCSPerMin',
                        'redWardsPlaced', 'redWardsDestroyed', 'redEliteMonsters', 'redDragons', 'redHeralds',
                        'redTowersDestroyed', 'redDeaths', 'redTotalExperience', 'redCSPerMin']
    df = df.drop(columns, axis=1)
    return df

In [14]:
def imputation(df, hypothesis):
    # No need for this dataset, no 'nan' data
    return df

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
def preprocessing(df, hypothesis):
    df = encoding(df, hypothesis)
    df = feature_engineering(df, hypothesis)
    df = imputation(df, hypothesis)
    
    X = df.drop('blueWins', axis=1)
    y = df['blueWins']
    
    # Standardisation to keep X as a DataFrame
    scaled_features = StandardScaler().fit_transform(X)
    X = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)
    
    # OneHotEncoding
    print(y.to_numpy().reshape(-1,1))
    y = OneHotEncoder().fit_transform(y.to_numpy().reshape(-1,1))
    
    return X,y

# Modelisation

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

In [18]:
preprocessors = {
    'PCA' : PCA(n_components=3),
    'PolynomialFeature': PolynomialFeatures(3),
    'SelectKBest': SelectKBest(f_classif, k=10)
}

In [19]:
estimator = RandomForestClassifier(random_state=0)

# Evalutation hypothesis

In [20]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [21]:
def evaluation(model, displayBar = True):
    for hypothesis in range(4):
        X_train, y_train = preprocessing(trainset, hypothesis)
        X_test, y_test = preprocessing(testset, hypothesis)
        model.fit(X_train, y_train)
        ypred = model.predict(X_test)
        
        print('------- Hypothesis ' + str(hypothesis) + '-------')
        print(confusion_matrix(y_test, ypred))
        print(classification_report(y_test, ypred))
    
        N, train_score, val_score = learning_curve(model, X_train, y_train,
                                              cv=4, scoring='f1',
                                               train_sizes=np.linspace(0.1, 1, 10))
    
    
        plt.figure(figsize=(12, 8))
        plt.plot(N, train_score.mean(axis=1), label='train score')
        plt.plot(N, val_score.mean(axis=1), label='validation score')
        plt.legend()

# Test hypothesis

In [39]:
evaluation(estimator, displayBar=False)

[[1]
 [1]
 [0]
 ...
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 ...
 [0]
 [1]
 [1]]


ValueError: Unknown label type: 'unknown'

We will chose hypothesis 1

# Test processors

In [None]:
def encoding(df):
    # No need for this dataset, no data with 'object' type
    return df

def feature_engineering(df):
    columns = ['blueTotalExperience', 'blueCSPerMin', 'redDeaths', 'redKills', 'redTotalExperience', 'redCSPerMin']
    df = df.drop(columns, axis=1)
    return df

def imputation(df):
    # No need for this dataset, no 'nan' data
    return df

def preprocessing(df):
    df = encoding(df)
    df = feature_engineering(df)
    df = imputation(df)
    
    X = df.drop('blueWins', axis=1)
    y = df['blueWins']
    
    # Standardisation to keep X as a DataFrame
    scaled_features = StandardScaler().fit_transform(X)
    X = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)
    
    return X,y

def evaluation(model):
    X_train, y_train = preprocessing(trainset)
    X_test, y_test = preprocessing(testset)
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)
        
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))
    
#     N, train_score, val_score = learning_curve(model, X_train, y_train,
#                                             cv=4, scoring='f1',
#                                             train_sizes=np.linspace(0.1, 1, 10))
    
    
#     plt.figure(figsize=(12, 8))
#     plt.plot(N, train_score.mean(axis=1), label='train score')
#     plt.plot(N, val_score.mean(axis=1), label='validation score')
#     plt.legend()

In [None]:
for name, preprocessor in preprocessors.items():
    print(name)
    evaluation(make_pipeline(preprocessor, estimator))

It doesn't seems to give good result