# Pro CS-GO Match Prediction

## TO-DO

- More EDA
- More complex models. Models are trained on a bunch of team/player data at time of match

In [1]:
# Import necessary Python, sklearn and/or tensorflow/keras modules for loading the dataset
import os
import pandas as pd
import numpy as np


#Load the data
DATA_FILE = 'df.csv'
if not os.path.exists(DATA_FILE):
    raise Exception('Data file not found. Make sure that the file is located in the same directory as the notebook')

df = pd.read_csv(DATA_FILE, sep=',', header=0, index_col=False)

# Basic overview of data shape, size, and type
df.info()

# Print data shape via built-in methods of sklearn, pandas or tensorflow/keras (or other modules)
print('\nDataframe shape: ', df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 18 columns):
Maps played        1059 non-null int64
Wins               1059 non-null int64
Losses             1059 non-null int64
Total kills        1059 non-null int64
Total deaths       1059 non-null int64
Rounds played      1059 non-null int64
K/D Ratio          1059 non-null object
O_Maps played      1059 non-null int64
O_Wins             1059 non-null int64
O_Losses           1059 non-null int64
O_Total kills      1059 non-null int64
O_Total deaths     1059 non-null int64
O_Rounds played    1059 non-null int64
O_K/D Ratio        1059 non-null object
dates              1059 non-null object
opponents          1059 non-null object
maps               1059 non-null object
results            1059 non-null bool
dtypes: bool(1), int64(12), object(5)
memory usage: 141.8+ KB

Dataframe shape:  (1059, 18)


In [2]:
for column in df:
    print(df[column].describe(), '\n\n')

count    1059.000000
mean        8.011331
std         5.450227
min         0.000000
25%         3.000000
50%         7.000000
75%        12.000000
max        24.000000
Name: Maps played, dtype: float64 


count    1059.000000
mean        4.351275
std         3.735505
min         0.000000
25%         1.000000
50%         4.000000
75%         7.000000
max        17.000000
Name: Wins, dtype: float64 


count    1059.000000
mean        3.590179
std         2.535222
min         0.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        13.000000
Name: Losses, dtype: float64 


count    1059.00000
mean      725.84797
std       512.43665
min         0.00000
25%       295.00000
50%       628.00000
75%      1114.00000
max      2204.00000
Name: Total kills, dtype: float64 


count    1059.000000
mean      708.674221
std       481.068481
min         0.000000
25%       300.500000
50%       628.000000
75%      1064.500000
max      2125.000000
Name: Total deaths, dtype: float

In [3]:
df.loc[df['K/D Ratio'] == ' ','K/D Ratio'] = np.nan        # If K/D is NaN, no data exists for team on map
df['K/D Ratio'] = pd.to_numeric(df['K/D Ratio'])
df.dropna(subset=['K/D Ratio'], inplace=True) 

df.loc[df['O_K/D Ratio'] == ' ','O_K/D Ratio'] = np.nan        # If K/D is NaN, no data exists for team on map
df['O_K/D Ratio'] = pd.to_numeric(df['O_K/D Ratio'])
df.dropna(subset=['O_K/D Ratio'], inplace=True)

df = df[df['maps'] != 'false']  # games where opponent data was unknown had map set to false
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 562 entries, 0 to 1049
Data columns (total 18 columns):
Maps played        562 non-null int64
Wins               562 non-null int64
Losses             562 non-null int64
Total kills        562 non-null int64
Total deaths       562 non-null int64
Rounds played      562 non-null int64
K/D Ratio          562 non-null float64
O_Maps played      562 non-null int64
O_Wins             562 non-null int64
O_Losses           562 non-null int64
O_Total kills      562 non-null int64
O_Total deaths     562 non-null int64
O_Rounds played    562 non-null int64
O_K/D Ratio        562 non-null float64
dates              562 non-null object
opponents          562 non-null object
maps               562 non-null object
results            562 non-null bool
dtypes: bool(1), float64(2), int64(12), object(3)
memory usage: 79.6+ KB


In [4]:
# Some feature engineering
df['kills_per_round'] = df['Total kills'] / df['Rounds played']
df['kills_per_map'] = df['Total kills'] / df['Maps played']
df['win_percentage'] = df['Wins'] / df ['Maps played']


df['O_kills_per_round'] = df['O_Total kills'] / df['O_Rounds played']
df['O_kills_per_map'] = df['O_Total kills'] / df['O_Maps played']
df['O_win_percentage'] = df['O_Wins'] / df ['O_Maps played']

# Drop poor features
df.drop(['dates',
         'opponents'], 
        axis=1,
        inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 562 entries, 0 to 1049
Data columns (total 22 columns):
Maps played          562 non-null int64
Wins                 562 non-null int64
Losses               562 non-null int64
Total kills          562 non-null int64
Total deaths         562 non-null int64
Rounds played        562 non-null int64
K/D Ratio            562 non-null float64
O_Maps played        562 non-null int64
O_Wins               562 non-null int64
O_Losses             562 non-null int64
O_Total kills        562 non-null int64
O_Total deaths       562 non-null int64
O_Rounds played      562 non-null int64
O_K/D Ratio          562 non-null float64
maps                 562 non-null object
results              562 non-null bool
kills_per_round      562 non-null float64
kills_per_map        562 non-null float64
win_percentage       562 non-null float64
O_kills_per_round    562 non-null float64
O_kills_per_map      562 non-null float64
O_win_percentage     562 non-null float6

In [5]:
corr_matrix = df.corr()
corr_matrix['results'].sort_values(ascending=False)

results              1.000000
kills_per_map        0.102742
kills_per_round      0.091076
Wins                 0.082249
K/D Ratio            0.072308
Total kills          0.068358
Rounds played        0.062397
win_percentage       0.062331
Total deaths         0.059400
Maps played          0.053990
O_Losses            -0.000890
Losses              -0.013009
O_kills_per_round   -0.054464
O_K/D Ratio         -0.062239
O_win_percentage    -0.086554
O_kills_per_map     -0.091288
O_Total deaths      -0.103922
O_Maps played       -0.104602
O_Rounds played     -0.110485
O_Total kills       -0.113236
O_Wins              -0.140927
Name: results, dtype: float64

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

In [7]:
cat_df_list = list(df.select_dtypes(include=['object']))
num_df_list = list(df.select_dtypes(include=['float64', 'int64']))

In [8]:
# pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    
])

# pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])

X_pipeline = ColumnTransformer([
    ('cat', cat_pipeline, cat_df_list),
    ('num', num_pipeline, num_df_list),
])

In [9]:
X = df.drop(columns=['results'])
y = df['results']

# Split to train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prep data with pipeline
X_prepared = X_pipeline.fit_transform(X)  # Whole set ran through pipeline for cross-val
X_train_prepared = X_pipeline.transform(X_train)
X_test_prepared = X_pipeline.transform(X_test)

In [10]:
from sklearn.svm import SVC
from sklearn import metrics

# Quick SVM just for fun
svm = SVC(kernel='linear', gamma='scale', degree=1)
svm_cv_score = cross_val_score(svm, X_prepared, y.values, cv=10, scoring='roc_auc')
print('Mean AUC Score - SVM: ', svm_cv_score.mean())

svm.fit(X_train_prepared, y_train.values)
svm_preds = svm.predict(X_test_prepared)
svm_fpr, svm_tpr, svm_threshold = metrics.roc_curve(y_test.values, svm_preds)
svm_roc_auc = metrics.auc(svm_fpr, svm_tpr)
svm_roc_acc = metrics.accuracy_score(y_test.values, svm_preds)

print('Test Batch AUC: ', svm_roc_auc)
print('Test Batch Accuracy: ', svm_roc_acc)

Mean AUC Score - SVM:  0.45148770311813785
Test Batch AUC:  0.5697831978319783
Test Batch Accuracy:  0.672566371681416
