# Prediction with Elo Rating

### Question or problem definition

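But : déterminer quel joueur va gagner un match en fonction de la différence de classement Elo entre les deux joueurs (les colonnes de rang sont écartées plus bas)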

### Acquire training and testing data

Importer les bibliothèques nécessaires :

In [1]:
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Acquire Data

In [2]:
# Load the 2014-2017 matches (training) and the 2018 matches (evaluation).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('input/balanced-2014-2017.csv', 'input/balanced-2018.csv')
)
combine = [train_df, test_df]

In [3]:
# Preview the first rows of the raw training frame.
train_df.head()

Unnamed: 0,match_id,date,player1_id,player2_id,player1_rank,player2_rank,player1_rank_points,player2_rank_points,player1_elo_rating,player2_elo_rating,won
0,155602,21/04/2014,4894,4022,50.0,15.0,947.0,2065.0,1867.0,2036.0,1
1,155603,21/04/2014,4659,5763,92.0,16.0,605.0,2040.0,1861.0,2112.0,0
2,155604,21/04/2014,3908,4789,102.0,24.0,569.0,1580.0,1871.0,2085.0,0
3,155605,21/04/2014,4467,4585,28.0,43.0,1315.0,1016.0,1979.0,1902.0,0
4,155202,06/03/2014,5231,4742,31.0,1.0,1205.0,14085.0,1992.0,2510.0,1


### Analyze by describing data

Which features are available in the dataset?

In [4]:
# List the column names available in the training frame.
print(train_df.columns.values)

['match_id' 'date' 'player1_id' 'player2_id' 'player1_rank' 'player2_rank'
 'player1_rank_points' 'player2_rank_points' 'player1_elo_rating'
 'player2_elo_rating' 'won']


In [5]:
# Report dtypes and non-null counts for the training frame, then the test
# frame; the line of underscores visually separates the two reports.
train_df.info()
print('_'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11602 entries, 0 to 11601
Data columns (total 11 columns):
match_id               11602 non-null int64
date                   11602 non-null object
player1_id             11602 non-null int64
player2_id             11602 non-null int64
player1_rank           11507 non-null float64
player2_rank           11505 non-null float64
player1_rank_points    11507 non-null float64
player2_rank_points    11505 non-null float64
player1_elo_rating     11586 non-null float64
player2_elo_rating     11586 non-null float64
won                    11602 non-null int64
dtypes: float64(6), int64(4), object(1)
memory usage: 997.2+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974 entries, 0 to 2973
Data columns (total 11 columns):
match_id               2974 non-null int64
date                   2974 non-null object
player1_id             2974 non-null int64
player2_id             2974 non-null int64
player1_r

Features contain blank, null or empty values :
- train :
    - player1_rank
    - player2_rank
    - player1_rank_points
    - player2_rank_points
    - player1_elo_rating
    - player2_elo_rating
- test :
    - player1_rank
    - player2_rank
    - player1_rank_points
    - player2_rank_points
    - player1_elo_rating
    - player2_elo_rating

In [6]:
# Keep only the matches in which both players have an Elo rating.
elo_columns = ['player1_elo_rating', 'player2_elo_rating']
train_df = train_df.dropna(subset=elo_columns)
test_df = test_df.dropna(subset=elo_columns)
combine = [train_df, test_df]

In [7]:
# Discard identifiers, the date and the ranking columns; only the two Elo
# ratings and the `won` outcome remain afterwards.
unused_columns = [
    'match_id', 'date', 'player1_id', 'player2_id',
    'player1_rank', 'player2_rank',
    'player1_rank_points', 'player2_rank_points',
]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [8]:
# Preview the training frame after the column reduction.
train_df.head()

Unnamed: 0,player1_elo_rating,player2_elo_rating,won
0,1867.0,2036.0,1
1,1861.0,2112.0,0
2,1871.0,2085.0,0
3,1979.0,1902.0,0
4,1992.0,2510.0,1


In [9]:
# Preview the test frame after the column reduction.
test_df.head()

Unnamed: 0,player1_elo_rating,player2_elo_rating,won
0,1989.0,1859.0,1
1,1948.0,1768.0,0
2,1876.0,1844.0,0
3,1801.0,1868.0,0
4,2040.0,1685.0,1


In [10]:
# Replace the two Elo columns by a single feature: player 1's rating minus
# player 2's rating. The new column is added in place on both frames first,
# then the raw rating columns are dropped.
combine = [train_df, test_df]
elo_columns = ['player1_elo_rating', 'player2_elo_rating']
for frame in combine:
    frame['dif_elo_rating'] = frame['player1_elo_rating'].sub(frame['player2_elo_rating'])
train_df, test_df = (frame.drop(columns=elo_columns) for frame in combine)
train_df.head()

Unnamed: 0,won,dif_elo_rating
0,1,-169.0
1,0,-251.0
2,0,-214.0
3,0,77.0
4,1,-518.0


### Model, predict and solve

In [11]:
# machine learning
from sklearn.linear_model import LogisticRegression

In [12]:
# Separate the feature column(s) from the `won` label for both frames.
target = "won"
X_train, Y_train = train_df.drop(columns=target), train_df[target]
X_test, Y_test = test_df.drop(columns=target).copy(), test_df[target]
X_train.shape, Y_train.shape, X_test.shape

((11586, 1), (11586,), (2958, 1))

In [13]:
# Logistic regression fitted on 2014-2017, then used to predict the 2018 outcomes.
# `fit` returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

In [34]:
from sklearn.metrics import classification_report, log_loss

In [15]:
# Per-class precision, recall and F1 of the predictions on the 2018 matches.
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.66      0.65      0.66      1519
           1       0.64      0.65      0.64      1439

    accuracy                           0.65      2958
   macro avg       0.65      0.65      0.65      2958
weighted avg       0.65      0.65      0.65      2958



In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
# Cross-tabulate the true labels (Y_test) against the predictions (Y_pred).
confusion_matrix(Y_test, Y_pred)

array([[987, 532],
       [501, 938]], dtype=int64)

In [18]:
# Number of evaluated 2018 matches.
Y_test.shape

(2958,)

In [19]:
# Accuracy on the 2018 matches: share of predictions equal to the true label.
# Computed from Y_test / Y_pred directly instead of from numbers copied by hand
# out of the confusion matrix, so it stays correct if the data or model change.
float((Y_test == Y_pred).mean())

0.6507775524002705

Test on 2019 data (the model is retrained on the combined 2014–2018 data first):

In [23]:
# Load the 2019 matches, used below as the evaluation set.
test_df_2019 = pd.read_csv('input/balanced-2019.csv')

In [24]:
# Count the missing values in each column of the 2019 frame.
test_df_2019.isna().sum()

match_id                0
date                    0
player1_id              0
player2_id              0
player1_rank           13
player2_rank           15
player1_rank_points    13
player2_rank_points    15
player1_elo_rating     16
player2_elo_rating     16
won                     0
dtype: int64

In [25]:
# Keep only the 2019 matches in which both players have an Elo rating.
elo_columns = ['player1_elo_rating', 'player2_elo_rating']
test_df_2019 = test_df_2019.dropna(subset=elo_columns)

In [26]:
# Discard identifiers, the date and the ranking columns from the 2019 frame.
unused_columns = [
    'match_id', 'date', 'player1_id', 'player2_id',
    'player1_rank_points', 'player2_rank_points',
    'player1_rank', 'player2_rank',
]
test_df_2019 = test_df_2019.drop(columns=unused_columns)

In [27]:
# Build the single model feature: player 1's Elo rating minus player 2's.
# The column must be called `dif_elo_rating`, the same name used for the
# training frames, so that the fitted model receives a feature with the name
# it was trained on (a differently named column is a feature-name mismatch
# at prediction time).
test_df_2019['dif_elo_rating'] = test_df_2019['player1_elo_rating'] - test_df_2019['player2_elo_rating']
test_df_2019 = test_df_2019.drop(['player1_elo_rating', 'player2_elo_rating'], axis=1)

In [28]:
# Stack the 2014-2017 and 2018 frames into one training set.
# NOTE(review): no ignore_index is passed, so the row labels of both input
# frames are kept as they are in the result.
train_df_2014_2018 = pd.concat([train_df, test_df])

In [29]:
# Check the class balance of the combined training set.
train_df_2014_2018['won'].value_counts()

0    7311
1    7233
Name: won, dtype: int64

In [30]:
# Separate the feature column(s) from the `won` label for the combined
# 2014-2018 training set and for the 2019 evaluation set.
target = 'won'
X_train_2014_2018, Y_train_2014_2018 = (
    train_df_2014_2018.drop(columns=target),
    train_df_2014_2018[target],
)
X_test_2019, Y_test_2019 = (
    test_df_2019.drop(columns=target).copy(),
    test_df_2019[target],
)

In [31]:
# Logistic regression refitted on 2014-2018, then used to predict the 2019 outcomes.
# This rebinds `logreg`, replacing the model that was fitted on 2014-2017.
logreg = LogisticRegression().fit(X_train_2014_2018, Y_train_2014_2018)
Y_pred_2019 = logreg.predict(X_test_2019)

In [32]:
# Per-class precision, recall and F1 of the predictions on the 2019 matches.
print(classification_report(Y_test_2019, Y_pred_2019))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1519
           1       0.64      0.64      0.64      1439

    accuracy                           0.65      2958
   macro avg       0.65      0.65      0.65      2958
weighted avg       0.65      0.65      0.65      2958



In [35]:
# Log loss of the predicted class probabilities on the 2019 matches.
proba_2019 = logreg.predict_proba(X_test_2019)
log_loss(Y_test_2019, proba_2019)

0.6254623580975438