In [2]:
import pandas as pd
import os
import numpy as np

In [3]:
df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [4]:
df.columns.values

array(['p_number_of_matches_used', 'direct', 'o_number_of_matches_used',
       'player_id', 'opponent_id', 'tournament_name', 'date', 'p_matches',
       'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff', 'tpw_diff',
       'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff'],
      dtype=object)

Drop unused features for prediction :

In [5]:
df = df.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date'], axis=1)

In [6]:
X_train = df.drop(['p_matches'], axis=1).copy()
Y_train = df['p_matches']

In [7]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X_train, Y_train)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True])

In [8]:
selector.ranking_

array([ 9, 12,  7, 13,  4,  8,  5,  6, 11, 10,  3,  2,  1])

In [9]:
X_train.columns.values

array(['direct', 'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff',
       'tpw_diff', 'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff'],
      dtype=object)

In [10]:
pd.DataFrame({'ranking': X_train.columns.values},index=selector.ranking_).sort_index()

Unnamed: 0,ranking
1,serve_adv_diff
2,completeness_diff
3,wsp_diff
4,wrp_diff
5,tmw_diff
6,aces_per_game_diff
7,w1sp_diff
8,tpw_diff
9,direct
10,bp_won_diff


import test dataset to evaluate with logistic loss :

In [11]:
test_df = pd.read_csv(os.path.abspath('../input/balanced-standardized-diff-cleaned-featured-2018.csv'))

In [12]:
test_df = test_df.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date'], axis=1)

In [13]:
X_test = test_df.drop(['p_matches'], axis=1).copy()
Y_test = test_df['p_matches']

In [14]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, log_loss, confusion_matrix, matthews_corrcoef

In [16]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.53      0.56      0.55      1028
           1       0.65      0.62      0.64      1361

    accuracy                           0.60      2389
   macro avg       0.59      0.59      0.59      2389
weighted avg       0.60      0.60      0.60      2389



In [17]:
log_loss(Y_test, Y_pred)

13.893730906134502

In [18]:
matthews_corrcoef(Y_test, Y_pred)

0.1859088945803257

In [19]:
confusion_matrix(Y_test, Y_pred)

array([[579, 449],
       [512, 849]], dtype=int64)

Test with 6 features :

In [20]:
df_6 = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [21]:
df_6 = df_6.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date', 'w2sp_diff', 'fs_diff', 'df_per_game_diff', 'bp_won_diff', 'direct', 'tpw_diff', 'w1sp_diff'], axis=1)

In [22]:
X_train_6 = df_6.drop(['p_matches'], axis=1).copy()
Y_train_6 = df_6['p_matches']

In [23]:
test_df_6 = pd.read_csv(os.path.abspath('../input/balanced-standardized-diff-cleaned-featured-2018.csv'))

In [24]:
test_df_6 = test_df_6.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date', 'w2sp_diff', 'fs_diff', 'df_per_game_diff', 'bp_won_diff', 'direct', 'tpw_diff', 'w1sp_diff'], axis=1)

In [25]:
X_test_6 = test_df_6.drop(['p_matches'], axis=1).copy()
Y_test_6 = test_df_6['p_matches']

In [26]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train_6, Y_train_6)

Y_pred_6 = logreg.predict(X_test_6)

In [27]:
print(classification_report(Y_test_6, Y_pred_6))

              precision    recall  f1-score   support

           0       0.53      0.56      0.54      1028
           1       0.65      0.62      0.63      1361

    accuracy                           0.59      2389
   macro avg       0.59      0.59      0.59      2389
weighted avg       0.60      0.59      0.60      2389



In [28]:
log_loss(Y_test_6, Y_pred_6)

14.0093899314848

In [29]:
matthews_corrcoef(Y_test_6, Y_pred_6)

0.18017756661618273

In [30]:
confusion_matrix(Y_test_6, Y_pred_6)

array([[580, 448],
       [521, 840]], dtype=int64)

In [31]:
X_test_6.head()

Unnamed: 0,wrp_diff,tmw_diff,aces_per_game_diff,wsp_diff,completeness_diff,serve_adv_diff
0,0.875673,0.759928,0.544421,-0.942347,-0.594263,-0.554666
1,-0.394938,0.0,0.716568,0.887831,0.732156,0.707902
2,0.257608,-1.483145,0.090078,-0.722411,-0.613331,-0.603474
3,-0.269353,-0.395505,-0.132275,0.195354,0.090785,0.077142
4,1.264731,0.565008,-0.450563,0.02362,0.497423,0.568346


In [32]:
test_odd = X_test_6.iloc[0]

In [33]:
import math
odds = math.exp(logreg.intercept_ + logreg.coef_[0][0]*test_odd['wrp_diff'] + logreg.coef_[0][1]*test_odd['tmw_diff'] + logreg.coef_[0][2]*test_odd['aces_per_game_diff'] + logreg.coef_[0][3]*test_odd['wsp_diff'] + logreg.coef_[0][4]*test_odd['completeness_diff'] + logreg.coef_[0][5]*test_odd['serve_adv_diff'])

In [34]:
odds/(1+odds)

0.593445022034779

In [35]:
Y_pred[0]

1.0

In [36]:
Y_test[0]

0