In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [3]:
df.columns.values

array(['p_number_of_matches_used', 'direct', 'o_number_of_matches_used',
       'player_id', 'opponent_id', 'tournament_name', 'date', 'p_matches',
       'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff', 'tpw_diff',
       'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff'],
      dtype=object)

Drop unused features for prediction :

In [4]:
df = df.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date'], axis=1)

In [5]:
X_train = df.drop(['p_matches'], axis=1).copy()
Y_train = df['p_matches']

In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X_train, Y_train)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True])

In [7]:
selector.ranking_

array([ 9, 12,  7, 13,  4,  8,  5,  6, 11, 10,  3,  2,  1])

In [8]:
pd.DataFrame({'ranking': X_train.columns.values},index=selector.ranking_).sort_index()

Unnamed: 0,ranking
1,serve_adv_diff
2,completeness_diff
3,wsp_diff
4,wrp_diff
5,tmw_diff
6,aces_per_game_diff
7,w1sp_diff
8,tpw_diff
9,direct
10,bp_won_diff


import test dataset to evaluate with logistic loss :

In [9]:
test_df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv'))

In [10]:
test_df = test_df.drop(['p_number_of_matches_used', 'o_number_of_matches_used', 'player_id', 'opponent_id', 'tournament_name', 'date'], axis=1)

In [11]:
X_test = df.drop(['p_matches'], axis=1).copy()
Y_test = df['p_matches']

In [12]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

In [13]:
logreg.coef_

array([[ 0.11283799,  0.01706766, -0.15498612,  0.00933083, -0.75144727,
         0.1603797 ,  0.47371392,  0.2475851 ,  0.03007784, -0.0461149 ,
        -2.36620718,  5.31530005, -2.55415284]])

In [14]:
from sklearn.metrics import classification_report, log_loss, confusion_matrix

In [15]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.68      0.68      0.68     26101
         1.0       0.68      0.68      0.68     26101

    accuracy                           0.68     52202
   macro avg       0.68      0.68      0.68     52202
weighted avg       0.68      0.68      0.68     52202



In [17]:
Y_pred_proba = logreg.predict_proba(X_test)
log_loss(Y_test, Y_pred_proba)

0.5989650998664136

In [18]:
confusion_matrix(Y_test, Y_pred)

array([[17653,  8448],
       [ 8448, 17653]], dtype=int64)

In [19]:
Y_pred_proba[0]

array([0.51584608, 0.48415392])