In [216]:
import pandas as pd
import os
import numpy as np

In [217]:
# Load the 2008-2017 feature table; it forms the first part of the training
# data and is concatenated with the 2018 table a few cells below.
df1 = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [218]:
# Load the 2018 feature table (second part of the training data).
df2 = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv'))

In [219]:
# Stack both season ranges into one training frame. Each CSV was read with its
# own default 0..n-1 row index, so rebuild the index here; otherwise the
# combined frame carries duplicate row labels.
df = pd.concat([df1, df2], ignore_index=True)

In [220]:
# Show every column of the combined frame before it is trimmed in the next cell.
df.columns.values

array(['p_number_of_matches_used', 'direct', 'o_number_of_matches_used',
       'player_id', 'opponent_id', 'tournament_name', 'date', 'p_matches',
       'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff', 'tpw_diff',
       'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [221]:
# Remove the columns that are not used for modelling (match counts, player and
# opponent ids, tournament name, date).
# NOTE: this rebinds `df`, so re-running this cell on its own raises KeyError
# because the columns are already gone.
df = df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [222]:
# Count missing values per remaining column.
df.isnull().sum()

direct                0
p_matches             0
fs_diff               0
w1sp_diff             0
w2sp_diff             0
wrp_diff              0
tpw_diff              0
tmw_diff              0
aces_per_game_diff    0
df_per_game_diff      0
bp_won_diff           0
wsp_diff              0
completeness_diff     0
serve_adv_diff        0
elo_rating_diff       0
dtype: int64

In [223]:
# Separate the target column ('p_matches') from the predictors.
Y_train = df['p_matches']
X_train = df.drop(columns=['p_matches']).copy()

In [224]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with a logistic-regression estimator: drop one
# feature per round until a single one remains, so that ranking_ orders every
# feature (rank 1 = the feature that was kept).
estimator = LogisticRegression(max_iter=200)
selector = RFE(estimator=estimator, n_features_to_select=1, step=1).fit(X_train, Y_train)

# Boolean mask over X_train's columns marking the surviving feature.
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True])

In [225]:
# One row per predictor, indexed and sorted by its RFE rank (1 = kept longest).
# The column holds feature names, so it is labelled 'feature' and the index is
# named 'ranking'; labelling the name column 'ranking' made the table misleading.
pd.DataFrame(
    {'feature': X_train.columns.values},
    index=pd.Index(selector.ranking_, name='ranking'),
).sort_index()

Unnamed: 0,ranking
1,elo_rating_diff
2,serve_adv_diff
3,wsp_diff
4,wrp_diff
5,completeness_diff
6,tpw_diff
7,aces_per_game_diff
8,direct
9,w2sp_diff
10,bp_won_diff


In [226]:
# Load the 2019 feature table; it is used below as the hold-out test set.
test_df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2019.csv'))

In [227]:
# Number of rows in the 2019 test table.
len(test_df)

2564

In [228]:
# Remove the same non-modelling columns that were dropped from the training
# frame (match counts, ids, tournament name, date).
# NOTE: this rebinds `test_df`, so re-running this cell on its own raises
# KeyError because the columns are already gone.
test_df = test_df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [229]:
# Columns left in the test frame; compare with X_train's columns in the next cell.
test_df.columns.values

array(['direct', 'p_matches', 'fs_diff', 'w1sp_diff', 'w2sp_diff',
       'wrp_diff', 'tpw_diff', 'tmw_diff', 'aces_per_game_diff',
       'df_per_game_diff', 'bp_won_diff', 'wsp_diff', 'completeness_diff',
       'serve_adv_diff', 'elo_rating_diff'], dtype=object)

In [230]:
# Training predictors, shown for comparison with the test frame's columns: the
# stored outputs list the same names in the same order, minus the 'p_matches' target.
X_train.columns.values

array(['direct', 'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff',
       'tpw_diff', 'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [231]:
# Select the test predictors by name, in exactly the column order the model is
# trained on, rather than relying on the 2019 CSV happening to share the
# training layout. A missing column raises KeyError here instead of silently
# misaligning features at prediction time.
X_test = test_df[list(X_train.columns)].copy()
Y_test = test_df['p_matches']

In [232]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on all training predictors, then predict class
# labels for the 2019 test rows.
logreg = LogisticRegression(max_iter=200).fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

In [233]:
from sklearn.metrics import classification_report, log_loss, confusion_matrix

In [234]:
# Per-class precision / recall / F1 on the test set. classification_report
# returns a plain string, so print() is needed to render its line breaks.
# NOTE(review): the stored output shows 0.50 accuracy, i.e. chance level for
# this roughly balanced two-class target.
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.49      0.52      0.50      1267
           1       0.50      0.48      0.49      1297

    accuracy                           0.50      2564
   macro avg       0.50      0.50      0.50      2564
weighted avg       0.50      0.50      0.50      2564



In [235]:
# Confusion matrix for the test set: rows are true classes and columns are
# predicted classes, in label order 0, 1.
confusion_matrix(Y_test, Y_pred)

array([[656, 611],
       [679, 618]], dtype=int64)

In [236]:
# Peek at the first rows of the test frame.
test_df.head()

Unnamed: 0,direct,p_matches,fs_diff,w1sp_diff,w2sp_diff,wrp_diff,tpw_diff,tmw_diff,aces_per_game_diff,df_per_game_diff,bp_won_diff,wsp_diff,completeness_diff,serve_adv_diff,elo_rating_diff
0,0.0,1,1.007344,-0.660907,0.83902,0.089247,0.102672,-1.597219,-0.365864,0.868355,0.483959,-0.008189,0.02873,0.031951,0.269262
1,1.0,0,-1.515746,-0.027084,-0.401122,0.767656,0.26977,-1.493985,-0.124504,0.70612,-0.224898,-0.491438,-0.194762,-0.152926,0.00528
2,0.0,1,-0.454251,-0.337295,-0.295331,1.223649,0.528999,-0.102807,-0.338221,0.071793,-1.266031,-0.480624,0.028192,0.063823,0.549083
3,1.0,0,1.8981,-1.1842,1.913698,0.759803,0.635209,-1.797654,-0.472288,-1.784993,0.305178,-0.0303,0.26753,0.312066,-0.137271
4,0.0,0,0.841569,1.560695,-3.17639,1.127302,0.942312,-4.706949,0.482186,0.279313,-1.407839,0.075721,0.557361,0.585617,1.356868


In [237]:
# Summary statistics of the test frame. The stored output shows a standard
# deviation of 1.0 for every *_diff column.
# NOTE(review): worth confirming that the 2019 file was scaled with the same
# parameters as the training files - TODO verify against the preprocessing step.
test_df.describe()

Unnamed: 0,direct,p_matches,fs_diff,w1sp_diff,w2sp_diff,wrp_diff,tpw_diff,tmw_diff,aces_per_game_diff,df_per_game_diff,bp_won_diff,wsp_diff,completeness_diff,serve_adv_diff,elo_rating_diff
count,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0
mean,0.00457,0.50585,0.032167,-0.010342,-0.011653,0.001536,-0.001665,-0.008921,-0.007736,-0.001761,0.001804,-0.003701,-0.004265,-0.003068,-0.022885
std,0.576104,0.500063,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.0,0.0,-3.176804,-4.697876,-3.679002,-4.250947,-5.916746,-5.99218,-4.147274,-4.532387,-5.535592,-5.172253,-6.011864,-6.047336,-4.324026
25%,0.0,0.0,-0.640104,-0.640202,-0.635431,-0.641436,-0.559539,-0.406719,-0.579194,-0.662591,-0.534878,-0.614379,-0.593713,-0.592684,-0.638837
50%,0.0,1.0,0.036107,-0.039413,-0.034047,-0.003016,-0.006582,0.001468,-0.010109,-0.008688,0.004841,-0.023473,-0.021192,-0.019216,-0.031678
75%,0.0,1.0,0.67103,0.64199,0.6165,0.589151,0.538373,0.39476,0.551729,0.666238,0.530858,0.598914,0.594647,0.585323,0.601879
max,1.0,1.0,4.191206,5.881981,5.016133,4.629603,8.516128,5.261427,4.159611,4.154253,9.949665,5.000085,6.628267,7.169356,3.753825


In [238]:
# Fitted coefficient for each predictor. coef_ is 2-D; row 0 is the single
# coefficient vector of this two-class model.
pd.DataFrame({
    'variable': X_train.columns.to_numpy(),
    'value': logreg.coef_[0],
})

Unnamed: 0,variable,value
0,direct,0.047756
1,fs_diff,0.03671
2,w1sp_diff,0.069094
3,w2sp_diff,0.084865
4,wrp_diff,0.522031
5,tpw_diff,0.224965
6,tmw_diff,-0.013279
7,aces_per_game_diff,0.0988
8,df_per_game_diff,0.026915
9,bp_won_diff,-0.034164


In [239]:
# Class probabilities for the test rows, scored with log loss (lower is better).
# NOTE(review): the stored result (~0.80) is above ln(2) ~= 0.693, the log loss
# of always predicting 0.5, which is consistent with the chance-level accuracy
# reported earlier in this notebook.
Y_pred_proba = logreg.predict_proba(X_test)
log_loss(Y_test, Y_pred_proba)

0.8015076392283041