In [30]:
import pandas as pd
import os
import numpy as np

In [31]:
# Load the 2008-2017 file; it is used below as the training set.
df = pd.read_csv(
    os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv')
)

In [32]:
# Show every column name of the freshly loaded frame.
df.columns.to_numpy()

array(['p_number_of_matches_used', 'direct', 'o_number_of_matches_used',
       'player_id', 'opponent_id', 'tournament_name', 'date', 'p_matches',
       'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff', 'tpw_diff',
       'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [33]:
# Remove the match-count, player/opponent id, tournament and date columns so
# that only `direct`, the `p_matches` target and the `*_diff` columns remain.
df = df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [34]:
# Target is the `p_matches` column; every other remaining column is a feature.
Y_train = df['p_matches']
X_train = df.drop(columns=['p_matches']).copy()

In [35]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination down to a single feature, removing one feature
# per iteration, so that `ranking_` assigns a rank to every column.
estimator = LogisticRegression()
selector = RFE(estimator=estimator, n_features_to_select=1, step=1).fit(X_train, Y_train)

# Boolean mask over X_train's columns marking the selected feature(s).
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True])

In [36]:
# One row per feature, indexed and sorted by its RFE rank (rank 1 = the feature
# RFE selected). The column holds feature names, so it is labelled `feature`;
# the rank itself lives in the index, which is labelled `rank`.
pd.DataFrame(
    {'feature': X_train.columns.values},
    index=pd.Index(selector.ranking_, name='rank'),
).sort_index()

Unnamed: 0,ranking
1,elo_rating_diff
2,completeness_diff
3,serve_adv_diff
4,tpw_diff
5,wsp_diff
6,wrp_diff
7,aces_per_game_diff
8,w1sp_diff
9,direct
10,bp_won_diff


In [37]:
# Load the 2018 file; it is used below as the held-out test set.
test_df = pd.read_csv(
    os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv')
)

In [38]:
# Number of rows in the test set.
len(test_df)

2809

In [39]:
# Remove the same match-count, id, tournament and date columns that were
# removed from the training frame.
test_df = test_df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [40]:
# Columns left in the test frame (target still included at this point).
test_df.columns.to_numpy()

array(['direct', 'p_matches', 'fs_diff', 'w1sp_diff', 'w2sp_diff',
       'wrp_diff', 'tpw_diff', 'tmw_diff', 'aces_per_game_diff',
       'df_per_game_diff', 'bp_won_diff', 'wsp_diff', 'completeness_diff',
       'serve_adv_diff', 'elo_rating_diff'], dtype=object)

In [41]:
# The model is fitted on X_train and evaluated on the test frame, so both must
# expose the same feature columns in the same order. Check that explicitly
# instead of relying on comparing two printed arrays by eye.
assert list(X_train.columns) == list(test_df.columns.drop('p_matches')), \
    'train and test feature columns differ'

X_train.columns.values

array(['direct', 'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff',
       'tpw_diff', 'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [42]:
# Same target/feature split as for the training data.
Y_test = test_df['p_matches']
X_test = test_df.drop(columns=['p_matches']).copy()

In [43]:
from sklearn.linear_model import LogisticRegression

# Fit a default-parameter logistic regression on all training features,
# then predict class labels for the test set.
logreg = LogisticRegression().fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

In [47]:
# Pair each training column with its fitted coefficient (first row of `coef_`).
pd.DataFrame(
    {
        'variable': X_train.columns.values,
        'value': logreg.coef_[0],
    }
)

Unnamed: 0,variable,value
0,direct,0.043283
1,fs_diff,0.008526
2,w1sp_diff,-0.033458
3,w2sp_diff,0.042817
4,wrp_diff,-0.319487
5,tpw_diff,0.408608
6,tmw_diff,-0.010293
7,aces_per_game_diff,0.122401
8,df_per_game_diff,0.031657
9,bp_won_diff,-0.042495


In [15]:
from sklearn.metrics import classification_report, log_loss, confusion_matrix

In [16]:
# Per-class precision / recall / F1 on the test set for the all-features model.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1441
           1       0.63      0.64      0.64      1368

    accuracy                           0.64      2809
   macro avg       0.64      0.64      0.64      2809
weighted avg       0.64      0.64      0.64      2809



In [17]:
# Log loss is computed from predicted class probabilities, not hard labels.
Y_pred_proba = logreg.predict_proba(X_test)
log_loss(y_true=Y_test, y_pred=Y_pred_proba)

0.636445120525647

In [18]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

array([[929, 512],
       [492, 876]], dtype=int64)

Logistic regression restricted to the 6 top-ranked RFE features:

In [19]:
# Reload the 2008-2017 file for the reduced-feature experiment.
df_6 = pd.read_csv(
    os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv')
)

In [20]:
# Keep only the `p_matches` target and the six features used by the reduced
# model. Selecting the kept columns by name (instead of listing the fourteen
# columns to drop) states the feature set explicitly, can be re-run without
# error, and still raises KeyError if an expected column is missing.
# The resulting columns and their order are the same as after the former drop.
df_6 = df_6[[
    'p_matches',
    'wrp_diff',
    'tpw_diff',
    'wsp_diff',
    'completeness_diff',
    'serve_adv_diff',
    'elo_rating_diff',
]]

In [21]:
# Target / feature split for the reduced training frame.
Y_train_6 = df_6['p_matches']
X_train_6 = df_6.drop(columns=['p_matches']).copy()

In [22]:
# Reload the 2018 file for the reduced-feature experiment.
test_df_6 = pd.read_csv(
    os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv')
)

In [23]:
# Keep only the `p_matches` target and the six features used by the reduced
# model. Selecting the kept columns by name (instead of listing the fourteen
# columns to drop) states the feature set explicitly, can be re-run without
# error, and still raises KeyError if an expected column is missing.
# The resulting columns and their order are the same as after the former drop.
test_df_6 = test_df_6[[
    'p_matches',
    'wrp_diff',
    'tpw_diff',
    'wsp_diff',
    'completeness_diff',
    'serve_adv_diff',
    'elo_rating_diff',
]]

In [24]:
# Target / feature split for the reduced test frame.
Y_test_6 = test_df_6['p_matches']
X_test_6 = test_df_6.drop(columns=['p_matches']).copy()

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit a default-parameter logistic regression on the reduced feature set.
# NOTE: this rebinds `logreg`, so from here on the name refers to the
# reduced-feature model rather than the one fitted on X_train.
logreg = LogisticRegression().fit(X_train_6, Y_train_6)

Y_pred_6 = logreg.predict(X_test_6)

In [26]:
# Per-class precision / recall / F1 on the test set for the reduced model.
print(classification_report(y_true=Y_test_6, y_pred=Y_pred_6))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1441
           1       0.63      0.64      0.64      1368

    accuracy                           0.64      2809
   macro avg       0.64      0.64      0.64      2809
weighted avg       0.64      0.64      0.64      2809



In [27]:
# Log loss of the reduced model, from its predicted class probabilities.
Y_pred_proba_6 = logreg.predict_proba(X_test_6)
log_loss(y_true=Y_test_6, y_pred=Y_pred_proba_6)

0.6354040786025793

In [28]:
# Rows are true classes, columns are predicted classes (reduced model).
confusion_matrix(y_true=Y_test_6, y_pred=Y_pred_6)

array([[927, 514],
       [492, 876]], dtype=int64)