In [48]:
import pandas as pd
import os
import numpy as np

In [49]:
# Load the 2008-2017 feature table from ../input; later cells split it into
# X_train / Y_train.
df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [50]:
# Show every column name of the freshly loaded frame as a NumPy array.
df.columns.values

array(['p_number_of_matches_used', 'direct', 'o_number_of_matches_used',
       'player_id', 'opponent_id', 'tournament_name', 'date', 'p_matches',
       'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff', 'tpw_diff',
       'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [51]:
# Remove the match-count, id, tournament and date columns, leaving `p_matches`,
# `direct` and the `*_diff` columns.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises KeyError because the labels are already gone.
df = df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [52]:
# Separate the target column (`p_matches`) from the feature matrix
# (every other remaining column).
Y_train = df['p_matches']
X_train = df.drop(columns=['p_matches']).copy()

In [53]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination around a default logistic regression.
# Asking for a single surviving feature with step=1 means features are
# eliminated one per iteration, so `selector.ranking_` gives each feature
# its own rank.
estimator = LogisticRegression()
selector = RFE(estimator=estimator, n_features_to_select=1, step=1).fit(X_train, Y_train)

# Boolean mask over X_train's columns: True only for the retained feature.
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True])

In [54]:
# Tabulate the RFE result ordered by rank.
# NOTE: the column labelled 'ranking' actually holds the feature *names*;
# the rank itself is the frame's index (1 = the single feature RFE kept).
pd.DataFrame(
    data={'ranking': X_train.columns.values},
    index=selector.ranking_,
).sort_index()

Unnamed: 0,ranking
1,elo_rating_diff
2,completeness_diff
3,serve_adv_diff
4,tpw_diff
5,wsp_diff
6,wrp_diff
7,aces_per_game_diff
8,w1sp_diff
9,direct
10,bp_won_diff


In [55]:
# Load the 2018 feature table from ../input; later cells split it into
# X_test / Y_test.
test_df = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv'))

In [56]:
# Number of rows in the 2018 frame.
len(test_df)

2809

In [57]:
# Drop the same match-count, id, tournament and date columns that were removed
# from the 2008-2017 frame, so both frames expose the same feature columns.
# NOTE: this rebinds `test_df`; re-running without reloading raises KeyError.
test_df = test_df.drop(
    columns=[
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
    ]
)

In [58]:
# Show the columns left in the 2018 frame (still includes the `p_matches` target).
test_df.columns.values

array(['direct', 'p_matches', 'fs_diff', 'w1sp_diff', 'w2sp_diff',
       'wrp_diff', 'tpw_diff', 'tmw_diff', 'aces_per_game_diff',
       'df_per_game_diff', 'bp_won_diff', 'wsp_diff', 'completeness_diff',
       'serve_adv_diff', 'elo_rating_diff'], dtype=object)

In [59]:
# Show the training feature columns, for visual comparison with the 2018 columns.
X_train.columns.values

array(['direct', 'fs_diff', 'w1sp_diff', 'w2sp_diff', 'wrp_diff',
       'tpw_diff', 'tmw_diff', 'aces_per_game_diff', 'df_per_game_diff',
       'bp_won_diff', 'wsp_diff', 'completeness_diff', 'serve_adv_diff',
       'elo_rating_diff'], dtype=object)

In [60]:
# Separate the 2018 target column (`p_matches`) from its feature matrix.
Y_test = test_df['p_matches']
X_test = test_df.drop(columns=['p_matches']).copy()

In [61]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on every feature column of the
# 2008-2017 data (`fit` returns the estimator itself) ...
logreg = LogisticRegression().fit(X_train, Y_train)

# ... and predict hard class labels for the 2018 rows.
Y_pred = logreg.predict(X_test)

In [62]:
from sklearn.metrics import classification_report, log_loss, confusion_matrix

In [63]:
# Per-class precision / recall / F1 of the full-feature model on the 2018 rows.
# print() is needed so the report's line breaks render as a table.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1441
           1       0.63      0.64      0.64      1368

    accuracy                           0.64      2809
   macro avg       0.64      0.64      0.64      2809
weighted avg       0.64      0.64      0.64      2809



In [64]:
# Class-probability estimates for the 2018 rows, scored with log loss
# (lower is better).
Y_pred_proba = logreg.predict_proba(X_test)
log_loss(y_true=Y_test, y_pred=Y_pred_proba)

0.636445120525647

In [65]:
# Confusion matrix of the full-feature model: rows are true labels,
# columns are predicted labels.
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

array([[929, 512],
       [492, 876]], dtype=int64)

Logistic regression on the 6 top-ranked RFE features:

In [77]:
# Reload the 2008-2017 feature table into a separate frame for the
# reduced-feature experiment.
df_6 = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2008-2017.csv'))

In [78]:
# Keep only `p_matches` and six features: wrp_diff, tpw_diff, wsp_diff,
# completeness_diff, serve_adv_diff and elo_rating_diff. These are the
# features the RFE table above ranks 1-6.
# NOTE: this rebinds `df_6`; re-running without reloading raises KeyError.
df_6 = df_6.drop(
    columns=[
        # match-count, id, tournament and date columns
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
        # features excluded from the reduced model
        'w2sp_diff',
        'fs_diff',
        'df_per_game_diff',
        'bp_won_diff',
        'direct',
        'tmw_diff',
        'w1sp_diff',
        'aces_per_game_diff',
    ]
)

In [79]:
# Target and feature matrix for the reduced (six-feature) training frame.
Y_train_6 = df_6['p_matches']
X_train_6 = df_6.drop(columns=['p_matches']).copy()

In [80]:
# Reload the 2018 feature table into a separate frame for the
# reduced-feature experiment.
test_df_6 = pd.read_csv(os.path.abspath('../input/standardized-diff-cleaned-featured-2018.csv'))

In [81]:
# Apply to the 2018 frame the same column removal used for `df_6`, so the
# reduced train and test frames share the same six feature columns.
# NOTE: this rebinds `test_df_6`; re-running without reloading raises KeyError.
test_df_6 = test_df_6.drop(
    columns=[
        # match-count, id, tournament and date columns
        'p_number_of_matches_used',
        'o_number_of_matches_used',
        'player_id',
        'opponent_id',
        'tournament_name',
        'date',
        # features excluded from the reduced model
        'w2sp_diff',
        'fs_diff',
        'df_per_game_diff',
        'bp_won_diff',
        'direct',
        'tmw_diff',
        'w1sp_diff',
        'aces_per_game_diff',
    ]
)

In [84]:
# Target and feature matrix for the reduced (six-feature) 2018 frame.
Y_test_6 = test_df_6['p_matches']
X_test_6 = test_df_6.drop(columns=['p_matches']).copy()

In [85]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the six retained features.
# NOTE: this rebinds `logreg`, replacing the model fitted on all features
# earlier in the notebook; cells that use `logreg` with X_test must not be
# re-run after this point without refitting.
logreg = LogisticRegression().fit(X_train_6, Y_train_6)

# Hard class predictions for the reduced 2018 feature matrix.
Y_pred_6 = logreg.predict(X_test_6)

In [86]:
# Per-class precision / recall / F1 of the six-feature model on the 2018 rows.
# print() is needed so the report's line breaks render as a table.
print(classification_report(y_true=Y_test_6, y_pred=Y_pred_6))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1441
           1       0.63      0.64      0.64      1368

    accuracy                           0.64      2809
   macro avg       0.64      0.64      0.64      2809
weighted avg       0.64      0.64      0.64      2809



In [88]:
# Class-probability estimates from the six-feature model, scored with log loss
# (lower is better). `logreg` here is the model refit on X_train_6.
Y_pred_proba_6 = logreg.predict_proba(X_test_6)
log_loss(y_true=Y_test_6, y_pred=Y_pred_proba_6)

0.6354040786025793

In [89]:
# Confusion matrix of the six-feature model: rows are true labels,
# columns are predicted labels.
confusion_matrix(y_true=Y_test_6, y_pred=Y_pred_6)

array([[927, 514],
       [492, 876]], dtype=int64)