In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
# Load the cleaned matches file and discard the 'Unnamed: 0' column that
# the CSV carries alongside the real data columns.
df = (
    pd.read_csv('../Resources/matches_cleaned.csv')
    .drop(columns=['Unnamed: 0'])
)
df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_name,...,player_bpSaved,player_bpFaced,player_rank,player_rank_points,set_1_score,set_2_score,set_3_score,set_4_score,set_5_score,winner
0,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209098.0,6.0,Hamad Medjedovic,...,4.0,4.0,110.0,582.0,3.0,4.0,4.0,3.0,4.0,1
1,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209950.0,1.0,Arthur Fils,...,1.0,4.0,36.0,1158.0,4.0,1.0,2.0,4.0,1.0,0
2,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209950.0,1.0,Arthur Fils,...,2.0,3.0,36.0,1158.0,2.0,4.0,4.0,4.0,,1
3,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,299,209414.0,2.0,Luca Van Assche,...,2.0,3.0,70.0,756.0,4.0,1.0,3.0,3.0,,0
4,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,298,209098.0,6.0,Hamad Medjedovic,...,0.0,0.0,110.0,582.0,4.0,2.0,,,,1


In [17]:
# summarize every column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76434 entries, 0 to 76433
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          76434 non-null  object 
 1   tourney_name        76434 non-null  object 
 2   surface             76434 non-null  object 
 3   draw_size           76434 non-null  int64  
 4   tourney_level       76434 non-null  object 
 5   tourney_date        76434 non-null  object 
 6   match_num           76434 non-null  int64  
 7   player_id           76434 non-null  float64
 8   player_seed         27936 non-null  float64
 9   player_name         76434 non-null  object 
 10  player_hand         76434 non-null  object 
 11  player_ht           75798 non-null  float64
 12  player_ioc          76434 non-null  object 
 13  player_age          76432 non-null  float64
 14  score               76434 non-null  object 
 15  best_of             76434 non-null  int64  
 16  roun

In [18]:
# drop unwanted columns
columns_to_drop = [
    'tourney_id',
    'tourney_date',
    'tourney_name',
    'match_num',
    'player_seed',
    'score',
    'set_3_score',
    'set_4_score',
    'set_5_score',
]
# Reassign instead of mutating in place, and tolerate already-removed
# columns, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [19]:
# check columns: list the columns now in df with their dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76434 entries, 0 to 76433
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   surface             76434 non-null  object 
 1   draw_size           76434 non-null  int64  
 2   tourney_level       76434 non-null  object 
 3   player_id           76434 non-null  float64
 4   player_name         76434 non-null  object 
 5   player_hand         76434 non-null  object 
 6   player_ht           75798 non-null  float64
 7   player_ioc          76434 non-null  object 
 8   player_age          76432 non-null  float64
 9   best_of             76434 non-null  int64  
 10  round               76434 non-null  object 
 11  minutes             73082 non-null  float64
 12  player_ace          75788 non-null  float64
 13  player_df           75788 non-null  float64
 14  player_svpt         75788 non-null  float64
 15  player_1stIn        75788 non-null  float64
 16  play

In [20]:
# name of the label column(s); everything else in df is treated as a feature
targets = ['winner']

# features set: every column except the label (drop returns a new frame,
# so df itself is left untouched)
X = df.drop(columns=targets)

# target vector
y = df['winner']

In [25]:
# Column groups consumed by the preprocessing step: categorical columns are
# one-hot encoded, numeric columns are imputed and standardized.
# NOTE(review): player_id and player_name presumably identify the same
# player -- confirm whether both need to be encoded.
cat_features = [
    'surface',
    'draw_size',
    'tourney_level',
    'player_id',
    'player_name',
    'player_hand',
    'player_ioc',
    'best_of',
    'round',
]
# NOTE(review): the serve statistics, minutes and set scores look like
# in-match data -- confirm they are meant to be available to the model.
num_features = [
    'player_ht',
    'player_age',
    'minutes',
    'player_ace',
    'player_df',
    'player_svpt',
    'player_1stIn',
    'player_1stWon',
    'player_2ndWon',
    'player_SvGms',
    'player_bpSaved',
    'player_bpFaced',
    'player_rank',
    'player_rank_points',
    'set_1_score',
    'set_2_score',
]

In [26]:
# imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch (numeric first, then
# categorical).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# Full model: preprocessing followed by a seeded random forest classifier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0)),
])

In [27]:
# Hold out a quarter of the rows for testing (train_test_split's default
# proportion, written out explicitly); the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [28]:
# Fit the preprocessing + classifier pipeline on the training split.
# fit() returns the pipeline itself, so no rebinding of `model` is needed.
model.fit(X_train, y_train)

# Predict labels for the held-out test rows.
predictions = model.predict(X_test)

In [29]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=predictions))

Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      9547
           1       0.86      0.82      0.84      9562

    accuracy                           0.84     19109
   macro avg       0.84      0.84      0.84     19109
weighted avg       0.84      0.84      0.84     19109

