In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# dataset whose categorical features are already encoded
final_data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Shakhtery/data-mining/main/Data/final_data.csv',
    index_col=0,
)

# same source, categorical features left as plain values
final_data_raw = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Shakhtery/data-mining/main/Data/final_data_raw.csv',
    index_col=0,
)

### current code

In [7]:
# outcome columns for the first and the second player
# (train_test_split uses targets[0] as the label)
targets = ['player_1_won', 'player_2_won']

# columns that must not be used as features: the targets themselves, the raw
# date (already encoded in the dataset) and the odds columns
excluded = set(targets + ['date', 'maxw', 'maxl', 'avgw', 'avgl'])

# keep the dataset's own column order: a plain set difference yields an
# arbitrary order that can change between runs (string hashing is randomized
# per process), which makes the feature matrix non-reproducible
predictors = [column for column in final_data.columns if column not in excluded]

In [8]:
from datetime import datetime

# convert date from str to datetime; pd.to_datetime is vectorized and also
# accepts a column that is already datetime, so re-running this cell is safe
final_data['date'] = pd.to_datetime(final_data['date'], format="%Y-%m-%d")

In [10]:
# order the rows chronologically (in place) and renumber the index from 0
final_data.sort_values(by='date', ignore_index=True, inplace=True)

In [18]:
# train-test split
def train_test_split(train_end_year):
    """Split the global ``final_data`` frame by calendar year.

    Rows dated in ``train_end_year`` or earlier form the training part, rows
    dated in a later year form the test part. Features are the columns listed
    in the global ``predictors``; the label is the column ``targets[0]``.

    Returns the tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    years = final_data['date'].dt.year
    # two independent comparisons: a row without a valid year matches neither
    fit_rows = final_data[years <= train_end_year]
    eval_rows = final_data[years > train_end_year]
    label = targets[0]

    X_train, Y_train = fit_rows[predictors], fit_rows[label]
    X_test, Y_test = eval_rows[predictors], eval_rows[label]
    return X_train, Y_train, X_test, Y_test

In [17]:
# testing function
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from tqdm import tqdm

def model_score(model, metric, train_end_years=range(2000, 2020)):
    """Average a classification metric over a series of year-based splits.

    For every year in ``train_end_years`` the data is split with
    ``train_test_split(year)``, ``model`` is refit on the training part and
    its predictions on the test part are scored.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        The same object is refit on every split.
    metric : {'accuracy', 'precision', 'recall', 'f1'}
        Which score to compute.
    train_end_years : iterable of int, optional
        Last training year of each split; defaults to 2000..2019.

    Returns
    -------
    ``np.mean`` over all collected scores. Precision, recall and f1 are
    computed with ``average=None`` (one value per class), so their mean is
    taken over classes as well as over splits.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names. The check happens
        before any model is fit, so a typo fails immediately.
    """
    scorers = {
        'accuracy': accuracy_score,
        'precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average=None),
        'recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average=None),
        'f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average=None),
    }
    if metric not in scorers:
        raise ValueError(
            "unknown metric {!r}; expected one of {}".format(metric, sorted(scorers))
        )
    scorer = scorers[metric]

    scores = []
    for train_end_year in tqdm(train_end_years):
        X_train, Y_train, X_test, Y_test = train_test_split(train_end_year)
        model.fit(X_train, Y_train)
        scores.append(scorer(Y_test, model.predict(X_test)))
    return np.mean(scores)

### old code

In [105]:
# feature columns of the raw dataset: everything except the two outcome
# columns and the odds columns. The dataset's column order is kept, because a
# plain set difference yields an arbitrary order that can change between runs.
raw_predictors = [
    column for column in final_data_raw.columns
    if column not in {'player_1_won', 'player_2_won', 'maxw', 'maxl', 'avgw', 'avgl'}
]

# label column (outcome of the first player), kept as a one-element list
target = ['player_1_won']

In [47]:
from datetime import datetime

# convert date from str to datetime; pd.to_datetime is vectorized and also
# accepts a column that is already datetime, so re-running this cell is safe
final_data_raw['date'] = pd.to_datetime(final_data_raw['date'], format="%Y-%m-%d")

In [48]:
# order the rows chronologically (in place) and renumber the index from 0
final_data_raw.sort_values(by='date', ignore_index=True, inplace=True)

In [68]:
# train-test split
#def train_test_split(X: pd.DataFrame, y: pd.Series, X_date: pd.Series, train_end_year: int):
def train_test_split(X: pd.DataFrame, y: pd.Series, train_end_year: int):
    """Split ``X`` and ``y`` by the calendar year found in ``X['date']``.

    Rows dated in ``train_end_year`` or earlier go to the training part, all
    remaining rows go to the test part.

    Returns the tuple ``(X_train, y_train, X_test, y_test)``.
    """
    is_train = X['date'].dt.year <= train_end_year
    is_test = ~is_train

    X_train, y_train = X[is_train], y[is_train]
    X_test, y_test = X[is_test], y[is_test]
    return X_train, y_train, X_test, y_test

In [None]:
!pip install feature_engine

In [102]:
from feature_engine.creation import CyclicalFeatures

# Label encoding
# Location, player_1_name/player_2_name encode
def label_encoding(df):
    """Replace ``location`` and both player-name columns by integer codes.

    The codes are pandas category codes computed from the values present in
    ``df`` itself. Both name columns are encoded against one shared category
    set, so the same name gets the same code in ``player_1_name`` and in
    ``player_2_name``. Missing values receive the code -1.

    The input frame is left untouched; an encoded copy is returned.
    """
    encoded = df.copy()
    encoded["location"] = encoded["location"].astype('category').cat.codes

    # stack both name columns so they share a single set of categories
    stacked_names = pd.concat([encoded["player_1_name"], encoded["player_2_name"]])
    name_codes = stacked_names.astype('category').cat.codes.to_numpy()

    # first half of the stacked codes belongs to player 1, second half to player 2
    n_rows = len(encoded)
    encoded["player_1_name"] = name_codes[:n_rows]
    encoded["player_2_name"] = name_codes[n_rows:]

    return encoded

# Date encode
def encode_date(df):
    """Swap the ``date`` column for cyclical encodings of year, month and day.

    The helper columns ``year``, ``month`` and ``day`` are written onto the
    frame that was passed in. The returned frame is a new object holding the
    remaining columns plus whatever ``CyclicalFeatures`` produces for the
    three parts; ``date`` and the helper columns are dropped from it.
    """
    calendar_parts = ("year", "month", "day")
    for part in calendar_parts:
        df[part] = df["date"].apply(lambda stamp, part=part: getattr(stamp, part))

    # the encoder is fitted on the very frame it transforms
    encoder = CyclicalFeatures(variables=None, drop_original=True)
    encoded_parts = encoder.fit_transform(df[list(calendar_parts)])

    widened = pd.concat([df, encoded_parts], axis=1)
    return widened.drop(["date", "year", "month", "day"], axis=1)

# One-hot encoding
# Court, surface, round, best_of, tourney_level, player_1_hand/player_2_hand, player_1_ioc/player_2_ioc encoding
def one_hot_encoding(df):
    """Replace the categorical match and player columns by indicator columns.

    ``court``, ``surface``, ``round``, ``best_of`` and ``tourney_level`` are
    each expanded with ``pd.get_dummies`` and then dropped. The hand and ioc
    columns of the two players are expanded against a shared set of
    categories per pair. Indicator columns are named after the raw category
    values (no prefix) and appended at the end of the frame.
    """
    match_columns = ("court", "surface", "round", "best_of", "tourney_level")
    for name in match_columns:
        indicators = pd.get_dummies(df[name])
        df = pd.concat([df, indicators], axis=1).drop(name, axis=1)

    player_pairs = (
        ("player_1_hand", "player_2_hand"),
        ("player_1_ioc", "player_2_ioc"),
    )
    for first, second in player_pairs:
        # stack both players so they are encoded against the same categories
        indicators = pd.get_dummies(pd.concat([df[first], df[second]]))
        half = len(indicators) // 2
        # NOTE(review): both halves carry identical column labels, so the
        # result contains every label of this pair twice (player 1, then player 2)
        df = pd.concat([df, indicators.iloc[:half]], axis=1).drop(first, axis=1)
        df = pd.concat([df, indicators.iloc[half:]], axis=1).drop(second, axis=1)

    return df


# Standard Scaling
# player_1_rank/player_2_rank, player_1_ht/player_2_ht, player_1_age/player_2_age scale
def std_scale(series, mean, std):
    """Standardize ``series``: subtract ``mean`` and divide by ``std``."""
    return (series - mean) / std


def scale(df):
    """Standardize the rank, height and age columns of both players.

    For each feature the two player columns are pooled, and the pooled mean
    and standard deviation (pandas' default ``Series.std``) are applied to
    both columns, so both players end up on the same scale. The statistics
    come from ``df`` itself.

    The input frame is left untouched; a scaled copy is returned.
    """
    column_pairs = (
        ("player_1_rank", "player_2_rank"),
        ("player_1_ht", "player_2_ht"),
        ("player_1_age", "player_2_age"),
    )

    scaled = df.copy()
    for first, second in column_pairs:
        pooled = pd.concat([scaled[first], scaled[second]])
        mean, std = pooled.mean(), pooled.std()
        scaled[first] = std_scale(scaled[first], mean, std)
        scaled[second] = std_scale(scaled[second], mean, std)

    return scaled

def column_names_to_str(df):
    """Cast every column label of ``df`` to ``str`` in place and return ``df``."""
    text_labels = df.columns.astype(str)
    df.columns = text_labels
    return df

def preprocess_data(data):
    """Run the full preprocessing chain on ``data`` and return the result.

    The steps are applied in this order, each one receiving the output of the
    previous one: label encoding, date encoding, one-hot encoding, scaling,
    and conversion of the column labels to strings.
    """
    steps = (
        label_encoding,
        encode_date,
        one_hot_encoding,
        scale,
        column_names_to_str,
    )

    frame = data
    for step in steps:
        frame = step(frame)
    return frame

In [97]:
# testing function
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from tqdm import tqdm

def model_score(model, metric, train_end_years=range(2000, 2020)):
    """Average a classification metric over a series of year-based splits.

    For every year in ``train_end_years`` the raw dataset
    (``final_data_raw[raw_predictors]`` / ``final_data_raw[target]``) is split
    with ``train_test_split``, both feature parts are run through
    ``preprocess_data``, ``model`` is refit on the training part and its
    predictions on the test part are scored.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        The same object is refit on every split.
    metric : {'accuracy', 'precision', 'recall', 'f1'}
        Which score to compute.
    train_end_years : iterable of int, optional
        Last training year of each split; defaults to 2000..2019.

    Returns
    -------
    ``np.mean`` over all collected scores. Precision, recall and f1 are
    computed with ``average=None`` (one value per class), so their mean is
    taken over classes as well as over splits.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names. The check happens
        before any model is fit, so a typo fails immediately.
    """
    scorers = {
        'accuracy': accuracy_score,
        'precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average=None),
        'recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average=None),
        'f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average=None),
    }
    if metric not in scorers:
        raise ValueError(
            "unknown metric {!r}; expected one of {}".format(metric, sorted(scorers))
        )
    scorer = scorers[metric]

    scores = []
    for train_end_year in tqdm(train_end_years):
        X_train, y_train, X_test, y_test = train_test_split(
            final_data_raw[raw_predictors],
            final_data_raw[target],
            train_end_year
        )
        # NOTE(review): train and test are preprocessed independently, so
        # category codes, dummy columns and scaling statistics are derived
        # from each part on its own — confirm this is intended
        X_train, X_test = preprocess_data(X_train), preprocess_data(X_test)
        model.fit(X_train, y_train)

        scores.append(scorer(y_test, model.predict(X_test)))

    return np.mean(scores)

# Modeling

## Select Modeling Technique

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [19]:
# accuracy of a liblinear logistic regression, averaged by model_score
print('\n', model_score(model=LogisticRegression(solver='liblinear'), metric='accuracy'))

100%|██████████| 20/20 [00:30<00:00,  1.53s/it]


 0.619267644898964





In [21]:
# precision of a liblinear logistic regression, averaged by model_score
print('\n', model_score(model=LogisticRegression(solver='liblinear'), metric='precision'))

100%|██████████| 20/20 [00:31<00:00,  1.57s/it]


 0.6195024685930328





In [23]:
# recall of a liblinear logistic regression, averaged by model_score
print('\n', model_score(model=LogisticRegression(solver='liblinear'), metric='recall'))

100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


 0.6193808913264073





In [24]:
# f1 score of a liblinear logistic regression, averaged by model_score
print('\n', model_score(model=LogisticRegression(solver='liblinear'), metric='f1'))

100%|██████████| 20/20 [00:21<00:00,  1.08s/it]


 0.6191743298257294



