# 🎾 LogisticRegression

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Function for simulating realistic missing values in tennis data
def create_realistic_tennis_missing_data(tennis_df, random_state=None):
    """Return a copy of ``tennis_df`` with simulated missing data.

    Three kinds of gaps are injected, in this order:

    1. ``Pts_1`` / ``Pts_2`` are blanked for roughly 30% of the rows whose
       ``Rank_1`` / ``Rank_2`` is greater than 100.
    2. ``Odd_1`` and ``Odd_2`` are blanked together for roughly 10% of rows.
    3. About 15% of the multi-set ``Score`` strings are cut down to their
       first set (70% of those cases) or their first two sets.

    Args:
        tennis_df: DataFrame with the columns ``Rank_1``, ``Rank_2``,
            ``Pts_1``, ``Pts_2``, ``Odd_1``, ``Odd_2`` and ``Score``.
        random_state: Seed for the random draws. ``None`` gives a
            non-reproducible result.

    Returns:
        A modified copy; the input frame is left untouched.
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.random() calls,
    # but does not reseed NumPy's global generator as a side effect.
    rng = np.random.RandomState(random_state)
    df = tennis_df.copy()
    n_rows = len(df)

    # 1. Points are missing more often for lower-ranked players.
    for rank_col, pts_col in (('Rank_1', 'Pts_1'), ('Rank_2', 'Pts_2')):
        low_ranked = df[rank_col] > 100
        hide = (low_ranked & (rng.random_sample(n_rows) < 0.3)).to_numpy()
        # Series.mask upcasts integer columns to float when NaN is inserted,
        # instead of relying on implicit upcasting inside df.loc assignment.
        df[pts_col] = df[pts_col].mask(hide)

    # 2. Betting odds may be absent for some matches (both sides at once).
    odds_missing = rng.random_sample(n_rows) < 0.1
    for odds_col in ('Odd_1', 'Odd_2'):
        df[odds_col] = df[odds_col].mask(odds_missing)

    # 3. The score may be incomplete: drop the trailing sets.
    def truncate_score(score, prob_missing=0.15):
        # The random draw only happens for non-missing scores (short-circuit).
        if pd.isna(score) or rng.random_sample() > prob_missing:
            return score

        sets = score.split()
        if len(sets) <= 1:
            return score

        # Keep only the first set with probability 0.7, otherwise the first two.
        n_sets = 1 if rng.random_sample() < 0.7 else 2
        return ' '.join(sets[:n_sets])

    df['Score'] = df['Score'].apply(truncate_score)

    return df

# Function for extracting features from the match score
def extract_score_features(score):
    """Turn a score string such as ``"6-4 3-6 7-6(5)"`` into six numbers.

    Args:
        score: Whitespace-separated set scores of the form ``"A-B"``, or a
            missing value (``None`` / ``NaN``).

    Returns:
        A list ``[total_sets, sets_won_1, sets_won_2, total_games_1,
        total_games_2, total_games]``. ``total_sets`` counts every
        whitespace-separated token, including ones that cannot be parsed.
        A missing score yields six zeros.
    """
    if pd.isna(score):
        return [0] * 6

    set_tokens = score.split()
    sets_won_1 = sets_won_2 = 0
    total_games_1 = total_games_2 = 0

    for token in set_tokens:
        parts = token.split('-')
        if len(parts) != 2:
            continue
        try:
            games_1 = int(parts[0])
            # Drop a trailing tiebreak annotation such as "(5)" in "7-6(5)".
            games_2 = int(parts[1].split('(', 1)[0])
        except ValueError:
            # Unparseable token (e.g. "6-"): skip it rather than abort the
            # whole feature extraction.
            continue

        total_games_1 += games_1
        total_games_2 += games_2
        if games_1 > games_2:
            sets_won_1 += 1
        elif games_2 > games_1:
            sets_won_2 += 1
        # Equal game counts (an unfinished set) are credited to neither player.

    return [len(set_tokens), sets_won_1, sets_won_2,
            total_games_1, total_games_2,
            total_games_1 + total_games_2]

# Function for creating additional comparison features
def create_comparison_features(df):
    """Build head-to-head comparison features for player 1 vs player 2.

    Args:
        df: DataFrame with the columns ``Rank_1``, ``Rank_2``, ``Pts_1``,
            ``Pts_2``, ``Odd_1`` and ``Odd_2``.

    Returns:
        A DataFrame sharing ``df``'s index with, in order: three normalised
        advantages, three 0/1 comparison flags, their average, and three
        pairwise products of the normalised advantages.
    """
    def _relative_gap(favourable, other):
        # (a - b) / (a + b): signed difference scaled by the pair's sum.
        return (df[favourable] - df[other]) / (df[favourable] + df[other])

    out = pd.DataFrame({
        # A lower rank number is better, so player 2's rank is the minuend.
        'rank_advantage': _relative_gap('Rank_2', 'Rank_1'),
        'points_advantage': _relative_gap('Pts_1', 'Pts_2'),
        # Lower odds are treated as better, same orientation as rank.
        'odds_advantage': _relative_gap('Odd_2', 'Odd_1'),
        # Direct 0/1 comparisons; a comparison involving NaN evaluates to 0.
        'better_rank': (df['Rank_1'] < df['Rank_2']).astype(int),
        'more_points': (df['Pts_1'] > df['Pts_2']).astype(int),
        'better_odds': (df['Odd_1'] < df['Odd_2']).astype(int),
    })

    # Share of the three direct comparisons that favour player 1.
    flags_total = out['better_rank'] + out['more_points'] + out['better_odds']
    out['overall_advantage'] = flags_total / 3

    # Pairwise interaction terms between the normalised advantages.
    for left, right in (('rank', 'points'), ('rank', 'odds'), ('points', 'odds')):
        out[f'{left}_{right}_interaction'] = (
            out[f'{left}_advantage'] * out[f'{right}_advantage']
        )

    return out

# Updated data-preparation function
def prepare_data(df):
    """Build the model's feature matrix and target from raw match rows.

    Args:
        df: DataFrame with the columns ``Date``, ``Rank_1``, ``Rank_2``,
            ``Pts_1``, ``Pts_2``, ``Odd_1``, ``Odd_2``, ``Score``,
            ``Winner``, ``Player_1``, ``Series``, ``Court``, ``Surface``
            and ``Round``. The input is not modified.

    Returns:
        ``(features, target)``: ``features`` holds the six raw numeric
        columns, four label-encoded categorical columns, six score features
        and ten comparison features, in that order; ``target`` is 1 where
        ``Winner`` equals ``Player_1`` and 0 otherwise. Both share ``df``'s
        index.
    """
    df = df.copy()

    # Parse the date column (the parsed values are not used as a feature,
    # but an unparseable or absent column fails early here).
    df['Date'] = pd.to_datetime(df['Date'])

    # Negative values are replaced with NaN. Series.mask upcasts integer
    # columns to float when needed instead of relying on implicit upcasting
    # inside df.loc assignment.
    numeric_columns = ['Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2']
    for col in numeric_columns:
        df[col] = df[col].mask(df[col] < 0)

    # Features derived from the score string. The frame must carry df's
    # index: with the default RangeIndex, concat(axis=1) below would
    # misalign rows whenever df has a non-default index (e.g. a filtered
    # subset), producing NaN-padded extra rows.
    score_features_df = pd.DataFrame(
        df['Score'].apply(extract_score_features).tolist(),
        columns=['total_sets', 'sets_won_1', 'sets_won_2',
                 'total_games_1', 'total_games_2', 'total_games'],
        index=df.index,
    )

    # Target: 1 if player 1 won the match.
    df['target'] = (df['Winner'] == df['Player_1']).astype(int)

    # Label-encode the categorical columns.
    # NOTE(review): the encoder is refit on every call, so the integer codes
    # depend on the categories present in this particular frame; train and
    # test frames with different category sets get different codes.
    le = LabelEncoder()
    categorical_columns = ['Series', 'Court', 'Surface', 'Round']
    for col in categorical_columns:
        df[col + '_encoded'] = le.fit_transform(df[col])

    # Head-to-head comparison features.
    comparison_features = create_comparison_features(df)

    # Assemble everything column-wise.
    features = pd.concat([
        df[numeric_columns],
        df[[col + '_encoded' for col in categorical_columns]],
        score_features_df,
        comparison_features
    ], axis=1)

    return features, df['target']

# Function for training the model with tuned parameters
def train_model(df_train):
    """Fit a logistic-regression classifier on the given training rows.

    Args:
        df_train: Raw match DataFrame accepted by ``prepare_data``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    features, target = prepare_data(df_train)

    # Fill missing values with each column's mean.
    features = features.fillna(features.mean())

    hyperparams = dict(
        C=1.0,                    # regularisation parameter
        penalty='l2',             # L2 regularisation
        solver='liblinear',       # optimisation algorithm
        max_iter=1000,            # iteration cap
        random_state=32,
        class_weight='balanced',  # balance the class weights
    )
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(features, target)

    return classifier

# The functions get_predictions, create_results_table and display_results remain unchanged
def get_predictions(model, df_test):
    """Return ``model.predict_proba`` output for the rows of ``df_test``.

    The features come from ``prepare_data``; missing values are filled with
    the column means of this same test frame before prediction.
    """
    features, _unused_target = prepare_data(df_test)
    column_means = features.mean()
    return model.predict_proba(features.fillna(column_means))

# Function for building the results table
def create_results_table(df_test, predictions):
    """Combine match data and predicted probabilities into one table.

    Args:
        df_test: Match DataFrame with player, winner, rank, points, odds
            and score columns.
        predictions: 2-D array; column 1 is used as the probability that
            player 1 wins and column 0 as the probability for player 2.

    Returns:
        A DataFrame with the match columns, both probabilities in percent,
        ``predicted_winner``, ``correct_prediction`` and ``confidence``.
    """
    prob_player_2 = predictions[:, 0]
    prob_player_1 = predictions[:, 1]
    passthrough = ('Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2', 'Score')

    table = pd.DataFrame({
        'Player_1': df_test['Player_1'],
        'Player_1_Prediction': prob_player_1 * 100,
        'Player_2': df_test['Player_2'],
        'Player_2_Prediction': prob_player_2 * 100,
        'Winner': df_test['Winner'],
        **{column: df_test[column] for column in passthrough},
    })

    # Player 1 is picked only on a strictly higher probability; a tie goes
    # to player 2.
    picks_player_1 = prob_player_1 > prob_player_2
    table['predicted_winner'] = np.where(picks_player_1,
                                         df_test['Player_1'],
                                         df_test['Player_2'])

    # Whether the predicted winner matches the actual one.
    table['correct_prediction'] = table['predicted_winner'] == df_test['Winner']

    # Confidence is the larger of the two probabilities, in percent.
    table['confidence'] = np.maximum(prob_player_2, prob_player_1) * 100

    return table

# Function for displaying the results
def display_results(results):
    """Render the results table with per-row colouring.

    Rows whose ``correct_prediction`` is truthy get a green background and
    the others a pink one. The ``correct_prediction`` column itself is
    hidden in the rendered output.
    """
    column_formats = {
        'Player_1_Prediction': '{:.2f}%',
        'Player_2_Prediction': '{:.2f}%',
        'Rank_1': '{:.0f}',
        'Rank_2': '{:.0f}',
        'Pts_1': '{:.0f}',
        'Pts_2': '{:.0f}',
        'Odd_1': '{:.2f}',
        'Odd_2': '{:.2f}',
    }

    def _shade_row(row):
        # One identical CSS declaration per cell of the row.
        if row['correct_prediction']:
            css = 'background-color: #90EE90'
        else:
            css = 'background-color: #FFB6C6'
        return [css] * len(row)

    styler = results.style.format(column_formats).apply(_shade_row, axis=1)

    display(styler.hide(axis='columns', subset=['correct_prediction']))

# Function for evaluating and printing metrics
def display_metrics(model, df_test, predictions, results):
    """Print evaluation metrics for a fitted model on a test set.

    Prints, in order: accuracy, precision, recall, F1, ROC AUC and log loss;
    the confusion-matrix cells; accuracy per confidence bin; and the ten
    largest normalised feature importances.

    Args:
        model: Fitted linear classifier exposing ``coef_``.
        df_test: Test DataFrame with ``Winner`` and ``Player_1`` columns,
            also accepted by ``prepare_data``.
        predictions: 2-D probability array; column 1 is the probability
            that player 1 wins.
        results: DataFrame with ``confidence`` (in percent) and
            ``correct_prediction`` columns.
    """
    # Ground truth: 1 if player 1 won, 0 if player 2 won.
    y_true = (df_test['Winner'] == df_test['Player_1']).astype(int)

    # Predicted probability of a player-1 win and the derived hard labels.
    y_pred_proba = predictions[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    accuracy = accuracy_score(y_true, y_pred) * 100
    precision = precision_score(y_true, y_pred) * 100
    recall = recall_score(y_true, y_pred) * 100
    f1 = f1_score(y_true, y_pred) * 100
    roc_auc = roc_auc_score(y_true, y_pred_proba) * 100
    entropy_loss = log_loss(y_true, y_pred_proba)

    # Pinning the labels guarantees a 2x2 matrix, so the four-way unpack
    # cannot fail when only one class shows up in y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print(f"\n–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:")
    print(f"Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): {accuracy:.2f}%")
    print(f"Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): {precision:.2f}%")
    print(f"Recall (–ü–æ–ª–Ω–æ—Ç–∞): {recall:.2f}%")
    print(f"F1-score (F-–º–µ—Ä–∞): {f1:.2f}%")
    print(f"ROC AUC: {roc_auc:.2f}%")
    print(f"Log Loss: {entropy_loss:.4f}")

    print("\n–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):")
    print(f"True Negative: {tn} (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)")
    print(f"False Positive: {fp} (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)")
    print(f"False Negative: {fn} (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)")
    print(f"True Positive: {tp} (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)")

    # Accuracy broken down by prediction confidence.
    confidence_bins = [50, 60, 70, 80, 90, 100]
    print("\n–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:")

    top_edge = confidence_bins[-1]
    for low, high in zip(confidence_bins[:-1], confidence_bins[1:]):
        in_bin = results['confidence'] >= low
        if high == top_edge:
            # The last bin is closed on the right; otherwise predictions
            # with confidence exactly 100.0 would fall into no bin at all.
            in_bin &= results['confidence'] <= high
        else:
            in_bin &= results['confidence'] < high
        confident_preds = results[in_bin]

        if len(confident_preds) > 0:
            bin_accuracy = (confident_preds['correct_prediction'].sum() / len(confident_preds)) * 100
            print(f"–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å {low}%-{high}%: {bin_accuracy:.2f}% —Ç–æ—á–Ω–æ—Å—Ç—å ({len(confident_preds)} –º–∞—Ç—á–µ–π)")
        else:
            print(f"–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å {low}%-{high}%: –Ω–µ—Ç –º–∞—Ç—á–µ–π")

    # Feature importance: |coefficient| scaled by the feature's standard
    # deviation on the (mean-imputed) test features.
    X_test, _ = prepare_data(df_test)
    X_test = X_test.fillna(X_test.mean())

    # Names are read from the prepared frame so they cannot drift out of
    # sync with the column order produced by prepare_data.
    feature_names = list(X_test.columns)

    X_std = X_test.std()
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': np.abs(model.coef_[0]) * X_std.values
    }).sort_values('Importance', ascending=False)

    # Normalise so the importances sum to 1.
    feature_importance['Importance'] = feature_importance['Importance'] / feature_importance['Importance'].sum()

    print("\n–í–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤:")
    print(feature_importance.head(10))

---

## 1. Обучение модели на большой выборке без пропущенных значений

In [2]:
# Load the large training set.
df_train = pd.read_csv('atp_tennis.csv')

# Train the model
model = train_model(df_train)
# Report the number of rows in the training set.
print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: {len(df_train)}")

–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: 64018


### Тестирование на выборке без пропущенных значений

In [3]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Get predictions
predictions = get_predictions(model, df_test)
    
# Build the results table
results = create_results_table(df_test, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.00%,Dolgopolov O.,100.00%,Dolgopolov O.,448,37,81,1296,13.0,1.04,6-7 3-6,Dolgopolov O.,99.999263
1,Harrison R.,52.52%,Sela D.,47.48%,Harrison R.,45,95,1115,585,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,52.522194
2,Anderson K.,29.09%,Edmund K.,70.91%,Edmund K.,12,49,2610,992,1.4,3.0,7-6 3-6 6-3 3-6,Edmund K.,70.908052
3,Carreno Busta P.,99.59%,Kubler J.,0.41%,Carreno Busta P.,11,243,2615,217,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.5907
4,Youzhny M.,0.00%,Cuevas P.,100.00%,Cuevas P.,90,34,604,1345,2.1,1.72,6-7 3-6,Cuevas P.,99.997613
5,Istomin D.,99.96%,Herbert P.H.,0.04%,Istomin D.,60,74,848,690,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.956255
6,Delbonis F.,0.00%,Muller G.,100.00%,Muller G.,67,28,755,1490,4.0,1.25,5-7 4-6,Muller G.,99.998053
7,Seppi A.,99.72%,Moutet C.,0.28%,Seppi A.,76,155,686,361,1.33,3.4,3-6 6-4 6-2,Seppi A.,99.721848
8,Ferrer D.,9.57%,Rublev A.,90.43%,Rublev A.,33,32,1360,1373,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,90.4275
9,Ebden M.,99.15%,Isner J.,0.85%,Ebden M.,78,16,684,2265,3.5,1.3,6-4 3-6 6-3,Ebden M.,99.152734



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 95.12%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 92.31%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 98.36%
F1-score (F-–º–µ—Ä–∞): 95.24%
ROC AUC: 99.39%
Log Loss: 0.1081

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 57 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 5 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 1 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 60 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 66.67% —Ç–æ—á–Ω–æ—Å—Ç—å (6 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 75.00% —Ç–æ—á–Ω–æ—Å—Ç—å (4 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å

### Тестирование на выборке с пропущенными значениями

In [4]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Simulate missing values in the test set
df_test_missing = create_realistic_tennis_missing_data(df_test, random_state=32)

# Get predictions
predictions = get_predictions(model, df_test_missing)
    
# Build the results table
results = create_results_table(df_test_missing, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test_missing, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.23%,Dolgopolov O.,99.77%,Dolgopolov O.,448,37,81.0,1296.0,13.0,1.04,6-7,Dolgopolov O.,99.77107
1,Harrison R.,52.52%,Sela D.,47.48%,Harrison R.,45,95,1115.0,585.0,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,52.522194
2,Anderson K.,29.09%,Edmund K.,70.91%,Edmund K.,12,49,2610.0,992.0,1.4,3.0,7-6 3-6 6-3 3-6,Edmund K.,70.908052
3,Carreno Busta P.,99.59%,Kubler J.,0.41%,Carreno Busta P.,11,243,2615.0,217.0,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.5907
4,Youzhny M.,0.00%,Cuevas P.,100.00%,Cuevas P.,90,34,604.0,1345.0,2.1,1.72,6-7 3-6,Cuevas P.,99.997613
5,Istomin D.,99.96%,Herbert P.H.,0.04%,Istomin D.,60,74,848.0,690.0,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.956255
6,Delbonis F.,0.00%,Muller G.,100.00%,Muller G.,67,28,755.0,1490.0,4.0,1.25,5-7 4-6,Muller G.,99.998053
7,Seppi A.,0.42%,Moutet C.,99.58%,Seppi A.,76,155,686.0,361.0,1.33,3.4,3-6,Moutet C.,99.58114
8,Ferrer D.,9.57%,Rublev A.,90.43%,Rublev A.,33,32,1360.0,1373.0,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,90.4275
9,Ebden M.,99.21%,Isner J.,0.79%,Ebden M.,78,16,684.0,2265.0,3.5,1.3,6-4,Ebden M.,99.205166



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 93.50%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 90.77%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 96.72%
F1-score (F-–º–µ—Ä–∞): 93.65%
ROC AUC: 98.73%
Log Loss: 0.1597

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 56 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 6 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 2 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 59 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 66.67% —Ç–æ—á–Ω–æ—Å—Ç—å (6 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 60.00% —Ç–æ—á–Ω–æ—Å—Ç—å (5 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å

---

## 2. Обучение модели на большой выборке с пропущенными значениями

In [5]:
# Load the large training set.
df_train = pd.read_csv('atp_tennis.csv')

# Simulate missing values in the training set
df_train_missing = create_realistic_tennis_missing_data(df_train, random_state=32)

# Train the model
model = train_model(df_train_missing)
# The size is taken from df_train, the frame loaded before gaps were injected.
print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: {len(df_train)}")

–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: 64018


### Тестирование на выборке без пропущенных значений

In [6]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Get predictions
predictions = get_predictions(model, df_test)
    
# Build the results table
results = create_results_table(df_test, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.02%,Dolgopolov O.,99.98%,Dolgopolov O.,448,37,81,1296,13.0,1.04,6-7 3-6,Dolgopolov O.,99.975751
1,Harrison R.,66.57%,Sela D.,33.43%,Harrison R.,45,95,1115,585,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,66.566435
2,Anderson K.,62.93%,Edmund K.,37.07%,Edmund K.,12,49,2610,992,1.4,3.0,7-6 3-6 6-3 3-6,Anderson K.,62.928525
3,Carreno Busta P.,97.85%,Kubler J.,2.15%,Carreno Busta P.,11,243,2615,217,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,97.850899
4,Youzhny M.,0.22%,Cuevas P.,99.78%,Cuevas P.,90,34,604,1345,2.1,1.72,6-7 3-6,Cuevas P.,99.776081
5,Istomin D.,97.09%,Herbert P.H.,2.91%,Istomin D.,60,74,848,690,1.66,2.2,6-2 6-1 5-7,Istomin D.,97.094755
6,Delbonis F.,0.14%,Muller G.,99.86%,Muller G.,67,28,755,1490,4.0,1.25,5-7 4-6,Muller G.,99.857966
7,Seppi A.,97.45%,Moutet C.,2.55%,Seppi A.,76,155,686,361,1.33,3.4,3-6 6-4 6-2,Seppi A.,97.451232
8,Ferrer D.,41.29%,Rublev A.,58.71%,Rublev A.,33,32,1360,1373,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,58.712902
9,Ebden M.,89.92%,Isner J.,10.08%,Ebden M.,78,16,684,2265,3.5,1.3,6-4 3-6 6-3,Ebden M.,89.916496



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 94.31%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 90.91%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 98.36%
F1-score (F-–º–µ—Ä–∞): 94.49%
ROC AUC: 99.15%
Log Loss: 0.1377

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 56 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 6 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 1 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 60 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 100.00% —Ç–æ—á–Ω–æ—Å—Ç—å (5 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 50.00% —Ç–æ—á–Ω–æ—Å—Ç—å (10 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç

### Тестирование на выборке с пропущенными значениями

In [7]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Simulate missing values in the test set
df_test_missing = create_realistic_tennis_missing_data(df_test, random_state=32)

# Get predictions
predictions = get_predictions(model, df_test_missing)
    
# Build the results table
results = create_results_table(df_test_missing, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test_missing, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.47%,Dolgopolov O.,99.53%,Dolgopolov O.,448,37,81.0,1296.0,13.0,1.04,6-7,Dolgopolov O.,99.53154
1,Harrison R.,66.57%,Sela D.,33.43%,Harrison R.,45,95,1115.0,585.0,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,66.566435
2,Anderson K.,62.93%,Edmund K.,37.07%,Edmund K.,12,49,2610.0,992.0,1.4,3.0,7-6 3-6 6-3 3-6,Anderson K.,62.928525
3,Carreno Busta P.,97.85%,Kubler J.,2.15%,Carreno Busta P.,11,243,2615.0,217.0,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,97.850899
4,Youzhny M.,0.22%,Cuevas P.,99.78%,Cuevas P.,90,34,604.0,1345.0,2.1,1.72,6-7 3-6,Cuevas P.,99.776081
5,Istomin D.,97.09%,Herbert P.H.,2.91%,Istomin D.,60,74,848.0,690.0,1.66,2.2,6-2 6-1 5-7,Istomin D.,97.094755
6,Delbonis F.,0.14%,Muller G.,99.86%,Muller G.,67,28,755.0,1490.0,4.0,1.25,5-7 4-6,Muller G.,99.857966
7,Seppi A.,9.27%,Moutet C.,90.73%,Seppi A.,76,155,686.0,361.0,1.33,3.4,3-6,Moutet C.,90.731134
8,Ferrer D.,41.29%,Rublev A.,58.71%,Rublev A.,33,32,1360.0,1373.0,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,58.712902
9,Ebden M.,89.94%,Isner J.,10.06%,Ebden M.,78,16,684.0,2265.0,3.5,1.3,6-4,Ebden M.,89.944099



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 93.50%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 90.77%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 96.72%
F1-score (F-–º–µ—Ä–∞): 93.65%
ROC AUC: 98.47%
Log Loss: 0.1602

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 56 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 6 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 2 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 59 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 100.00% —Ç–æ—á–Ω–æ—Å—Ç—å (6 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 58.33% —Ç–æ—á–Ω–æ—Å—Ç—å (12 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç

---

## 3. Обучение модели на маленькой выборке без пропущенных значений

In [8]:
# Load the small training set.
df_train = pd.read_csv('atp_tennis_47302_xxs.csv')

# Train the model
model = train_model(df_train)
# Report the number of rows in the training set.
print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: {len(df_train)}")

–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: 50


### Тестирование на выборке без пропущенных значений

In [9]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Get predictions
predictions = get_predictions(model, df_test)
    
# Build the results table
results = create_results_table(df_test, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.00%,Dolgopolov O.,100.00%,Dolgopolov O.,448,37,81,1296,13.0,1.04,6-7 3-6,Dolgopolov O.,100.0
1,Harrison R.,95.33%,Sela D.,4.67%,Harrison R.,45,95,1115,585,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,95.327518
2,Anderson K.,41.16%,Edmund K.,58.84%,Edmund K.,12,49,2610,992,1.4,3.0,7-6 3-6 6-3 3-6,Edmund K.,58.842877
3,Carreno Busta P.,100.00%,Kubler J.,0.00%,Carreno Busta P.,11,243,2615,217,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.999936
4,Youzhny M.,0.19%,Cuevas P.,99.81%,Cuevas P.,90,34,604,1345,2.1,1.72,6-7 3-6,Cuevas P.,99.811267
5,Istomin D.,99.91%,Herbert P.H.,0.09%,Istomin D.,60,74,848,690,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.90989
6,Delbonis F.,0.31%,Muller G.,99.69%,Muller G.,67,28,755,1490,4.0,1.25,5-7 4-6,Muller G.,99.688377
7,Seppi A.,99.96%,Moutet C.,0.04%,Seppi A.,76,155,686,361,1.33,3.4,3-6 6-4 6-2,Seppi A.,99.961839
8,Ferrer D.,4.79%,Rublev A.,95.21%,Rublev A.,33,32,1360,1373,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,95.213997
9,Ebden M.,46.71%,Isner J.,53.29%,Ebden M.,78,16,684,2265,3.5,1.3,6-4 3-6 6-3,Isner J.,53.292957



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 85.37%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 80.28%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 93.44%
F1-score (F-–º–µ—Ä–∞): 86.36%
ROC AUC: 93.31%
Log Loss: 0.4650

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 48 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 14 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 4 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 57 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 71.43% —Ç–æ—á–Ω–æ—Å—Ç—å (7 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 25.00% —Ç–æ—á–Ω–æ—Å—Ç—å (4 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—

### Тестирование на выборке с пропущенными значениями

In [10]:
# Load the test set.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Simulate missing values in the test set
df_test_missing = create_realistic_tennis_missing_data(df_test, random_state=32)

# Get predictions
predictions = get_predictions(model, df_test_missing)
    
# Build the results table
results = create_results_table(df_test_missing, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model quality metrics
display_metrics(model, df_test_missing, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.00%,Dolgopolov O.,100.00%,Dolgopolov O.,448,37,81.0,1296.0,13.0,1.04,6-7,Dolgopolov O.,100.0
1,Harrison R.,95.33%,Sela D.,4.67%,Harrison R.,45,95,1115.0,585.0,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,95.327518
2,Anderson K.,41.16%,Edmund K.,58.84%,Edmund K.,12,49,2610.0,992.0,1.4,3.0,7-6 3-6 6-3 3-6,Edmund K.,58.842877
3,Carreno Busta P.,100.00%,Kubler J.,0.00%,Carreno Busta P.,11,243,2615.0,217.0,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.999936
4,Youzhny M.,0.19%,Cuevas P.,99.81%,Cuevas P.,90,34,604.0,1345.0,2.1,1.72,6-7 3-6,Cuevas P.,99.811267
5,Istomin D.,99.91%,Herbert P.H.,0.09%,Istomin D.,60,74,848.0,690.0,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.90989
6,Delbonis F.,0.31%,Muller G.,99.69%,Muller G.,67,28,755.0,1490.0,4.0,1.25,5-7 4-6,Muller G.,99.688377
7,Seppi A.,91.15%,Moutet C.,8.85%,Seppi A.,76,155,686.0,361.0,1.33,3.4,3-6,Seppi A.,91.146879
8,Ferrer D.,4.79%,Rublev A.,95.21%,Rublev A.,33,32,1360.0,1373.0,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,95.213997
9,Ebden M.,46.37%,Isner J.,53.63%,Ebden M.,78,16,684.0,2265.0,3.5,1.3,6-4,Isner J.,53.627422



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 85.37%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 80.28%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 93.44%
F1-score (F-–º–µ—Ä–∞): 86.36%
ROC AUC: 92.86%
Log Loss: 0.4859

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 48 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 14 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 4 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 57 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 77.78% —Ç–æ—á–Ω–æ—Å—Ç—å (9 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 33.33% —Ç–æ—á–Ω–æ—Å—Ç—å (3 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—

---

## 4. Обучение модели на маленькой выборке с пропущенными значениями

In [11]:
# Load the training sample (the `_xxs` file; the captured output reports 50 rows).
df_train = pd.read_csv('atp_tennis_47302_xxs.csv')

# Simulate missing values in the training sample.
# The helper works on a copy, so df_train itself is left unchanged, and the
# fixed random_state seeds NumPy's RNG so the injected gaps are reproducible.
df_train_missing = create_realistic_tennis_missing_data(df_train, random_state=32)

# Train the model on the version of the sample that contains the gaps.
model = train_model(df_train_missing)
# Report the training-set size (printed from the original frame; the helper
# only blanks/truncates values, it does not add or drop rows).
print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: {len(df_train)}")

–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–≥–æ –Ω–∞–±–æ—Ä–∞: 50


### Тестирование на выборке без пропущенных значений

In [12]:
# Load the test set; in this cell it is used as-is, with no gaps injected.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Get predictions.
# `model` is the notebook-global assigned by the training cell above
# (assumes the cells are executed in order).
predictions = get_predictions(model, df_test)
    
# Build the results table
results = create_results_table(df_test, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model-quality metrics
display_metrics(model, df_test, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.00%,Dolgopolov O.,100.00%,Dolgopolov O.,448,37,81,1296,13.0,1.04,6-7 3-6,Dolgopolov O.,100.0
1,Harrison R.,96.45%,Sela D.,3.55%,Harrison R.,45,95,1115,585,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,96.45367
2,Anderson K.,53.34%,Edmund K.,46.66%,Edmund K.,12,49,2610,992,1.4,3.0,7-6 3-6 6-3 3-6,Anderson K.,53.339795
3,Carreno Busta P.,100.00%,Kubler J.,0.00%,Carreno Busta P.,11,243,2615,217,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.99965
4,Youzhny M.,0.10%,Cuevas P.,99.90%,Cuevas P.,90,34,604,1345,2.1,1.72,6-7 3-6,Cuevas P.,99.902443
5,Istomin D.,99.89%,Herbert P.H.,0.11%,Istomin D.,60,74,848,690,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.892574
6,Delbonis F.,0.27%,Muller G.,99.73%,Muller G.,67,28,755,1490,4.0,1.25,5-7 4-6,Muller G.,99.727268
7,Seppi A.,99.88%,Moutet C.,0.12%,Seppi A.,76,155,686,361,1.33,3.4,3-6 6-4 6-2,Seppi A.,99.875848
8,Ferrer D.,12.78%,Rublev A.,87.22%,Rublev A.,33,32,1360,1373,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,87.220674
9,Ebden M.,47.74%,Isner J.,52.26%,Ebden M.,78,16,684,2265,3.5,1.3,6-4 3-6 6-3,Isner J.,52.26486



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 86.18%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 82.35%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 91.80%
F1-score (F-–º–µ—Ä–∞): 86.82%
ROC AUC: 94.10%
Log Loss: 0.4044

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 50 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 12 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 5 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 56 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 50.00% —Ç–æ—á–Ω–æ—Å—Ç—å (6 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 80.00% —Ç–æ—á–Ω–æ—Å—Ç—å (5 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—

### Тестирование на выборке с пропущенными значениями

In [13]:
# Reload the test set from disk so this cell does not depend on the state of
# df_test left behind by earlier cells.
df_test = pd.read_csv('atp_tennis_test_minus_1_set_47967_big.csv')

# Simulate missing values in the test sample.
# The helper returns a modified copy; random_state=32 is the same seed that
# was used when injecting gaps into the training sample.
df_test_missing = create_realistic_tennis_missing_data(df_test, random_state=32)

# Get predictions for the test sample that contains the gaps.
# `model` is the notebook-global assigned by the training cell
# (assumes the cells are executed in order).
predictions = get_predictions(model, df_test_missing)
    
# Build the results table
results = create_results_table(df_test_missing, predictions)
    
# Display the results
display_results(results)
    
# Display detailed model-quality metrics
display_metrics(model, df_test_missing, predictions, results)

Unnamed: 0,Player_1,Player_1_Prediction,Player_2,Player_2_Prediction,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,predicted_winner,confidence
0,Haider-Maurer A.,0.00%,Dolgopolov O.,100.00%,Dolgopolov O.,448,37,81.0,1296.0,13.0,1.04,6-7,Dolgopolov O.,100.0
1,Harrison R.,96.45%,Sela D.,3.55%,Harrison R.,45,95,1115.0,585.0,1.3,3.5,6-3 5-7 3-6 7-5,Harrison R.,96.45367
2,Anderson K.,53.34%,Edmund K.,46.66%,Edmund K.,12,49,2610.0,992.0,1.4,3.0,7-6 3-6 6-3 3-6,Anderson K.,53.339795
3,Carreno Busta P.,100.00%,Kubler J.,0.00%,Carreno Busta P.,11,243,2615.0,217.0,1.28,3.75,7-5 4-6 7-5,Carreno Busta P.,99.99965
4,Youzhny M.,0.10%,Cuevas P.,99.90%,Cuevas P.,90,34,604.0,1345.0,2.1,1.72,6-7 3-6,Cuevas P.,99.902443
5,Istomin D.,99.89%,Herbert P.H.,0.11%,Istomin D.,60,74,848.0,690.0,1.66,2.2,6-2 6-1 5-7,Istomin D.,99.892574
6,Delbonis F.,0.27%,Muller G.,99.73%,Muller G.,67,28,755.0,1490.0,4.0,1.25,5-7 4-6,Muller G.,99.727268
7,Seppi A.,46.93%,Moutet C.,53.07%,Seppi A.,76,155,686.0,361.0,1.33,3.4,3-6,Moutet C.,53.068554
8,Ferrer D.,12.78%,Rublev A.,87.22%,Rublev A.,33,32,1360.0,1373.0,1.9,1.9,5-7 7-6 2-6 7-6,Rublev A.,87.220674
9,Ebden M.,24.66%,Isner J.,75.34%,Ebden M.,78,16,684.0,2265.0,3.5,1.3,6-4,Isner J.,75.336017



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏:
Accuracy (–¢–æ—á–Ω–æ—Å—Ç—å): 84.55%
Precision (–¢–æ—á–Ω–æ—Å—Ç—å –ø–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º—É –∫–ª–∞—Å—Å—É): 80.88%
Recall (–ü–æ–ª–Ω–æ—Ç–∞): 90.16%
F1-score (F-–º–µ—Ä–∞): 85.27%
ROC AUC: 93.42%
Log Loss: 0.4290

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫ (Confusion Matrix):
True Negative: 49 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Positive: 13 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
False Negative: 6 (–ù–µ–ø—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –≤—Ç–æ—Ä–æ–≥–æ –∏–≥—Ä–æ–∫–∞)
True Positive: 55 (–ü—Ä–∞–≤–∏–ª—å–Ω–æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–µ –ø–æ–±–µ–¥—ã –ø–µ—Ä–≤–æ–≥–æ –∏–≥—Ä–æ–∫–∞)

–¢–æ—á–Ω–æ—Å—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π –ø–æ —É—Ä–æ–≤–Ω—é —É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç–∏:
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 50%-60%: 57.14% —Ç–æ—á–Ω–æ—Å—Ç—å (7 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å 60%-70%: 60.00% —Ç–æ—á–Ω–æ—Å—Ç—å (5 –º–∞—Ç—á–µ–π)
–£–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—