Here, in line-up prediction we cannot use a whole year as the test set. Since many changes have been occured (data shift), we have many new players this year and many retired.

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
#from sklearn.preprocessing import OneHotEncoder
"""from google.colab import drive


drive.mount('/content/drive')
ds_path = '/content/drive/MyDrive/Data_Mining_Project/Datasets/'"""

ds_path = 'Datasets/'
assert os.path.exists(ds_path)

features = ['outcome', 'season', 'home_team','away_team','starting_min','home_0','home_1','home_2','home_3','home_4','away_0','away_1','away_2','away_3','away_4']
home_players_columns = ['home_0','home_1','home_2','home_3','home_4']
away_players_columns = ['away_0','away_1','away_2','away_3','away_4']


df = pd.DataFrame()
# Load data from matchups-2007.csv to matchups-2012.csv and append them to df
for i in range(2007, 2013):
    df1 = pd.read_csv(ds_path + "matchups-" + str(i) + ".csv")[features]
    print(f"size of the matchup {i} is {len(df1)}")
    df = pd.concat([df, df1])
df.reset_index(drop=True, inplace=True)
print(f"len of final df: {len(df)}")
print("first print \n" + str(df))
print(df.shape)


def encode_teams_names(df):
    global unique_teams
    # Get unique team names
    unique_teams = pd.concat([df['home_team'], df['away_team']]).unique()
    # Initialize a DataFrame with all zeros
    encoded_df = pd.DataFrame(0, index=df.index, columns=unique_teams)
    # Set values for home teams and away teams
    for i in range(df.shape[0]):
      encoded_df.loc[i, df.loc[i, 'home_team']] = 1
      encoded_df.loc[i, df.loc[i, 'away_team']] = -1
    #encoded_df[df['home_team']] = 1 # works incorrect
    #encoded_df[df['away_team']] = -1 # works incorrect
    # Concatenate the encoded team DataFrame with the original DataFrame
    df = pd.concat([df, encoded_df], axis=1)
    # Drop the original home_team and away_team columns
    df = df.drop(['home_team', 'away_team'], axis=1)
    return df


def encode_player_names(df):
    global all_players
    # Get unique player names
    all_players = df[home_players_columns + away_players_columns].stack().unique()
    # Initialize a DataFrame with all zeros
    encoded_df = pd.DataFrame(0, index=df.index, columns=all_players)
    # Set values for home team players and away team players
    for i in range(df.shape[0]):
      encoded_df.loc[i, df.loc[i, home_players_columns]] = 1
      encoded_df.loc[i, df.loc[i, away_players_columns]] = -1
    #encoded_df[df[home_players_columns]] = 1
    #encoded_df[df[away_players_columns]] = -1
    # Concatenate the encoded player DataFrame with the original DataFrame
    df = pd.concat([df, encoded_df], axis=1)
    # Drop the original player columns
    df = df.drop(home_players_columns + away_players_columns, axis=1)
    return df


raw_df = df.copy()

df = encode_teams_names(df)
print("print after encode teams names \n" + str(df))
#print(df[['LAL', 'PHO', 'MIL', 'CHA']])
df = encode_player_names(df)
print("print after encode players names \n" + str(df))
#print(df[['Smush Parker', 'Boris Diaw', 'Matt Carroll']]) # CHECKED AND CORRECT SO FAR
print(f"size: {df.size}")


size of the matchup 2007 is 27500
size of the matchup 2008 is 26593
size of the matchup 2009 is 26407
size of the matchup 2010 is 26344
size of the matchup 2011 is 26447
size of the matchup 2012 is 21241
len of final df: 154532
first print 
        outcome  season home_team away_team  starting_min           home_0  \
0            -1    2007       LAL       PHO             0     Andrew Bynum   
1            -1    2007       LAL       PHO             6     Andrew Bynum   
2             1    2007       LAL       PHO             8       Lamar Odom   
3             1    2007       LAL       PHO            10       Lamar Odom   
4            -1    2007       LAL       PHO            11      Luke Walton   
...         ...     ...       ...       ...           ...              ...   
154527       -1    2012       CHA       NOH            38  Bismack Biyombo   
154528       -1    2012       CHA       NOH            40  Bismack Biyombo   
154529       -1    2012       CHA       NOH            42

In [2]:
print(f"size: {df.shape}")
print(f"size: {raw_df.shape}")

size: (154532, 847)
size: (154532, 15)


In [3]:
import numpy as np

def infer_missing_player(distances, indices, modified_row, X_train):
    # Start with the closest sample
    closest_sample_index = indices[0][0]  # The first neighbor's index
    closest_sample = X_train.iloc[closest_sample_index].values
    
    # Iterate through each column in the closest sample
    for col_index in range(len(closest_sample)):
        # Check if the column is 1 or -1 in the closest sample while being 0 in the test row
        if (closest_sample[col_index] in [1, -1]) and (modified_row[col_index] == 0):
            # Return the column name for this index
            return X_train.columns[col_index]
    return None  # In case no column meets the condition


In [4]:
X_train_bu, X_test, raw_train_bu, raw_test = train_test_split(df, raw_df, test_size=0.2, random_state=42)

In [5]:
X_test = X_test.reset_index(drop=True)
raw_test = raw_test.reset_index(drop=True)

In [6]:
import warnings

warnings.filterwarnings('ignore')

For each row in the test set, remove the encoding for one random player (setting their encoding to 0), and use the KNN model to predict which player was removed based on the closest neighbors in the training set.
Count the number of correct predictions where the KNN model correctly identifies the missing player.
Calculate the accuracy of these predictions.

Let's start with the closest sample to the modified test row. For the closest sample, we'll identify the columns where the value is either 1 (home player) or -1 (away player), and the corresponding column in our test row is 0 (indicating the removed player). We'll return the name of the first column that meets this condition as our prediction.

In [7]:
from sklearn.neighbors import NearestNeighbors

accuracy_list = []
for a_seed in range(100, 110):
    X_train, X_val, raw_train, raw_val = train_test_split(X_train_bu, raw_train_bu, test_size=0.2, random_state=a_seed)
    X_val = X_val.reset_index(drop=True)
    raw_val = raw_val.reset_index(drop=True)
    
    correct_predictions = 0
    total_predictions = 0
    # Iterate over the test set
    knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
    knn.fit(X_train)
    for index, encoded_row in X_val.iterrows():
        original_row = raw_val.iloc[index]  # Corresponding row in the raw dataframe for validation
        
        # removing each player by setting their feature to 0
        #for player_col in home_players_columns + home_players_columns: 
        player_col = np.random.choice(home_players_columns + home_players_columns)
        modified_row = encoded_row.copy()
        
        modified_row[original_row[player_col]] = 0 
        
        # Find the nearest neighbors for the modified row
        distances, indices = knn.kneighbors([modified_row], n_neighbors=1)
        
        # Infer the missing player's column name
        predicted_column = infer_missing_player(distances, indices, modified_row, X_train)
        
        if predicted_column and encoded_row[predicted_column] in [1, -1]:
            correct_predictions += 1
        total_predictions += 1
        
        
        progress = f"{total_predictions}/{len(X_val)}: accuracy: {correct_predictions/total_predictions}"
        print(f"\rProgress: {progress}", end='', flush=True)
            
    
    # Calculate and print the accuracy
    accuracy = correct_predictions / total_predictions
    print()
    print(f"\nAccuracy: {accuracy:.4f}")
    accuracy_list.append(accuracy)
print()
print(f"Accuracy: {np.mean(accuracy_list):.4f}, std: {np.std(accuracy_list):.4f}")



Progress: 24725/24725: accuracy: 0.23745197168857432

Accuracy: 0.2375
Progress: 24725/24725: accuracy: 0.24230535894843275

Accuracy: 0.2423
Progress: 24725/24725: accuracy: 0.24254802831142578

Accuracy: 0.2425
Progress: 24725/24725: accuracy: 0.23967644084934278

Accuracy: 0.2397
Progress: 24725/24725: accuracy: 0.24501516683518706

Accuracy: 0.2450
Progress: 24725/24725: accuracy: 0.24012133468149646

Accuracy: 0.2401
Progress: 24725/24725: accuracy: 0.24513650151668358

Accuracy: 0.2451
Progress: 24725/24725: accuracy: 0.24048533872598585

Accuracy: 0.2405
Progress: 24725/24725: accuracy: 0.24141557128412539

Accuracy: 0.2414
Progress: 24725/24725: accuracy: 0.24469160768452983

Accuracy: 0.2447

Accuracy: 0.2419, std: 0.0024


In [8]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
knn.fit(X_train_bu)
for index, encoded_row in X_test.iterrows():
    original_row = raw_test.iloc[index]  # Corresponding row in the raw dataframe for validation
    
    # removing each player by setting their feature to 0
    #for player_col in home_players_columns + home_players_columns: 
    player_col = np.random.choice(home_players_columns + home_players_columns)
    modified_row = encoded_row.copy()
    
    modified_row[original_row[player_col]] = 0 
    
    # Find the nearest neighbors for the modified row
    distances, indices = knn.kneighbors([modified_row], n_neighbors=1)
    
    # Infer the missing player's column name
    predicted_column = infer_missing_player(distances, indices, modified_row, X_train_bu)
    
    if predicted_column and encoded_row[predicted_column] in [1, -1]:
        correct_predictions += 1
    total_predictions += 1
    
    
    progress = f"{total_predictions}/{len(X_test)}: accuracy: {correct_predictions/total_predictions}"
    print(f"\rProgress: {progress}", end='', flush=True)
        

# Calculate and print the accuracy
accuracy = correct_predictions / total_predictions
print()
print(f"\nAccuracy: {accuracy:.4f}")


Progress: 55632/30907: accuracy: 0.25774733966062696

Accuracy: 0.2577


In [7]:
"""team_season_players = {}

# Process home team players
for _, row in raw_df.iterrows():
    # Home team
    home_team = row['home_team']
    season = row['season']
    home_players = {row[f'home_{i}'] for i in range(5)}
    
    if home_team not in team_season_players:
        team_season_players[home_team] = {}
    if season not in team_season_players[home_team]:
        team_season_players[home_team][season] = set()
    
    team_season_players[home_team][season].update(home_players)
    
    # Away team (repeating the process for the away team)
    away_team = row['away_team']
    away_players = {row[f'away_{i}'] for i in range(5)}
    
    if away_team not in team_season_players:
        team_season_players[away_team] = {}
    if season not in team_season_players[away_team]:
        team_season_players[away_team][season] = set()
    
    team_season_players[away_team][season].update(away_players)"""