In [5]:
import os
from collections import defaultdict
from itertools import combinations
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split

#from sklearn.preprocessing import OneHotEncoder
"""from google.colab import drive
drive.mount('/content/drive')
ds_path = '/content/drive/MyDrive/Data_Mining_Project/Datasets/'"""
# Directory holding the per-season matchup CSVs, relative to the working directory.
ds_path = 'Datasets/'
assert os.path.exists(ds_path), f"dataset directory not found: {ds_path!r}"

# The ten lineup slots of a matchup row: five home players, then five away players.
features = ['home_0','home_1','home_2','home_3','home_4','away_0','away_1','away_2','away_3','away_4']

# Load matchups-2007.csv through matchups-2012.csv, keeping only the lineup columns.
# The per-season frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration, and seeding it with an empty DataFrame is deprecated concat input.
season_frames = []
for i in range(2007, 2013):
    df1 = pd.read_csv(ds_path + "matchups-" + str(i) + ".csv")[features]
    print(len(df1))
    season_frames.append(df1)

# ignore_index=True yields a fresh 0..n-1 index, replacing the separate
# reset_index(drop=True, inplace=True) step.
df = pd.concat(season_frames, ignore_index=True)
print(f"len of final df: {len(df)}")
print("first print \n" + str(df))
print(df.shape)


27500
26593
26407
26344
26447
21241
len of final df: 154532
first print 
                 home_0            home_1            home_2            home_3  \
0          Andrew Bynum        Lamar Odom       Luke Walton     Sasha Vujacic   
1          Andrew Bynum        Lamar Odom       Luke Walton     Sasha Vujacic   
2            Lamar Odom       Luke Walton     Maurice Evans      Ronny Turiaf   
3            Lamar Odom       Luke Walton     Maurice Evans      Ronny Turiaf   
4           Luke Walton     Maurice Evans      Ronny Turiaf      Smush Parker   
...                 ...               ...               ...               ...   
154527  Bismack Biyombo  Gerald Henderson      Jamario Moon      Kemba Walker   
154528  Bismack Biyombo  Gerald Henderson      Jamario Moon      Kemba Walker   
154529  Bismack Biyombo     Byron Mullens     Derrick Brown  Gerald Henderson   
154530  Bismack Biyombo     Derrick Brown  Gerald Henderson      Kemba Walker   
154531  Bismack Biyombo     Derrick 

In [6]:
# Gather all ten lineup columns and collapse them into one 1-D array of names
player_names_array = (
    df[[
        "home_0", "home_1", "home_2", "home_3", "home_4",
        "away_0", "away_1", "away_2", "away_3", "away_4",
    ]]
    .to_numpy()
    .flatten()
)

# Reduce to the distinct names (np.unique returns them sorted)
unique_player_names = np.unique(player_names_array)
print(unique_player_names)

['A.J. Price' 'Aaron Brooks' 'Aaron Gray' 'Aaron McKie' 'Aaron Williams'
 'Acie Law' 'Adam Morrison' 'Adonal Foyle' 'Adrian Griffin'
 'Al Harrington' 'Al Horford' 'Al Jefferson' 'Al Thornton'
 'Al-Farouq Aminu' 'Alan Anderson' 'Alan Henderson' 'Alando Tucker'
 'Alec Burks' 'Alex Acker' 'Alexander Johnson' 'Alexis Ajinca' 'Allan Ray'
 'Allen Iverson' 'Alonzo Gee' 'Alonzo Mourning' 'Alvin Williams'
 "Amar'e Stoudemire" 'Amir Johnson' 'Anderson Varejao' 'Andray Blatche'
 'Andre Barrett' 'Andre Brown' 'Andre Emmett' 'Andre Iguodala'
 'Andre Miller' 'Andre Owens' 'Andrea Bargnani' 'Andreas Glyniadakis'
 'Andrei Kirilenko' 'Andres Nocioni' 'Andrew Bogut' 'Andrew Bynum'
 'Andrew Goudelock' 'Andris Biedrins' 'Andy Rautins' 'Anfernee Hardaway'
 'Antawn Jamison' 'Anthony Carter' 'Anthony Johnson' 'Anthony Morrow'
 'Anthony Parker' 'Anthony Randolph' 'Anthony Roberson' 'Anthony Tolliver'
 'Antoine Walker' 'Antoine Wright' 'Antonio Anderson' 'Antonio Daniels'
 'Antonio McDyess' 'Armon Johnson' 'Ar



We can approach the task of recommending a player based on the other nine players using a simplified item-based collaborative filtering approach. This method will focus on understanding the similarities between players based on their occurrences together in lineups, without explicitly using user data or game outcomes.

In a real-world scenario, this approach would involve complex similarity computations and possibly leveraging player stats, positions, or other features to find the best matches. However, for simplicity, let's create a basic example that demonstrates the concept using Python and common libraries like Pandas. This example will focus on the core idea of recommending a player based on the presence of other players, assuming we have pre-computed some form of similarity scores between players or we derive similarity from their co-occurrences.

Step 1: Prepare the Data

We have a DataFrame df with game lineups where each row represents a matchup and contains the names of ten players (five home and five away). For evaluation, one player is held out and predicted from the remaining nine. In a real setup, we would also have data on player performances, positions, etc., but here we'll keep it to player names for simplicity.

Step 2: Compute Similarity Scores

Let's assume a basic method to compute similarity: the number of times players appear together.

Step 3: Recommend a Player

The recommendation for the missing player can be based on the most similar players to the given lineup of nine players, leveraging the computed similarity scores.

In [7]:
# First approach: a simplified item-based collaborative filtering

# Function to recommend a player based on a lineup and similarity_scores
def recommend_player(current_lineup, similarity_scores):
    """Recommend the player that co-occurs most strongly with a lineup.

    Every pair in ``similarity_scores`` that contains a lineup member votes for
    the pair's other member with the pair's score. Votes are summed per
    candidate, and the highest-scoring candidate that is not already in the
    lineup is returned. Ties go to the candidate that received its first vote
    earliest.

    Args:
        current_lineup: Iterable of player names (list, NumPy array, pandas
            Series, ...). Names are compared by value.
        similarity_scores: Mapping from 2-tuples of player names to a numeric
            score.

    Returns:
        The recommended player name, or ``None`` when no pair links a lineup
        member to a player outside the lineup.
    """
    # Materialise the lineup as a list so the membership test below compares
    # player names. On a pandas Series, ``name in series`` checks the index
    # labels rather than the values, which would let a player who is already
    # in the lineup be recommended.
    lineup = list(current_lineup)

    # NOTE: every pair is scanned once per lineup member.
    recommended_candidates = defaultdict(int)
    for player in lineup:
        for pair, score in similarity_scores.items():
            if player in pair:
                # Credit the member of the pair that is not `player`.
                recommended_candidates[pair[1 if pair[0] == player else 0]] += score

    # Only players outside the lineup are eligible.
    eligible = [
        (candidate, score)
        for candidate, score in recommended_candidates.items()
        if candidate not in lineup
    ]
    if not eligible:
        return None  # In case no recommendation is found

    # max() returns the first maximal item, matching a stable descending sort.
    return max(eligible, key=lambda item: item[1])[0]

# Example
#current_lineup = X_test.iloc[110,:9]  # Up to 9 players
#print(current_lineup)
#recommended_player = recommend_player(current_lineup, similarity_scores)
#print(f"Recommended player to add: {recommended_player}, True player: {X_test.iloc[110,9]}")

In [10]:
# Evaluate the co-occurrence recommender on several random train/test splits.
# `defaultdict` and `combinations` come from the notebook's import cell.
accuracy_list = []

for a_seed in range(100, 102):
    X_train, X_test = train_test_split(
        df[["home_0", "home_1", "home_2", "home_3", "home_4",
            "away_0", "away_1", "away_2", "away_3", "away_4"]],
        test_size=0.2,
        random_state=a_seed,
    )

    # Similarity of two players = number of training rows in which both appear.
    # Keys are sorted name pairs, so (a, b) and (b, a) share a single entry.
    similarity_scores = defaultdict(int)

    # Iterate the underlying array directly; iterrows() builds a Series per row.
    for players in X_train.to_numpy():
        # combinations() visits each unordered pair of slots exactly once, so a
        # shared row adds 1 to the pair's score. A nested player x other_player
        # loop visits every pair twice and doubles all scores; rankings are the
        # same either way because every score is scaled by the same factor.
        for player, other_player in combinations(players, 2):
            if player != other_player:
                similarity_scores[tuple(sorted((player, other_player)))] += 1
    print()
    print(f"len(similarity_scores): {len(similarity_scores)}")

    correct_recommendations = 0
    total_recommendations = 0

    # One prediction per lineup slot of every test row.
    total_recommendations_to_make = X_test.size

    # Hold out each player of each test row in turn and try to recover it
    # from the remaining players.
    for players in X_test.to_numpy():
        for target_player in players:
            current_lineup = [player for player in players if player != target_player]
            recommended_player = recommend_player(current_lineup, similarity_scores)
            if recommended_player == target_player:
                correct_recommendations += 1
            total_recommendations += 1

        # Rewrite a single status line after each row (\r returns to line start).
        progress = f"{total_recommendations}/{total_recommendations_to_make}: accuracy: {correct_recommendations/total_recommendations}"
        print(f"\rProgress: {progress}", end='', flush=True)

    # Calculate and print the accuracy for this split
    accuracy = correct_recommendations / total_recommendations
    print()
    print(f"Accuracy: {accuracy:.4f}")
    accuracy_list.append(accuracy)

# Mean and (population) standard deviation of accuracy across the splits.
print()
print()
print(f"Accuracy: {np.mean(accuracy_list):.4f}, std: {np.std(accuracy_list):.4f}")


len(similarity_scores): 163589
Progress: 309070/309070: accuracy: 0.13590125214352736
Accuracy: 0.1359

len(similarity_scores): 163609
Progress: 309070/309070: accuracy: 0.13731193580742226
Accuracy: 0.1373


Accuracy: 0.1366, std: 0.0007


1. Iterate over each row in the test dataset (X_test).
2. For each row, iterate over each player in that row as the target player.
3. Remove the target player from the lineup, leaving the other nine players.
4. Call the recommend_player() function with these nine players and the similarity scores computed from the training split.
5. Check if the recommended player matches the target player. If it does, count this as a correct recommendation.
6. Calculate the accuracy of the model as the ratio of correct recommendations to the total number of recommendations made.