Hello and welcome to my presentation of the NBA player prediction project.

The goal of the program is to create a graph that shows all of the current players based on their stats and to predict their results for the current season.

First we have all the imports which we will go through as they come up in the project.

We start with the class PlayerStats, which represents a single player. Its attributes are the values passed as constructor parameters, and these correspond to the columns present in the "NBA player (year).csv" files.

In [None]:
class PlayerStats(object):
    """Plain record holding the column values of one player row.

    Every constructor argument is stored unchanged on the instance under the
    attribute of the same name.
    """

    def __init__(self, rk, player, pos, age, team, g, gs, mp, fg, fga, fg_pct, three_p, three_pa, three_pct, two_p, two_pa, two_pct, efg_pct, ft, fta, ft_pct, orb, drb, trb, ast, stl, blk, tov, pf, pts):
        # Attributes are bound in signature order, grouped a few per line.
        self.rk, self.player, self.pos, self.age, self.team = rk, player, pos, age, team
        self.g, self.gs, self.mp = g, gs, mp
        self.fg, self.fga, self.fg_pct = fg, fga, fg_pct
        self.three_p, self.three_pa, self.three_pct = three_p, three_pa, three_pct
        self.two_p, self.two_pa, self.two_pct = two_p, two_pa, two_pct
        self.efg_pct = efg_pct
        self.ft, self.fta, self.ft_pct = ft, fta, ft_pct
        self.orb, self.drb, self.trb = orb, drb, trb
        self.ast, self.stl, self.blk = ast, stl, blk
        self.tov, self.pf, self.pts = tov, pf, pts

    def update_stats(self, **kwargs):
        """Set each keyword argument as an attribute of the same name."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

#nothing special here, just a basic implementation of an object and an update method (which is available but is currently not in use)

Next we have the DataManager class, which is initialized with a csv file and an empty player_stats list. Its load_data method creates a list of all the players present in the given csv file and instantiates them as PlayerStats objects.

In [None]:
class DataManager:
    """Reads one season CSV and keeps its rows as PlayerStats objects."""

    # CSV column header -> PlayerStats constructor keyword.
    _FIELD_BY_COLUMN = {
        'Rk': 'rk', 'Player': 'player', 'Pos': 'pos', 'Age': 'age', 'Tm': 'team',
        'G': 'g', 'GS': 'gs', 'MP': 'mp',
        'FG': 'fg', 'FGA': 'fga', 'FG%': 'fg_pct',
        '3P': 'three_p', '3PA': 'three_pa', '3P%': 'three_pct',
        '2P': 'two_p', '2PA': 'two_pa', '2P%': 'two_pct', 'eFG%': 'efg_pct',
        'FT': 'ft', 'FTA': 'fta', 'FT%': 'ft_pct',
        'ORB': 'orb', 'DRB': 'drb', 'TRB': 'trb',
        'AST': 'ast', 'STL': 'stl', 'BLK': 'blk',
        'TOV': 'tov', 'PF': 'pf', 'PTS': 'pts',
    }

    def __init__(self, csv_file):
        """Remember the CSV path and start with no loaded players."""
        self.csv_file = csv_file
        self.player_stats = []

    def load_data(self):
        """Append one PlayerStats per row of ``self.csv_file`` to ``self.player_stats``."""
        frame = pd.read_csv(self.csv_file)
        for _, record in frame.iterrows():
            fields = {
                field: record[column]
                for column, field in self._FIELD_BY_COLUMN.items()
            }
            self.player_stats.append(PlayerStats(**fields))

    def get_player_stats(self):
        """Return the list of loaded players (the live list, not a copy)."""
        # used very often in the code to get the values of the players
        return self.player_stats

    def update_player_stats(self, player, **kwargs):
        """Forward the keyword arguments to ``player.update_stats``."""
        # not currently used, but it makes it possible to change attributes of a player
        player.update_stats(**kwargs)


Now comes the most important class of the whole project, basically the backbone, the GUI class. It spans lines 105-321 so we will divide covering this class into smaller parts. Here we go...

In [None]:
#First step is initializing the class, we pass it root so that it can control exiting the application and datamanager so that it gets the proper data

class GUI:
    """Main window: scatter plot on the left, image, labels and controls on the right.

    Args:
        root: the Tk root window; it is destroyed when the window is closed.
        data_manager: object whose ``get_player_stats()`` supplies the players
            that are plotted.
    """

    def __init__(self, root, data_manager):
        self.root = root
        self.data_manager = data_manager
        self.header_var = tk.StringVar()
        self.header_names = []

        self.fig_frame = tk.Frame(self.root)
        self.fig_frame.pack(side=tk.LEFT)
        #frame for graph, move it to the left

        self.create_scatter_plot()
        #create the graph

        self.button_frame = tk.Frame(self.root)
        self.button_frame.pack(side=tk.RIGHT)
        #frame for buttons move it to the right

        self.image_label = tk.Label(self.button_frame)
        self.image_label.pack(side=tk.TOP)
        #frame for image, move it above the buttons

        self.name_label()
        self.create_label()
        self.predicted_label()
        self.real_label()
        self.create_role_menu()
        self.create_date_menu()
        self.create_confirm_button()
        #instantiate all the components

        self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
        #make the close button actually stop the program

    # on_closing has to be a method of the class (not a function nested in
    # __init__), otherwise ``self.on_closing`` above does not exist.
    def on_closing(self):
        """Destroy the Tk root and terminate the interpreter."""
        self.root.destroy()
        sys.exit()
        #it is a system exit so that it exits the project for sure


Let us skip the display_player_image for now and focus on the easier parts:

In [None]:
    def name_label(self):
        """Add the name label to the button frame, backed by ``self.name_var``.

        The variable starts out with the placeholder text "Players name".
        """
        self.name_var = tk.StringVar(value="Players name")
        tk.Label(self.button_frame, textvariable=self.name_var).pack(side=tk.TOP)

    def create_label(self):
        """Add the label backed by ``self.label_var`` to the button frame.

        The variable starts out with placeholder text.
        """
        self.label_var = tk.StringVar(value="This is where the player will be")
        tk.Label(self.button_frame, textvariable=self.label_var).pack(side=tk.TOP)

    def predicted_label(self):
        """Add the label backed by ``self.label_var2`` to the button frame.

        The variable starts out with the placeholder "Predicted player stats".
        """
        self.label_var2 = tk.StringVar(value="Predicted player stats")
        tk.Label(self.button_frame, textvariable=self.label_var2).pack(side=tk.TOP)

    def real_label(self):
        """Add the label backed by ``self.label_var3`` to the button frame.

        The variable starts out with placeholder text.
        """
        self.label_var3 = tk.StringVar(
            value="Difference or whatever (figure this part out)")
        tk.Label(self.button_frame, textvariable=self.label_var3).pack(side=tk.TOP)

#all of these are just labels that are later meant to show the values of the chosen player, but for now they are populated with placeholder text

Now we can go back and explain how the image is added, but in order to do that we must first explain the helper functions that it uses. The first one is:

In [None]:
# This function uses the googleapiclient library in order to create an api app that searches for an image in google and returns a url to that image

def search_images(query):
    """Run a one-result image search for `query` and return the image URL.

    Uses the module-level ``api_key`` and ``cx`` (custom search engine id).

    Returns:
        The ``link`` of the first item in the response, or None when the
        response has no ``items`` key.
    """
    service = build('customsearch', 'v1', developerKey=api_key)

    # searchType='image' restricts the search to images; num=1 asks for one hit.
    request = service.cse().list(cx=cx, q=query, searchType='image', num=1)
    response = request.execute()

    if 'items' not in response:
        return None
    return response['items'][0]['link']

Now that we have a function that allows us to query images from the web we need a function that specifies what exactly we want:

In [None]:
#the query will follow search_images rules and search for player_name + basketball player online
def display_basketball_player_image(player_name):
    """Return the image URL found for "<player_name> basketball player".

    The result is whatever search_images returns for that query (a URL or None).
    """
    return search_images(player_name + ' basketball player')

# player_name = "LeBron James"
# print(display_basketball_player_image(player_name))
# example code that prints the url of a picture of Lebron James

Knowing this we can finally go back to the original function:

In [None]:
def display_player_image(self, player_name):
    """Show a 200x200 picture of `player_name`, or a fallback message.

    The image URL is looked up with display_basketball_player_image, then
    downloaded and placed in ``self.image_label``. The text shown by the name
    label is changed through ``self.name_var``; ``self.name_label`` is the
    method that builds that label, not a widget, so it cannot be configured.

    Args:
        player_name: name used for the image search and for the messages.

    Search-API errors, download errors and unreadable image data are caught:
    the image is cleared, an error text is shown and the error is printed.
    """
    # Imported locally: the quota/HTTP errors of the search API are raised as
    # googleapiclient.errors.HttpError, which may not be imported at file level.
    from googleapiclient.errors import HttpError

    try:
        # The lookup is inside the try so that API failures are handled too.
        image_url = display_basketball_player_image(player_name)
        if not image_url:
            # blank image is shown, with the text above
            self.image_label.configure(image="")
            self.name_var.set(f"No image found for {player_name}")
            return

        # A timeout keeps a stalled download from freezing the GUI forever.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))
        image = image.resize((200, 200))
        photo = ImageTk.PhotoImage(image)
    except (requests.RequestException, UnidentifiedImageError, HttpError) as e:
        self.image_label.configure(image="")
        self.name_var.set(f"Error loading image for {player_name}")
        print(f"An error occurred while loading the image: {e}")
        return

    # image is actually placed
    self.image_label.configure(image=photo)
    # Keep a reference on the widget so the photo is not garbage collected.
    self.image_label.image = photo
    self.name_var.set(player_name)

#Note: Error handling in this function is crucial for a multitude of reasons. Here are the possible problems that could happen:
#     1. There is no image online for a certain player_name
#     2. The image found online is in an incorrect format
#     3. The quota for the maximum number of queries has been exceeded and no more pictures can be queried
# These are just some of the problems that can happen when using the Google API


Next is the create_scatter_plot(). 

In [None]:
def create_scatter_plot(self):
    """Build the initial plot (PTS against G) in a fresh figure frame.

    The current ``self.fig_frame`` is destroyed and replaced, the figure from
    the module-level create_scatter_plot helper is embedded in it, and pick
    events on the canvas are forwarded to ``self.on_plot_pick``.

    Returns:
        The FigureCanvasTkAgg that displays the figure.
    """
    roster = self.data_manager.get_player_stats()
    # x and y always start with the same values: points (pts) and games (g)
    x = [float(entry.pts) for entry in roster]
    y = [float(entry.g) for entry in roster]

    # destroy the old plot frame and start from an empty one
    self.fig_frame.destroy()
    self.fig_frame = tk.Frame(self.root)
    self.fig_frame.pack(side=tk.LEFT)

    # inside a method this name resolves to the module-level helper
    fig = create_scatter_plot(x, y, roster, "PTS")

    canvas = FigureCanvasTkAgg(fig, master=self.fig_frame)
    canvas.draw()

    def forward_pick(event):
        # tie the pick event to on_plot_pick together with its canvas
        self.on_plot_pick(event, canvas)

    canvas.mpl_connect("pick_event", forward_pick)
    canvas.get_tk_widget().pack()

    return canvas

We omitted the most crucial part of the above function: the create_scatter_plot helper function. It creates the plot that is ultimately shown. It is a big function, so let's split it into parts:

In [None]:
def create_scatter_plot(x, y, data, selected_header):
    """Build the player scatter figure and attach its hover and click handlers.

    Args:
        x: x-coordinates, one per entry of ``data``.
        y: y-coordinates, one per entry of ``data`` (the axis is labelled "G").
        data: player objects in the same order as ``x``/``y``; the handlers
            read attributes such as ``player``, ``age`` and ``pts`` from them.
        selected_header: name of the stat on the x-axis; upper-cased for the
            axis label.

    Returns:
        The matplotlib figure that holds the scatter plot.

    The handlers update the module-level ``gui`` object, so that name has to
    be bound before the first hover or click happens.
    """
    fig, ax = plt.subplots()
    # picker=True makes the points pickable, so a "pick_event" handler
    # connected to the canvas is actually triggered.
    scatter = ax.scatter(x, y, picker=True)
    ax.set_xlabel(selected_header.upper())  # update x-axis label
    ax.set_ylabel("G")
    ax.set_title("NBA Player Scatter Plot")

    # create cursor that displays the name on hover; it is bound to this
    # scatter only, so that sel.index always refers to an entry of `data`
    cursor = mplcursors.cursor(scatter, hover=True)

    # set font to Arial, this is done to avoid warnings
    prop = font_manager.FontProperties(family='Arial')

    # Now comes the interesting part of the function: the @cursor.connect handler.
    @cursor.connect("add")
    def on_hover(sel):
        player = data[sel.index]

        name = player.player
        age = player.age
        g = player.g
        fg = player.fg
        fga = player.fga
        three_p = player.three_p
        three_pa = player.three_pa
        three_pct = player.three_pct
        ft = player.ft
        fta = player.fta
        trb = player.trb
        ast = player.ast
        stl = player.stl
        blk = player.blk
        tov = player.tov
        pts = player.pts

        # On hover a player is chosen; the first label shows the values
        # stored for that player in the loaded csv file.
        sel.annotation.set_text(f"{name}, {age}")
        sel.annotation.set_fontproperties(prop)
        gui.name_var.set(f"{name}, {age}")
        gui.label_var.set(f"PTS: {pts}, G: {g}, FG: {fg}, FGA: {fga},   3P: {three_p}, 3PA: {three_pa}, 3P%: {three_pct}\n"
                          f"FT: {ft}, FTA: {fta}, TRB: {trb}, AST: {ast}, STL: {stl}, BLK: {blk}, TOV: {tov}")

        # The name is passed to get_sus_player, which returns the matching
        # rows of the projection file; they are shown in the second label.
        predicted_rows = get_sus_player(name)
        if predicted_rows:
            # A name can match several rows; the last one is used.
            row = predicted_rows[-1]
            zg = row["G"]
            zpts = row["PTS"]
            ztrb = row["TRB"]
            zast = row["AST"]
            zstl = row["STL"]
            zblk = row["BLK"]
            ztov = row["TOV"]
            zfga = row["FGA"]
            zfg = row["FG"]
            zfta = row["FTA"]
            zft = row["FT"]
            zthree_pa = row["3PA"]
            zthree_p = row["3P"]
            # The projection file has no 3P% column (its PER column is a
            # different quantity), so the percentage is derived from 3P/3PA.
            # NOTE(review): assumes the season csv stores 3P% as the fraction
            # 3P / 3PA - confirm against the data.
            if zthree_pa:
                zthree_pct = round(zthree_p / zthree_pa, 3)
            else:
                zthree_pct = float("nan")
            gui.label_var2.set(f"PTS: {zpts}, G: {zg}, FG: {zfg}, FGA: {zfga}, 3P: {zthree_p}, 3PA: {zthree_pa}, 3P%: {zthree_pct}\n"
                               f"FT: {zft}, FTA: {zfta}, TRB: {ztrb}, AST: {zast}, STL: {zstl}, BLK: {zblk}, TOV: {ztov}")

            # The third label is the difference between the actual and the
            # predicted stats, which shows how accurate the prediction is.
            dg = g - zg
            dpts = pts - zpts
            dtrb = trb - ztrb
            dast = ast - zast
            dstl = stl - zstl
            dblk = blk - zblk
            dtov = tov - ztov
            dfga = fga - zfga
            dfg = fg - zfg
            dfta = fta - zfta
            dft = ft - zft
            dthree_pa = three_pa - zthree_pa
            dthree_p = three_p - zthree_p
            dthree_pct = three_pct - zthree_pct
            gui.label_var3.set(f"PTS: {dpts:.2f}, G: {dg:.2f}, FG: {dfg:.2f}, FGA: {dfga:.2f}, 3P: {dthree_p:.2f}, 3PA: {dthree_pa:.2f}, 3P%: {dthree_pct:.2f}\n"
                               f"FT: {dft:.2f}, FTA: {dfta:.2f}, TRB: {dtrb:.2f}, AST: {dast:.2f}, STL: {dstl:.2f}, BLK: {dblk:.2f}, TOV: {dtov:.2f}")
        else:
            # Without a projection row there is nothing to compare; say so
            # instead of failing on unbound values or showing stale numbers.
            gui.label_var2.set(f"No predicted stats found for {name}")
            gui.label_var3.set(f"No difference available for {name}")

        # gui is updated so that it shows the correct player image (if possible)
        gui.display_player_image(name)

        # The url variable is only updated when the GUI actually has one;
        # this also avoids a second image search when nothing would use it.
        url_var = getattr(gui, "url", None)
        if url_var is not None:
            url_var.set(display_basketball_player_image(name))

        # get_player_info(name) is deliberately not called here: its result
        # was never displayed, and it re-reads every season csv per hover.

    def on_click(event):
        # A button_press_event carries no point index; ask the scatter which
        # point (if any) lies under the mouse.
        if event.button != 1 or event.inaxes is not ax:
            return
        hit, details = scatter.contains(event)
        if not hit:
            return
        index = details["ind"][0]
        name = data[index].player
        gui.label_var.set(f"Selected player: {name}")

    fig.canvas.mpl_connect("button_press_event", on_click)

    return fig


As the final part of the scatter graph we have:

In [None]:
def on_plot_pick(self, event, canvas):
    """Show the picked player's name in ``self.label_var`` and redraw `canvas`.

    Args:
        event: pick event; the first entry of ``event.ind`` is used as the
            index into the data manager's player list.
        canvas: canvas that is redrawn after the label changed.
    """
    first_hit = event.ind[0]
    picked = self.data_manager.get_player_stats()[first_hit]
    self.label_var.set(f"Selected player: {picked.player}")
    canvas.draw()
#this should work, however on_plot_pick has an underlying error, as described in the final part of the create_scatter_plot helper function

Now we add the role_menu and the date_menu. They both work in a similar way: they update the graph based on the chosen value. role_menu changes the attribute that is plotted against the games played, and date_menu changes the file from which we read the data.

In [None]:
def create_role_menu(self):
        """Add the drop-down that selects which attribute is plotted on the x-axis.

        The options are stored in ``self.header_names`` and the selection
        lives in ``self.header_var``.
        """
        header_names = [
            'rk', 'age', 'g', 'gs', 'mp', 'fg', 'fga',
            'fg_pct', 'three_p', 'three_pa', 'three_pct', 'two_p', 'two_pa', 'two_pct', 'efg_pct',
            'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk',
            'tov', 'pf', 'pts'
        ]
        #options to choose from

        self.header_names = header_names
        # The initial plot shows PTS on the x-axis, so 'pts' is preselected;
        # otherwise the menu would show 'rk' while the graph shows points.
        self.header_var.set('pts')
        option_menu = tk.OptionMenu(
            self.button_frame, self.header_var, *header_names)
        option_menu.pack(side=tk.TOP)

    def create_date_menu(self):
        """Add the season drop-down; choosing a season reloads the data and the plot.

        The options are stored in ``self.dates`` and the selection lives in
        ``self.dates_var``, which starts at the first season.
        """
        # The two lists are parallel: dates[i] is read from date_files[i].
        dates = ['22/23', '21/22', '20/21']
        date_files = [
            'NBA player 2022-2023.csv',
            'NBA player 2021-2022.csv',
            'NBA player 2020-2021.csv',
        ]

        self.dates = dates
        self.dates_var = tk.StringVar(value=dates[0])

        def on_date_select(*args):
            # swap in a data manager for the chosen season and redraw
            chosen_file = date_files[dates.index(self.dates_var.get())]
            self.data_manager = DataManager(chosen_file)
            self.data_manager.load_data()
            self.update_plot()

        tk.OptionMenu(
            self.button_frame, self.dates_var, *dates, command=on_date_select
        ).pack(side=tk.TOP)

    def create_confirm_button(self):
        """Add the "Confirm" button, which calls ``self.update_plot`` when pressed."""
        # The role menu is applied through this separate button, to show both
        # update-on-select (date menu) and update-on-press.
        tk.Button(
            self.button_frame, text="Confirm", command=self.update_plot
        ).pack(side=tk.TOP)

Of course this file would not be complete without update_plot():

In [None]:
def update_plot(self):
    """Redraw the scatter plot with the attribute selected in ``self.header_var``.

    The selected attribute is plotted on the x-axis against games played.
    The old figure frame is destroyed and a new frame, figure and canvas are
    built; as for the initial plot, pick events on the new canvas are
    forwarded to ``self.on_plot_pick``.
    """
    selected_header = self.header_var.get()
    players = self.data_manager.get_player_stats()

    x = [float(getattr(player, selected_header.lower()))
         for player in players]
    y = [float(player.g) for player in players]

    # Figures made through pyplot stay registered until they are closed, so
    # the figure drawn by the previous call is released first.
    stale_fig = getattr(self, "_plot_fig", None)
    if stale_fig is not None:
        plt.close(stale_fig)

    # Destroy the old plot if it exists
    self.fig_frame.destroy()
    self.fig_frame = tk.Frame(self.root)
    self.fig_frame.pack(side=tk.LEFT)

    fig = create_scatter_plot(x, y, players, selected_header)
    self._plot_fig = fig

    # Create a canvas to display the plot
    canvas = FigureCanvasTkAgg(fig, master=self.fig_frame)
    canvas.draw()
    # Keep picking behaviour consistent with the initial plot.
    canvas.mpl_connect("pick_event", lambda event: self.on_plot_pick(
        event, canvas))
    canvas.get_tk_widget().pack()
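#this basically does the same as create_scatter_plot, but it plots the attribute selected in the role menu on the x-axis; note that it also builds a new frame, figure and canvas rather than changing the existing one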

These are the helper functions that were mentioned previously but not explained properly.

In [None]:
def get_player_info(player_name):
    """Collect every row named `player_name` from the three season CSV files.

    Returns:
        A list of row Series, in file order (2022-2023 first); empty when the
        name occurs in none of the files.
    """
    season_files = ('NBA player 2022-2023.csv',
                    'NBA player 2021-2022.csv',
                    'NBA player 2020-2021.csv')
    found = []

    for season_file in season_files:
        season = pd.read_csv(season_file)
        same_name = season['Player'] == player_name
        found.extend(row for _, row in season[same_name].iterrows())

    return found
#gets the info of a player from all the listed csv files and returns a list of players matching the name


def get_sus_player(player_name):
    """Return the rows of 'pure_efficiency_stats_22_23.csv' named `player_name`.

    Returns:
        A list of row Series in file order; empty when the name is not found.
    """
    projections = pd.read_csv('pure_efficiency_stats_22_23.csv')
    same_name = projections['Player'] == player_name
    return [row for _, row in projections[same_name].iterrows()]
# gets the projected value of a player based on the output of the computation in the projection file (this can be found in "pure_efficiency_stats_22_23.csv")


And finally:

In [None]:
if __name__ == "__main__":
    # Create the Tk root and load the default season before the GUI is built,
    # because GUI.__init__ draws the first plot from the data manager.
    root = tk.Tk()
    data_manager = DataManager("NBA player 2022-2023.csv")
    data_manager.load_data()

    # `gui` has to stay a module-level name: the hover and click handlers in
    # the create_scatter_plot helper look it up as a global.
    gui = GUI(root, data_manager)

    # Route the window-manager close button to gui.on_closing.
    root.protocol("WM_DELETE_WINDOW", gui.on_closing)

    root.mainloop()
#main loop that runs the program

Now onto the documentation for the projection file

import csv
import pandas as pd
from sklearn.linear_model import LinearRegression

#We start by importing what we are going to use: csv, because we are working with csv files; pandas, so that we can manipulate the data inside those csv files and output it into separate files; and finally LinearRegression from the sklearn library, which fits an ordinary least-squares linear model (it is not a neural network).

Then, we move into the part where we select the required data from the csv file.
We calculate PER (pure efficiency rating) by the given formula:
PER = (PTS + TRB + AST + STL + BLK - Missed_FG - Missed_FT - TOV) / G, where Missed_FG = FGA - FG and Missed_FT = FTA - FT

In [None]:
# 2020-2021 season: compute the efficiency rating and write it next to the
# columns it is computed from.
df = pd.read_csv('NBA player 2020-2021.csv')

per = df.loc[:, ['Player', 'G', 'PTS', 'TRB', 'AST', 'STL',
                 'BLK', 'TOV', 'FGA', 'FG', 'FTA', 'FT', '3PA', '3P']]

# Same left-to-right evaluation order as the single-expression form.
positive_total = per['PTS'] + per['TRB'] + per['AST'] + per['STL'] + per['BLK']
missed_fg = per['FGA'] - per['FG']
missed_ft = per['FTA'] - per['FT']
per['PER'] = (
    (positive_total - missed_fg - missed_ft - per['TOV']) / per['G']
).round(2)

per.to_csv('pure_efficiency_stats_20_21.csv', index=False)


This is then repeated for all 3 files

In [None]:
# 2021-2022 season: same computation as for the first file.
df_2 = pd.read_csv('NBA player 2021-2022.csv')

per_2 = df_2.loc[:, ['Player', 'G', 'PTS', 'TRB', 'AST',
                     'STL', 'BLK', 'TOV', 'FGA', 'FG', 'FTA', 'FT', '3PA', '3P']]

# Same left-to-right evaluation order as the single-expression form.
positive_total_2 = (per_2['PTS'] + per_2['TRB'] + per_2['AST']
                    + per_2['STL'] + per_2['BLK'])
missed_fg_2 = per_2['FGA'] - per_2['FG']
missed_ft_2 = per_2['FTA'] - per_2['FT']
per_2['PER'] = (
    (positive_total_2 - missed_fg_2 - missed_ft_2 - per_2['TOV']) / per_2['G']
).round(2)

per_2.to_csv('pure_efficiency_stats_21_22.csv', index=False)

# 2022-2023 season: same computation as for the other two files.
df_3 = pd.read_csv('NBA player 2022-2023.csv')

per_3 = df_3.loc[:, ['Player', 'G', 'PTS', 'TRB', 'AST',
                     'STL', 'BLK', 'TOV', 'FGA', 'FG', 'FTA', 'FT', '3PA', '3P']]

# Same left-to-right evaluation order as the single-expression form.
positive_total_3 = (per_3['PTS'] + per_3['TRB'] + per_3['AST']
                    + per_3['STL'] + per_3['BLK'])
missed_fg_3 = per_3['FGA'] - per_3['FG']
missed_ft_3 = per_3['FTA'] - per_3['FT']
per_3['PER'] = (
    (positive_total_3 - missed_fg_3 - missed_ft_3 - per_3['TOV']) / per_3['G']
).round(2)

# Disabled manual adjustments, kept for reference:
# per_3['PTS'] += 1
# per_3['STL'] *= 1.2
# per_3['STL'] = per_2['STL'].round(2)
# per_3['TOV'] *= 0.9
# per_3['TOV'] = per_2['TOV'].round(2)
# per_3['FT'] *= 0.8
# per_3['FT'] = per_2['FT'].round(2)
# per_3['BLK'] *= 1.2
# per_3['BLK'] = per_2['BLK'].round(2)
# per_3['AST'] *= 1.1
# per_3['AST'] = per_2['AST'].round(2)
# per_3['G'] -= 1

per_3.to_csv('pure_efficiency_stats_22_23.csv', index=False)


Then we pass the first two files as the training set, and the last file as the test set.
Then we fit a linear regression on the training data set and predict the projected PER rating for each player. We then compare the predicted PER rating with the actual PER rating in the latest file.

In [None]:
# The two older seasons form the training set; the newest one is the test set.
train_data_1 = pd.read_csv('pure_efficiency_stats_20_21.csv')
train_data_2 = pd.read_csv('pure_efficiency_stats_21_22.csv')
train_data = pd.concat([train_data_1, train_data_2], ignore_index=True)
test_data = pd.read_csv('pure_efficiency_stats_22_23.csv')

# The player name is not a numeric feature, so it is dropped from both sets.
train_data = train_data.drop(columns='Player')
test_data = test_data.drop(columns='Player')

# PER is the target; every remaining column is a feature.
X_train, y_train = train_data.drop(columns='PER'), train_data['PER']
X_test, y_test = test_data.drop(columns='PER'), test_data['PER']

# Train a linear regression model and predict PER for the test season.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison)

comparison.to_csv('final_results.csv', index=False)


# Read the (Actual, Predicted) pairs and keep Predicted - Actual per row.
with open('final_results.csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row
    differences = [float(predicted) - float(actual)
                   for actual, predicted, *_ in reader]

# Each difference becomes a scale factor of 1.0 + difference.
modified_differences = [1.0 + diff if False else diff + 1.0 for diff in differences]

# Read the 22/23 stats and scale every value of a row by that row's factor.
with open('pure_efficiency_stats_22_23.csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    header = next(reader)  # Skip header row
    data = list(reader)

    # Column 0 is the non-numeric 'Player' name and 'G' stays unscaled.
    g_column_index = header.index('G')
    untouched = {0, g_column_index}

    for i, row in enumerate(data):
        factor = modified_differences[i]
        for j, cell in enumerate(row):
            if j in untouched:
                continue
            row[j] = round(float(cell) * factor, 2)

# Write the updated data back to a new CSV file
with open('updated_per_stats_22_23.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(data)


Thank you for your attention :)