# Libraries, Helper Functions, and Raw Data

In [1]:
import re
import glob
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup as bs
import heapq

get_player_id extracts unique player IDs to append to the dataset and use for the graph

In [2]:
def get_player_id(year):
    '''
    Scrape the Basketball Reference season-totals page for a year and
    collect the player id attached to each data row.
    @param int year: season to look up (substituted into the page URL)
    @rtype list: the 'data-append-csv' value of the first <td> cell of
                 every row that has at least one <td> cell
    '''
    page_url = 'http://www.basketball-reference.com/leagues/NBA_{}_totals.html'.format(year)
    html = requests.get(page_url).text
    totals_table = bs(html, 'lxml').find_all('table')[0]

    id_list = []
    # The first <tr> is skipped; rows without any <td> cell are ignored
    for row in totals_table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) > 0:
            id_list.append(cells[0]['data-append-csv'])
    return id_list

All of the csv files are stored in the Data folder inside the current working directory, and glob returns a list of filepaths to iterate over

In [16]:
# Directory holding the scraped per-season CSV files. It is assembled with
# os.path.join so the separator is correct on every platform; a hand-written
# backslash only forms a sub-path on Windows and "\D" is an invalid escape.
path = os.path.join(os.getcwd(), "Data")

# Every CSV file directly inside that directory
files = glob.glob(os.path.join(path, "*.csv"))

# Main Pipeline

1. Iterates over all the files collected by the scraper
2. Year is extracted from the filename; the graph algorithm down the road depends on a parameter that 
   is a string of the year and team concatenated together.
3. Year is also used to get player IDs for the year which is then added as a column to the data.
4. Players who have played 0 games that season are dropped from the data.
5. All columns except: Games played; Player ID; Player name; Year; and Team are dropped.

<br>
<br>

Basketball reference adds multiple rows for traded players. First there are n rows, one for each of the 
n teams they played for in the season, with their stats for each team. Then there's a TOT (total) row 
which sums all of their stats for the season, creating a total of n+1 rows where n is the number of 
teams played for.

<br>

6. First, the TOT row is dropped.

<br>

7. The last step is cutting off asterisks that get stuck on to player names when scraping from 
   Basketball Reference. E.g. Kareem Abdul-Jabbar is scraped and recorded as Kareem Abdul-Jabbar*

In [17]:
# Clean every scraped season file in place: tag rows with the season year and
# player id, drop players with no games and the aggregate "TOT" rows, cut the
# asterisk suffix from names, and write the result back to the same path.
for file in files:
    data = pd.read_csv(file)

    # The year is the text between the last "_" in the path and the next "."
    year = file.split('_')[-1].split('.')[0]
    data["Year"] = year

    # IDs are attached to rows purely by position, so the counts must agree;
    # fail with a message that names the offending file.
    primary_list = get_player_id(year)
    if len(primary_list) != len(data):
        raise ValueError(
            "{}: scraped {} player ids for {} rows".format(
                file, len(primary_list), len(data)))
    data["ID"] = primary_list

    # Filter rows and columns in one step and take an explicit copy, so the
    # column assignment below writes to an independent frame instead of a
    # slice of the original (which triggers a chained-assignment warning).
    keep = (data["G"] != 0) & (data["Tm"] != "TOT")
    data = data.loc[keep, ["Player", "Tm", "G", "Year", "ID"]].copy()

    # Keep only the text before the first asterisk in each name
    data["Player"] = data["Player"].apply(lambda x: x.split("*")[0])

    # NOTE: to_csv is called without index=False, so the row index is written
    # as an extra unnamed first column; kept to preserve the file layout.
    data.to_csv(file)

## Primary Dataframe

Iterate over the newly processed files, read them in, and add them to a list.

In [18]:
# Load every processed season file back into memory, one frame per file
dataframes = [pd.read_csv(csv_path) for csv_path in files]

The list of dataframes is used to concatenate them all together into one large dataframe.

In [19]:
# Stack the per-season frames into one table (each frame keeps its own row
# labels) and save the combined table under a relative path.
mega_frame = pd.concat(objs=dataframes)
mega_frame.to_csv(path_or_buf="Full-table.csv")

# Creating the Graph

Tm_year column is created in the large dataframe at this point, as indicated previously

In [20]:
# Team-season key: the team code followed by the year rendered as text
mega_frame['Tm_year'] = mega_frame['Tm'] + list(map(str, mega_frame['Year']))

Creating a dictionary where the keys are Tm_year and the values are lists of the IDs of the players who played for a specific team in a given year.

In [21]:
# Map each team-season key to the IDs of the players on that roster, keys and
# IDs both in first-appearance order. One pass over the rows replaces a full
# scan of the frame per key; no throwaway dictionary of the whole frame is
# built, since nothing consumed it.
tm_year_ids = {}

for tm_year, player_id in zip(mega_frame["Tm_year"], mega_frame["ID"]):
    tm_year_ids.setdefault(tm_year, []).append(player_id)

id_name stores a pairing of a player ID to their actual name. inv_map reverses the key, value pairing in id_name.

<br>

Both dictionaries are used as lookup tables

In [22]:
# Player ID -> player name, taking the name from the first row in which the
# ID appears. A single pass replaces one full scan of the frame per ID.
id_name = {}

for player_id, player_name in zip(mega_frame["ID"], mega_frame["Player"]):
    id_name.setdefault(player_id, player_name)

# Player name -> player ID. When several IDs share a name, the ID that comes
# last in id_name overwrites the earlier ones.
inv_map = {name: player_id for player_id, name in id_name.items()}

Pairs a player ID with a set of all their direct teammates ever. 

<br>

I'm not enthused about writing in n^3 complexity but it's a n^2 problem and the dataset is small so it doesn't really matter.

In [23]:
# Player ID -> set of every player who shared a team-season roster with them.
# Every known player gets an entry, even if it stays empty.
teammates = {player_id: set() for player_id in id_name}

# Walk each roster once instead of testing every player against every roster
for roster in tm_year_ids.values():
    members = set(roster)
    for player_id in members:
        # Only players present in id_name are keys; others are skipped here
        # but can still appear as somebody else's teammate.
        if player_id in teammates:
            # A player is never listed as their own teammate
            teammates[player_id].update(members - {player_id})

### Final Graph Representation

<br>

The previous teammates dictionary paired players to a list of their direct teammates. new_teammates pairs player IDs to a dictionary of their teammates, where the teammates are paired to the number 1. This is done to facilitate the running of the graph algorithm in the next cell.

In [24]:
# Adjacency map for the graph search: every teammate link gets weight 1
new_teammates = {
    player_id: {mate: 1 for mate in mates}
    for player_id, mates in teammates.items()
}

### Graph Algorithm

<br>

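Calculates the Carter Number of each player: the length of the shortest chain of teammates linking them to the starting player, where every teammate link has weight 1.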

In [25]:
def calculate_distances(graph, starting_vertex):
    '''
    Shortest-path distance from starting_vertex to every vertex of graph
    (Dijkstra's algorithm with a binary heap).
    @param dict graph: maps each vertex to a dict {neighbor: edge weight};
                       weights are expected to be non-negative
    @param starting_vertex: the vertex distances are measured from
    @rtype dict: vertex -> distance; vertices that cannot be reached keep
                 float('infinity')
    '''
    distances = {vertex: float('infinity') for vertex in graph}
    distances[starting_vertex] = 0

    # Heap entries are (distance, push order, vertex). The push order breaks
    # ties so that vertices themselves are never compared with each other.
    pushes = 0
    pq = [(0, pushes, starting_vertex)]

    while pq:
        current_distance, _, current_vertex = heapq.heappop(pq)

        # A shorter route to this vertex was already processed; this entry is
        # a stale leftover. Entries are never edited in place, because that
        # would silently break the heap ordering.
        if current_distance > distances[current_vertex]:
            continue

        for neighbor, neighbor_distance in graph[current_vertex].items():
            distance = current_distance + neighbor_distance
            if distance < distances[neighbor]:
                distances[neighbor] = distance
                pushes += 1
                heapq.heappush(pq, (distance, pushes, neighbor))

    return distances

# Carter Number Dataframe

<br>

The Carter Number is calculated for all players. Then the dictionary is shaped into a dataframe and a new column with player names is added to facilitate comparisons.

In [26]:
# Distance from the player with ID 'cartevi01' to every player in the graph
carter_number = calculate_distances(
    graph=new_teammates,
    starting_vertex='cartevi01',
)

In [27]:
# One row per player ID, with the distance in the single column labelled 0
carter_frame = pd.DataFrame.from_dict(carter_number, orient="index")

# Look up each ID's display name; an ID missing from id_name raises KeyError
carter_frame["Name"] = [id_name[player_id] for player_id in carter_frame.index]

In [28]:
# Give the distance column a readable label, then save the table
carter_frame = carter_frame.rename({0: "Carter Number"}, axis="columns")
carter_frame.to_csv(path_or_buf="Carter_Number.csv")

In [29]:
import json

# Persist the raw ID -> distance mapping as JSON.
# NOTE(review): players that were never reached keep float('infinity'), which
# json writes as the non-standard token Infinity; confirm consumers accept it.
with open('carter.json', 'w') as out_file:
    out_file.write(json.dumps(carter_number))