In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from collections import OrderedDict

#remove warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#define a coder and decoder for string to int and int to string
coder = OrderedDict()
decoder = OrderedDict()

def encode(s):
    if s not in coder:
        coder[s] = len(coder)
        decoder[len(decoder)] = s
    return coder[s]

def decode(i):
    return decoder[i]

#encode playerID, teamID
def encode_player_team(df):
    if('playerID' in df.columns):
        df['playerID'] = df['playerID'].apply(encode)
    if('tmID' in df.columns):
        df['tmID'] = df['tmID'].apply(encode)
    if('bioID' in df.columns):
        df['bioID'] = df['bioID'].apply(encode)
    if('college' in df.columns):
        df['college'] = df['college'].apply(encode)
    if 'confID' in df.columns:
        df['confID'] = df['confID'].apply(encode)
    return df


#get player teams into dataframe from player_teams.csv
player_teams = pd.read_csv('new_data/data.csv')

#get player teams into dataframe from player_teams.csv
coaches = pd.read_csv('../data/coaches.csv')

#teams match up results (only post is available)
series_post = pd.read_csv('../data/series_post.csv')

#team stats in playoff
teams_post = pd.read_csv('../data/teams_post.csv')

#player csv
players = pd.read_csv('../data/players.csv')

#awards csv
awards_players = pd.read_csv('../data/awards_players.csv')

#teams csv
teams = pd.read_csv('../data/teams.csv')

#comp csv
comp = pd.read_csv('new_data/comp.csv')

#remove all columns starting with "lgID" from all dataframes
for df in [player_teams, coaches, series_post, teams_post, players, awards_players, teams]:
    for col in df.columns:
        if col.startswith('lgID'):
            df.drop(col, axis=1, inplace=True)

#drop divID column from teams
teams.drop('divID', axis=1, inplace=True)

#drop all stint > 1 from player_teams
#player_teams = player_teams[player_teams['stint'] == 0]

print(teams['confID'].unique())

['EA' 'WE']


In [25]:
#get columns with NaN values or empty values for all dataframes
for df in [player_teams, coaches, series_post, teams_post, players, awards_players, teams]:
    print(df.columns[df.isna().any()].tolist())
    print(df.columns[df.isnull().any()].tolist())
    print(df.columns[df.eq('').any()].tolist())

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['pos', 'college', 'collegeOther']
['pos', 'college', 'collegeOther']
[]
[]
[]
[]
[]
[]
[]


In [3]:
def compute_percentage(numerator, denominator):
    return round(numerator.divide(denominator).where(denominator != 0, 0.0),2)

In [4]:
# Assuming you have a column 'year' to sort by
player_teams = player_teams.sort_values(by=['playerID', 'year'])

# Regular Season Percentages
player_teams['ft%'] = compute_percentage(player_teams['ftMade'], player_teams['ftAttempted'])
player_teams['fg%'] = compute_percentage(player_teams['fgMade'], player_teams['fgAttempted'])
player_teams['three%'] = compute_percentage(player_teams['threeMade'], player_teams['threeAttempted'])
player_teams['gs%'] = compute_percentage(player_teams['GS'], player_teams['GP'])

# Playoffs Percentages
# player_teams['Postft%'] = compute_percentage(player_teams['PostftMade'], player_teams['PostftAttempted'])
# player_teams['Postfg%'] = compute_percentage(player_teams['PostfgMade'], player_teams['PostfgAttempted'])
# player_teams['Postthree%'] = compute_percentage(player_teams['PostthreeMade'], player_teams['PostthreeAttempted'])
# player_teams['Postgs%'] = compute_percentage(player_teams['PostGS'], player_teams['PostGP'])

#effective field goal percentage
player_teams['efg%'] = compute_percentage(player_teams['fgMade'] + 0.5 * player_teams['threeMade'], player_teams['fgAttempted']) * 2

#true shooting percentage
player_teams['ts%'] = compute_percentage(player_teams['points'], 2 * (player_teams['fgAttempted'] + 0.44 * player_teams['ftAttempted'])) * 2

#per game stats
#if pos contains G, then multiply by assists by 1.5, if contains C, then multiply by rebounds by 1.5, if contains F, then points by 1.5 (it may contain more than one letter ex: G-F )
player_teams['ppg'] = round(player_teams['points']/player_teams['GP'],2)
player_teams['rpg'] = round(player_teams['rebounds']/player_teams['GP'],2)
player_teams['apg'] = round(player_teams['assists']/player_teams['GP'],2)
player_teams['spg'] = round(player_teams['steals']/player_teams['GP'],2)
player_teams['bpg'] = round(player_teams['blocks']/player_teams['GP'],2)

#efficiency
player_teams['eff'] = player_teams['ppg'] + player_teams['rpg'] + player_teams['apg'] + player_teams['spg'] + player_teams['bpg'] - (player_teams['fgAttempted'] - player_teams['fgMade']) - (player_teams['ftAttempted'] - player_teams['ftMade']) - player_teams['turnovers']

#per 36 minutes stats
player_teams['pp36'] = compute_percentage(player_teams['points'], player_teams['minutes'])*36

#defensive prowess: Defensive Prowess PCA: Use 'steals', 'blocks', and 'dRebounds' to create a 'Defensive Impact' principal component. Combine 'PF' (personal fouls) and 'turnovers' into a 'Defensive Discipline' component to represent careful play.
player_teams['defensive_prowess'] = compute_percentage(player_teams['steals'] + player_teams['blocks'] + player_teams['dRebounds'], player_teams['GP'])*10
player_teams['defensive_discipline'] = compute_percentage(player_teams['PF'] + player_teams['turnovers'], player_teams['GP'])*2

#minutes per game
player_teams['mpg'] = compute_percentage(player_teams['minutes'], player_teams['GP'])

#percetange of game started
player_teams['gs%'] = compute_percentage(player_teams['GS'], player_teams['GP'])
    

#add pos column from players to player_teams, bioID is the same as playerID
player_teams = player_teams.merge(players[['bioID', 'pos', 'college']], left_on='playerID', right_on='bioID', how='left')
player_teams.drop('bioID', axis=1, inplace=True)

player_teams['pos'] = player_teams['pos'].replace(
    ['G', 'F', 'C', 'C-F', 'F-C', 'G-F', 'F-G'],
    [1, 2, 3, 4, 4, 5, 5]
)

#turn this oRebounds,dRebounds,dq,PostMinutes,PostPoints,PostoRebounds,PostdRebounds,PostRebounds,PostAssists,PostSteals,PostBlocks,PostTurnovers,PostPF,PostDQ into per game stats
player_teams['oRebounds'] = round(player_teams['oRebounds']/player_teams['GP'],2)
player_teams['dRebounds'] = round(player_teams['dRebounds']/player_teams['GP'],2)
player_teams['dq'] = round(player_teams['dq']/player_teams['GP'],2)
player_teams['PostMinutes'] = round(player_teams['PostMinutes']/player_teams['PostGP'],2)
player_teams['PostPoints'] = round(player_teams['PostPoints']/player_teams['PostGP'],2)
player_teams['PostoRebounds'] = round(player_teams['PostoRebounds']/player_teams['PostGP'],2)
player_teams['PostdRebounds'] = round(player_teams['PostdRebounds']/player_teams['PostGP'],2)
player_teams['PostRebounds'] = round(player_teams['PostRebounds']/player_teams['PostGP'],2)
player_teams['PostAssists'] = round(player_teams['PostAssists']/player_teams['PostGP'],2)
player_teams['PostSteals'] = round(player_teams['PostSteals']/player_teams['PostGP'],2)
player_teams['PostBlocks'] = round(player_teams['PostBlocks']/player_teams['PostGP'],2)
player_teams['PostTurnovers'] = round(player_teams['PostTurnovers']/player_teams['PostGP'],2)
player_teams['PostPF'] = round(player_teams['PostPF']/player_teams['PostGP'],2)
player_teams['PostDQ'] = round(player_teams['PostDQ']/player_teams['PostGP'],2)

#make one group stat for post season
player_teams['stats_post'] = player_teams['PostPoints'] + player_teams['PostRebounds'] + player_teams['PostAssists'] + player_teams['PostSteals'] + player_teams['PostBlocks'] - (player_teams['PostfgAttempted'] - player_teams['PostfgMade']) - (player_teams['PostftAttempted'] - player_teams['PostftMade']) - player_teams['PostTurnovers']

#divide the PostStat by PostMinutes to get the PostStat per minute
player_teams['stats_post'] = compute_percentage(player_teams['stats_post'], player_teams['PostMinutes'])

#drop the columns that are not needed: stint, PostoRebounds, PostdRebounds, PostRebounds, PostAssists, PostSteals, PostBlocks, PostTurnovers, PostPF, PostDQ, PostfgMade, PostfgAttempted, PostftMade, PostftAttempted, PostthreeMade, PostthreeAttempted, PostGS, PostGP, PostMinutes, PostPoints
player_teams.drop(['PostoRebounds', 'PostdRebounds', 'PostRebounds', 'PostAssists', 'PostSteals', 'PostBlocks', 'PostTurnovers', 'PostPF', 'PostDQ', 'PostfgMade', 'PostfgAttempted', 'PostftMade', 'PostftAttempted', 'PostthreeMade', 'PostthreeAttempted', 'PostGS', 'PostGP', 'PostMinutes'], axis=1, inplace=True)

#remove the made and attempted columns
player_teams.drop([ 'threeMade', 'threeAttempted'], axis=1, inplace=True)

#remove the columns that are not needed: points, rebounds, assists, steals, blocks, turnovers, PF, fgMade, fgAttempted, ftMade, ftAttempted, GS, GP, minutes
player_teams.drop([ 'points', 'rebounds', 'assists', 'steals', 'blocks', 'turnovers', 'PF', 'fgMade', 'fgAttempted', 'ftMade', 'ftAttempted', 'GS', 'minutes'], axis=1, inplace=True)

#fill nan with 0
player_teams.fillna(0, inplace=True)

player_teams.head()
player_teams.columns

Index(['playerID', 'year', 'stint', 'tmID', 'GP', 'oRebounds', 'dRebounds',
       'dq', 'PostPoints', 'ft%', 'fg%', 'three%', 'gs%', 'efg%', 'ts%', 'ppg',
       'rpg', 'apg', 'spg', 'bpg', 'eff', 'pp36', 'defensive_prowess',
       'defensive_discipline', 'mpg', 'pos', 'college', 'stats_post'],
      dtype='object')

In [5]:
coaches['total_games'] = coaches['won']+ coaches['lost']
coaches['W%'] = compute_percentage(coaches['won'],coaches['total_games'])

coaches['total_p_games'] = coaches['post_wins']+ coaches['post_losses']
coaches['postW%'] = compute_percentage(coaches['post_wins'],coaches['total_p_games'])

#remove won and lost columns, and post_wins and post_losses
coaches.drop(['won','lost','post_wins','post_losses'], axis=1, inplace=True)

coaches.head()

Unnamed: 0,coachID,year,tmID,stint,total_games,W%,total_p_games,postW%
0,adamsmi01w,5,WAS,0,34,0.5,3,0.33
1,adubari99w,1,NYL,0,32,0.62,7,0.57
2,adubari99w,2,NYL,0,32,0.66,6,0.5
3,adubari99w,3,NYL,0,32,0.56,8,0.5
4,adubari99w,4,NYL,0,34,0.47,0,0.0


In [6]:
#aggregate coaches by tmID and year
coaches = coaches.groupby(['tmID','year']).agg({'W%':'mean','postW%':'mean'}).reset_index()

In [7]:
#add coach stats to player_teams by tmID and year
player_teams = player_teams.merge(coaches, left_on=['tmID','year'], right_on=['tmID','year'], how='left')
#drop tmID
player_teams.columns

Index(['playerID', 'year', 'stint', 'tmID', 'GP', 'oRebounds', 'dRebounds',
       'dq', 'PostPoints', 'ft%', 'fg%', 'three%', 'gs%', 'efg%', 'ts%', 'ppg',
       'rpg', 'apg', 'spg', 'bpg', 'eff', 'pp36', 'defensive_prowess',
       'defensive_discipline', 'mpg', 'pos', 'college', 'stats_post', 'W%',
       'postW%'],
      dtype='object')

In [8]:
def pre_process_data(df):
    mapping = {'Y': 1, 'N': 0}
    df['playoff'] = df['playoff'].map(mapping)
    df.fillna(0, inplace=True)
    return df

In [9]:
teams = pre_process_data(teams)

In [10]:
#just get the columns tmID, year, playoff, confID, firstRound, semis, finals
teams_playoffs = teams[['tmID', 'year', 'playoff', 'confID', 'firstRound', 'semis', 'finals']]

#merge teams_playoffs with player_teams, based on tmID and year, and add the playoff columns to player_teams
player_teams = player_teams.merge(teams_playoffs, left_on=['tmID', 'year'], right_on=['tmID', 'year'], how='left')

#add champions column to player_teams
player_teams['champions'] = player_teams.apply(
    lambda row: 'W' if row['finals'] == 'W' else '',
    axis=1
)
# Create a new column for playoff_progression
player_teams['playoff_progression'] = player_teams.apply(
    lambda row: 1 if row['firstRound'] == 'L' else
                2 if row['semis'] == 'L' else
                3 if row['finals'] == 'L' else
                4 if row['champions'] == 'W' else 0,
    axis=1
)

# Drop unnecessary columns
player_teams.drop(['firstRound', 'semis', 'finals', 'champions'], axis=1, inplace=True)

player_teams.head()

player_teams['confID'].unique()

array(['WE', 'EA'], dtype=object)

In [11]:
#add height and weight columns to player_teams from players
player_teams = player_teams.merge(players[['bioID', 'height', 'weight']], left_on='playerID', right_on='bioID', how='left')

#remove bioID column
player_teams.drop('bioID', axis=1, inplace=True)

player_teams.head()

Unnamed: 0,playerID,year,stint,tmID,GP,oRebounds,dRebounds,dq,PostPoints,ft%,...,pos,college,stats_post,W%,postW%,playoff,confID,playoff_progression,height,weight
0,abrossv01w,2,0,MIN,26,1.65,5.04,0.08,0.0,0.73,...,2,Connecticut,0.0,0.38,0.0,0,WE,0,74.0,169
1,abrossv01w,3,0,MIN,27,1.67,3.74,0.0,0.0,0.48,...,2,Connecticut,0.0,0.315,0.0,0,WE,0,74.0,169
2,abrossv01w,4,0,MIN,30,1.47,3.23,0.0,7.67,0.7,...,2,Connecticut,-0.28,0.53,0.33,1,WE,1,74.0,169
3,abrossv01w,5,0,MIN,22,0.77,2.59,0.0,10.0,0.61,...,2,Connecticut,-0.03,0.53,0.0,1,WE,1,74.0,169
4,abrossv01w,6,0,MIN,31,0.94,2.52,0.0,0.0,0.73,...,2,Connecticut,0.0,0.41,0.0,0,WE,0,74.0,169


In [12]:
#from awards_players, add column award_count to player_teams which is the number of awards the team won (sum of all awards)
#player_teams = player_teams.merge(awards_players[['playerID', 'award']], left_on='playerID', right_on='playerID', how='left')

player_teams['award_count'] = 0

#associate each player with each award and year (ex: player A won 2 awards in 2010, thus 2010 has 2, but 2011 is 0 (unless he wins again))
for index, row in awards_players.iterrows():
    player_teams.loc[(player_teams['playerID'] == row['playerID']) & (player_teams['year'] == row['year']), 'award_count'] += 1

player_teams.head()

Unnamed: 0,playerID,year,stint,tmID,GP,oRebounds,dRebounds,dq,PostPoints,ft%,...,college,stats_post,W%,postW%,playoff,confID,playoff_progression,height,weight,award_count
0,abrossv01w,2,0,MIN,26,1.65,5.04,0.08,0.0,0.73,...,Connecticut,0.0,0.38,0.0,0,WE,0,74.0,169,0
1,abrossv01w,3,0,MIN,27,1.67,3.74,0.0,0.0,0.48,...,Connecticut,0.0,0.315,0.0,0,WE,0,74.0,169,0
2,abrossv01w,4,0,MIN,30,1.47,3.23,0.0,7.67,0.7,...,Connecticut,-0.28,0.53,0.33,1,WE,1,74.0,169,0
3,abrossv01w,5,0,MIN,22,0.77,2.59,0.0,10.0,0.61,...,Connecticut,-0.03,0.53,0.0,1,WE,1,74.0,169,0
4,abrossv01w,6,0,MIN,31,0.94,2.52,0.0,0.0,0.73,...,Connecticut,0.0,0.41,0.0,0,WE,0,74.0,169,0


In [13]:
#encode playerID, teamID, college
#swap empty college with 'No College'
player_teams['college'] = player_teams['college'].replace('', 'No College')
player_teams = encode_player_team(player_teams)
player_teams.to_csv('new_data/clean-data.csv', index=False)

In [14]:
#add ['height', 'weight', 'career_year', 'pos', 'college','confID'] to comp.csv

comp = comp.merge(players[['bioID', 'height', 'weight', 'pos', 'college']], left_on='playerID', right_on='bioID', how='left')
comp.drop(['bioID','lgID'], axis=1, inplace=True)


#make pos into a category
comp['pos'] = comp['pos'].replace(
    ['G', 'F', 'C', 'C-F', 'F-C', 'G-F', 'F-G'],
    [1, 2, 3, 4, 4, 5, 5]
)

#get confID from teams
team_conf = teams[['tmID', 'confID']]
#get unique pairs of tmID and confID
team_conf = team_conf.drop_duplicates()
comp = comp.merge(team_conf, left_on='tmID', right_on='tmID', how='left')

#if confID is nan, then print the tmID
for index, row in comp.iterrows():
    if row['confID'] != row['confID']:
        #set confID to WE
        comp.loc[index, 'confID'] = 'WE'

In [15]:
#store comp dataframe into a csv file
#swap empty college with 'No College'
comp['college'] = comp['college'].replace('', 'No College')
comp = encode_player_team(comp)
comp.to_csv('new_data/clean-comp.csv', index=False)

In [16]:
#join comp to player_teams, if a column does not exist, fill it with 0, player_teams is from year 1-10, comp is from year 11
new_player_teams = player_teams.merge(comp, left_on=['playerID','year','stint','tmID','height','weight','pos','college','confID'], right_on=['playerID','year','stint','tmID','height','weight','pos','college','confID'], how='outer')
new_player_teams.fillna(0, inplace=True)
#sort by year
new_player_teams = new_player_teams.sort_values(by=['year','playerID'])

#drop any _y columns or _x columns
new_player_teams = new_player_teams.drop([col for col in new_player_teams.columns if col.endswith('_y')], axis=1)
new_player_teams = new_player_teams.drop([col for col in new_player_teams.columns if col.endswith('_x')], axis=1)

new_player_teams

Unnamed: 0,playerID,year,stint,tmID,GP,oRebounds,dRebounds,dq,PostPoints,ft%,...,college,stats_post,W%,postW%,playoff,confID,playoff_progression,height,weight,award_count
12,4,1,0,559,29.0,0.28,0.55,0.00,7.00,0.32,...,579,0.10,0.435,0.0,1.0,702,1.0,71.0,153,0.0
28,12,1,0,559,30.0,0.40,1.13,0.00,1.00,0.78,...,586,0.00,0.435,0.0,1.0,702,1.0,67.0,125,0.0
31,13,1,0,566,32.0,1.00,1.97,0.03,3.50,0.75,...,587,-0.75,0.530,0.5,1.0,702,2.0,71.0,165,0.0
37,15,1,0,558,32.0,1.12,2.59,0.03,8.33,0.84,...,589,-0.39,0.840,1.0,1.0,701,4.0,71.0,147,0.0
42,16,1,0,567,32.0,3.00,4.22,0.00,0.00,0.68,...,589,0.00,0.410,0.0,0.0,702,0.0,77.0,198,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,741,11,0,559,0.0,0.00,0.00,0.00,0.00,0.00,...,598,0.00,0.000,0.0,0.0,702,0.0,69.0,145,0.0
2005,742,11,0,573,0.0,0.00,0.00,0.00,0.00,0.00,...,598,0.00,0.000,0.0,0.0,701,0.0,77.0,195,0.0
2006,742,11,0,563,0.0,0.00,0.00,0.00,0.00,0.00,...,598,0.00,0.000,0.0,0.0,701,0.0,77.0,195,0.0
2010,743,11,0,574,0.0,0.00,0.00,0.00,0.00,0.00,...,750,0.00,0.000,0.0,0.0,702,0.0,68.0,130,0.0


In [17]:
#join the stats of stint = 1 to stint = 0 into one row (stint is 0 if the player only played for one team in that year, else it is 1 for the 1st team and 2 for the 2nd team (and so on)))
players_with_stint = new_player_teams[new_player_teams['stint'] != 0]


not_sum_cols = ['playerID', 'year', 'tmID', 'college', 'confID', 'height', 'weight', 'pos', 'playoff']
sum_cols = [col for col in players_with_stint.columns if col not in not_sum_cols]

#Group by playerID and year
grouped = players_with_stint.groupby(['playerID', 'year'])

# Aggregate data
aggregated_data = grouped.agg({**{col: 'first' for col in not_sum_cols[2:]}, **{col: 'sum' for col in sum_cols}}).reset_index()

print(aggregated_data['stint'].unique())



[3 6]


In [18]:
new_player_teams = new_player_teams[new_player_teams['stint'] == 0]

new_player_teams = pd.concat([new_player_teams, aggregated_data], ignore_index=True)

new_player_teams = new_player_teams.sort_values(by=['year','playerID'])

#drop stint column
new_player_teams.drop('stint', axis=1, inplace=True)

In [19]:
#add career_year column
new_player_teams['career_year'] = new_player_teams.groupby('playerID').cumcount() + 1

In [20]:
def rolling_averages(group, cols, new_cols, window=3):
    #if a player has stint > 0, then the rolling average will be the same as the previous stint
    group = group.sort_values('year')
    #join in stints > 0 as they are the same player
    rolling_stats = group[cols].rolling(window=window, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [21]:
cols = ['playoff_progression','eff', 'award_count','defensive_prowess','stats_post', 'PostPoints',"W%", "postW%", 'rpg', 'apg']
new_cols = [f'{col}_rolling' for col in cols]

#apply the rooling averages to the columns, if a player has stint > 0, then the rolling average will be the same as the previous stint
new_player_teams = new_player_teams.groupby('playerID').apply(rolling_averages, cols, new_cols, window=3)

In [22]:
#sort by year
new_player_teams = new_player_teams.sort_values(by=['year'])
new_player_teams.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,playerID,year,tmID,GP,oRebounds,dRebounds,dq,PostPoints,ft%,fg%,...,playoff_progression_rolling,eff_rolling,award_count_rolling,defensive_prowess_rolling,stats_post_rolling,PostPoints_rolling,W%_rolling,postW%_rolling,rpg_rolling,apg_rolling
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
434,732,434,4,555,34.0,1.18,2.88,0.06,17.33,0.88,0.46,...,0.0,-354.74,0.0,35.033333,0.0,0.0,0.388333,0.0,3.23,2.516667
230,668,230,4,573,34.0,0.71,1.71,0.0,0.0,0.81,0.34,...,0.0,-159.33,0.0,24.0,0.0,0.0,0.426667,0.0,1.563333,2.216667
240,671,240,4,564,31.0,0.23,0.58,0.0,0.0,0.73,0.25,...,0.333333,-81.926667,0.0,10.633333,-0.216667,1.8,0.58,0.133333,1.066667,0.51
242,672,242,4,566,30.0,1.03,1.7,0.0,0.67,0.56,0.56,...,1.0,-62.55,0.0,19.466667,0.5,0.583333,0.563333,0.276667,2.47,0.473333
504,753,504,4,569,20.0,0.1,1.35,0.0,0.0,0.85,0.28,...,2.333333,-102.55,0.0,45.533333,-0.113333,4.72,0.893333,0.553333,3.133333,3.536667


In [23]:
#store new_player_teams into a csv file
new_player_teams.to_csv('new_data/complete-data.csv', index=False)

In [24]:
#store the codes in format of {code: name} into a csv file
with open('new_data/codes.csv', 'w') as f:
    #first write the header
    f.write('name,code\n')
    for code, name in coder.items():
        #if NaN, then do not write to the file
        f.write(f'{name},{code}\n')

        