In [91]:
## Importing the libraries
import json 
import pandas as pd 

In [93]:
### Importing data
# The encoding is passed explicitly so that non-ASCII characters decode the same
# way on every platform instead of depending on the locale default.
with open("NBA Players' Performances.json", encoding="utf-8") as file:
    data=json.load(file)
## Data Cleaning: Removing players with no data
# A player is dropped when the record list is empty or when every record in it is
# an empty dict (whatever the number of records, not only four).
data={player:records for player,records in data.items() if records and any(records)}
players=list(data.keys()) ## Player names, in the same order as the cleaned data
## The data for each player contains credentials regarding college games, NBA games and All-Star games. 
## See the snapshot below for one of the players
data[players[0]]

[{'MJ': '31',
  'MC': '31',
  'Min': '30,4',
  '%T': '49,3',
  '%3PTS': '32,5',
  '%LF': '59,9',
  'REB': '10,77',
  'PAD': '0,97',
  'STL': '1,10',
  'BLK': '1,87',
  'Pts': '15,77'},
 {'MJ': '134',
  'MC': '32',
  'Min': '18,4',
  '%T': '46,8',
  '%3PTS': '35,7',
  '%LF': '55,6',
  'REB': '5,08',
  'PAD': '0,83',
  'STL': '0,43',
  'BLK': '0,51',
  'Pts': '7,22'},
 {'MJ': '9',
  'MC': '1',
  'Min': '19,9',
  '%T': '50,0',
  '%3PTS': '31,3',
  '%LF': '50,0',
  'REB': '3,90',
  'PAD': '0,70',
  'STL': '0,10',
  'BLK': '0,80',
  'Pts': '7,60'}]

In [None]:
"""We create a function that will perform aggregations to append all data for each player into a single dictionary
for that player. For games played and games started (MJ, MC), a sum is used to aggregate the data. For metrics measured
as percentages or per-game averages, on the other hand, we use a weighted average with the number of games played as the
weight. Although this does not give an exact result, it reduces the bias that would result from performing a simple average
"""

In [94]:
## We have for each player, the keys and the values 
def agg(keys,values,i):
    """Aggregate several stat rows of one player into a single row.

    Parameters
    ----------
    keys : list
        Column names. 'MJ' is used as the weight column for every column
        other than 'MJ' and 'MC'.
    values : list of list
        One numeric row per record, each positionally aligned with ``keys``.
    i : object
        Player identifier; only used in the printed diagnostics.

    Returns
    -------
    list
        One value per key. 'MJ' and 'MC' are summed; every other column is the
        'MJ'-weighted average rounded to 2 decimals (0 when the total of 'MJ'
        is 0, since the average is undefined then). A list of zeros, one per
        key, is returned when the rows do not line up with the keys or when
        the aggregation fails; a message is printed in both cases.
    """
    n_keys=len(keys)
    # Every row must have exactly one value per key, otherwise positions are meaningless
    if not values or any(len(row)!=n_keys for row in values):
        print('Data inconcistency for player ',i)
        return [0]*n_keys
    try:
        final_values=[]
        games=None # per-row 'MJ' weights, looked up only when an averaged column needs them
        total_games=0
        for col,key in enumerate(keys):
            column=[row[col] for row in values]
            if key in ['MJ','MC']: # games played / games in starting lineup: simple sum
                final_values.append(sum(column))
                continue
            if games is None:
                games_idx=keys.index("MJ") # ValueError when the weight column is missing
                games=[row[games_idx] for row in values]
                total_games=sum(games)
            if total_games==0:
                # No game played at all: avoid dividing by zero
                final_values.append(0)
            else:
                weighted=sum(g*v for g,v in zip(games,column))
                final_values.append(round(weighted/total_games,2))
        return final_values
    except (ValueError,TypeError):
        # Missing 'MJ' column or non-numeric values: report it and fall back to zeros
        print('error with',i)
        return [0]*n_keys

In [96]:
## We will add data for each player together, such that we obtain a single dictionary per player
all_players_keys=[]
all_players_values=[]
#for i in range(len(data)):
for ip in range(len(data)):
    keys=[]
    values=[]
    for dict in data[players[ip]]:
        keys.append(list(dict.keys()))
        values.append(list(dict.values()))
        keys=list(dict.fromkeys(dict))
        ## Turning the data from str to float for calculations
        for valuelist in values:
            for i in range(len(valuelist)):
                try:
                    valuelist[i]=valuelist[i].replace(',','.')
                except:
                    pass
                try:
                    valuelist[i]=float(valuelist[i])
                except:
                    valuelist[i]=0
        ### This chunk will be replaced by the function
        try:
            final_values=agg(keys,values,ip)
        except:
            print('error with',ip)
            final_values=[]
        #### End function chunk
    all_players_keys.append(keys)
    all_players_values.append(final_values)

In [98]:
# We check whether all players had the same keys in the dataset
baseline=all_players_keys[0]
check=[True for i in range(len(all_players_keys)) if all_players_keys[i]==baseline]
check=list(dict.fromkeys(check))
check

[True]

In [99]:
# Work on a copy so that later changes to `keys` leave all_players_keys[0] untouched
keys=list(baseline)
#We check that all data is of the same size 
lendata=[len(elem) for elem in all_players_values]
lendata=list(dict.fromkeys(lendata)) # distinct row lengths, in order of first appearance
lendata

[11]

In [100]:
## Adding the player key to the keys list and the player name to the player's values list
keys.insert(0,'Player')
for i in range(len(all_players_values)):
    all_players_values[i].insert(0,players[i])
### Creating a dataframe
d={keys[i]:[] for i in range(len(keys))}

In [101]:
### Appending data to the dataframe
for player_value in all_players_values:
    for i in range(len(player_value)):
        d[keys[i]].append(player_value[i])
df=pd.DataFrame(d)

In [102]:
# Display the aggregated table, one row per player
df

Unnamed: 0,Player,MJ,MC,Min,%T,%3PTS,%LF,REB,PAD,STL,BLK,Pts
0,Precious Achiuwa,174.0,64.0,20.62,47.41,34.90,56.08,6.03,0.85,0.53,0.77,8.76
1,Steven Adams,804.0,719.0,26.61,58.86,5.88,53.22,8.06,1.42,0.86,1.03,9.00
2,Bam Adebayo,115.0,108.0,32.63,55.48,10.65,70.48,8.53,2.49,0.81,0.88,14.91
3,Ochai Agbaji,190.0,147.0,28.81,43.84,36.47,75.21,3.64,1.46,0.76,0.45,11.74
4,Santi Aldama,142.0,46.0,21.48,46.50,31.90,69.89,5.23,1.31,0.60,0.73,10.01
...,...,...,...,...,...,...,...,...,...,...,...,...
456,Christian Wood,352.0,173.0,24.04,51.07,35.79,70.27,7.21,1.34,0.54,1.09,13.99
457,Delon Wright,498.0,84.0,20.79,45.67,35.61,80.02,3.09,2.99,1.17,0.39,7.03
458,Trae Young,62.0,61.0,35.53,41.12,33.04,81.54,3.63,8.86,1.43,0.17,26.08
459,Cody Zeller,613.0,349.0,22.27,53.09,18.10,72.76,5.91,1.33,0.64,0.66,9.05


In [103]:
# Exporting data to excel (index=False leaves the row index out of the sheet)
df.to_excel('NBA Data.xlsx',index=False)