# Document de data visualisation sur une base de données NBA

Ce document sert à analyser et à visualiser des données sur les différents joueurs NBA, de la saison 1996-1997 à la saison 2022-2023.

In [2]:
# On importe nos librairies Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the dataset from the CSV file located in the current working directory.
# NOTE(review): the df.columns output further down lists an 'Unnamed: 0' column,
# which looks like a saved row index — confirm, and consider index_col=0 if unused.
df = pd.read_csv(filepath_or_buffer="./all_seasons.csv")

On commence avec l'affichage de certaines statistiques générales sur notre jeu de données

In [4]:
# Preview the first five rows of the dataset (head() returns rows, not columns).
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.5,86.18248,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.03,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.2,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.2,102.0582,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.5,0.064,1996-97


In [5]:
# Preview the last five rows of the dataset (tail() returns rows, not columns).
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
12839,12839,Joel Embiid,PHI,29.0,213.36,127.00576,Kansas,Cameroon,2014,1,...,33.1,10.2,4.2,8.8,0.057,0.243,0.37,0.655,0.233,2022-23
12840,12840,John Butler Jr.,POR,20.0,213.36,86.18248,Florida State,USA,Undrafted,Undrafted,...,2.4,0.9,0.6,-16.1,0.012,0.065,0.102,0.411,0.066,2022-23
12841,12841,John Collins,ATL,25.0,205.74,102.511792,Wake Forest,USA,2017,1,...,13.1,6.5,1.2,-0.2,0.035,0.18,0.168,0.593,0.052,2022-23
12842,12842,Jericho Sims,NYK,24.0,208.28,113.398,Texas,USA,2021,2,...,3.4,4.7,0.5,-6.7,0.117,0.175,0.074,0.78,0.044,2022-23
12843,12843,JaMychal Green,GSW,33.0,205.74,102.965384,Alabama,USA,Undrafted,Undrafted,...,6.4,3.6,0.9,-8.2,0.087,0.164,0.169,0.65,0.094,2022-23


In [6]:
# Print the labels of every column in the dataset.
column_index = df.columns
print(column_index)

Index(['Unnamed: 0', 'player_name', 'team_abbreviation', 'age',
       'player_height', 'player_weight', 'college', 'country', 'draft_year',
       'draft_round', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
       'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season'],
      dtype='object')


In [7]:
# Print the distinct values found in each of the selected text columns,
# followed by a separator line containing a single space.
column = [
    'player_name',
    'team_abbreviation',
    'college',
    'country',
    'season',
]

for col in column:
    unique_values = df[col].unique()
    print(col, unique_values)
    print(" ")

player_name ['Randy Livingston' 'Gaylon Nickerson' 'George Lynch' ...
 'Jeff Dowtin Jr.' 'Jeremy Sochan' 'John Butler Jr.']
 
team_abbreviation ['HOU' 'WAS' 'VAN' 'LAL' 'DEN' 'ORL' 'CHH' 'MIL' 'DET' 'POR' 'DAL' 'UTA'
 'SEA' 'BOS' 'IND' 'SAS' 'MIA' 'ATL' 'NJN' 'LAC' 'GSW' 'PHI' 'NYK' 'TOR'
 'PHX' 'MIN' 'CHI' 'SAC' 'CLE' 'MEM' 'NOH' 'CHA' 'NOK' 'OKC' 'BKN' 'NOP']
 
college ['Louisiana State' 'Northwestern Oklahoma' 'North Carolina'
 'Florida State' 'UCLA' 'Tennessee-Chattanooga' nan 'Michigan' 'Purdue'
 'Duke' 'Ohio' 'Eastern Michigan' 'Nevada-Las Vegas' 'Kansas'
 'Texas-El Paso' 'Indiana' 'Louisville' 'Houston' 'Oklahoma'
 'Oral Roberts' 'Oregon State' 'Brigham Young' 'Washington' 'Memphis'
 'Notre Dame' 'Delaware State' 'Alabama' 'Wyoming' 'Pittsburgh'
 'Providence' 'Nebraska' 'Michigan State' 'Mississippi State'
 'New Orleans' 'Penn State' 'Western Carolina' 'Iowa State'
 "St. Mary's (TX)" 'Clemson' 'Ohio State' 'Georgetown' 'Marquette'
 'Virginia Tech' 'Southern Mississippi' 'McNeese

In [8]:
# Print how many distinct values each of the selected columns contains,
# followed by a separator line containing a single space.
column = [
    'player_name',
    'team_abbreviation',
    'college',
    'country',
    'season',
]

for col in column:
    distinct_count = df[col].nunique()
    print(col, distinct_count)
    print(" ")

player_name 2551
 
team_abbreviation 36
 
college 356
 
country 82
 
season 27
 


In [9]:
# Print the five most frequent values of each selected column together with
# their occurrence counts, followed by a separator line containing a single space.
column = [
    'player_name',
    'country',
    'season',
]

for col in column:
    top_values = df[col].value_counts().head(n=5)
    print(top_values)
    print(" ")

player_name
Vince Carter      22
Dirk Nowitzki     21
Jamal Crawford    20
Udonis Haslem     20
Kobe Bryant       20
Name: count, dtype: int64
 
country
USA          10721
Canada         205
France         190
Australia      100
Spain           93
Name: count, dtype: int64
 
season
2021-22    605
2017-18    540
2020-21    540
2022-23    539
2018-19    530
Name: count, dtype: int64
 


Pour chaque colonne, voici l'analyse et la conclusion que nous pouvons en tirer :
- Pour la colonne des joueurs, Vince Carter est celui qui a joué le plus de saisons : son nom est celui qui apparaît le plus souvent dans la liste (22 fois).

- Les États-Unis sont bien entendu le pays très largement majoritaire de la NBA (10 721 lignes), loin devant les autres grandes nations du basket-ball : le Canada (205), la France (190), l'Australie (100) et l'Espagne (93).

- Enfin, les dernières saisons sont celles qui contiennent le plus de joueurs.

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three main
# NBA statistics: points, rebounds and assists.
col = ['pts', 'reb', 'ast']
df.loc[:, col].describe()

Unnamed: 0,pts,reb,ast
count,12844.0,12844.0,12844.0
mean,8.212582,3.558486,1.824681
std,6.016573,2.477885,1.80084
min,0.0,0.0,0.0
25%,3.6,1.8,0.6
50%,6.7,3.0,1.2
75%,11.5,4.7,2.4
max,36.1,16.3,11.7


Ici, nous avons les statistiques descriptives de l'ensemble de nos joueurs : on peut notamment observer le maximum, le minimum et la moyenne de chacune de nos trois statistiques (points, rebonds, passes décisives), calculés sur toutes les lignes du jeu de données et non joueur par joueur.