# Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the two raw inputs: season stats per player, and each player's position.
raw_stats, raw_pos = (
    pd.read_csv(csv_name)
    for csv_name in ("raw_player_stats_2022_23.csv", "raw_player_pos_2022_23.csv")
)

In [3]:
# Preview the first five rows of the stats table.
raw_stats.head(n=5)

Unnamed: 0,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,Jayson Tatum,BOS,25,74,52,22,2732.2,2225,727,1559,...,649,342,213,78,51,160,3691,31,1,470
1,Joel Embiid,PHI,29,66,43,23,2284.1,2183,728,1328,...,670,274,226,66,112,205,3706,39,1,424
2,Luka Doncic,DAL,24,66,33,33,2390.5,2138,719,1449,...,569,529,236,90,33,166,3747,36,10,128
3,Shai Gilgeous-Alexander,OKC,24,68,33,35,2416.0,2135,704,1381,...,329,371,192,112,65,192,3425,3,0,149
4,Giannis Antetokounmpo,MIL,28,63,47,16,2023.6,1959,707,1278,...,742,359,246,52,51,197,3451,46,6,341


In [4]:
# Preview the first five rows of the position table.
raw_pos.head(n=5)

Unnamed: 0,Player,POS
0,Joel Embiid,C
1,Luka Doncic,PG
2,Damian Lillard,PG
3,Shai Gilgeous-Alexander,PG
4,Giannis Antetokounmpo,PF


## Full outer join of the two tables

In [5]:
# Outer join on 'Player' (the only column the two tables share) so that a
# player present in just one of the files is kept, with NaN for the missing side.
raw_dataset = raw_stats.merge(raw_pos, on='Player', how='outer')
raw_dataset.head()

Unnamed: 0,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,POS
0,Jayson Tatum,BOS,25.0,74.0,52.0,22.0,2732.2,2225.0,727.0,1559.0,...,342.0,213.0,78.0,51.0,160.0,3691.0,31.0,1.0,470.0,SF
1,Joel Embiid,PHI,29.0,66.0,43.0,23.0,2284.1,2183.0,728.0,1328.0,...,274.0,226.0,66.0,112.0,205.0,3706.0,39.0,1.0,424.0,C
2,Luka Doncic,DAL,24.0,66.0,33.0,33.0,2390.5,2138.0,719.0,1449.0,...,529.0,236.0,90.0,33.0,166.0,3747.0,36.0,10.0,128.0,PG
3,Shai Gilgeous-Alexander,OKC,24.0,68.0,33.0,35.0,2416.0,2135.0,704.0,1381.0,...,371.0,192.0,112.0,65.0,192.0,3425.0,3.0,0.0,149.0,PG
4,Giannis Antetokounmpo,MIL,28.0,63.0,47.0,16.0,2023.6,1959.0,707.0,1278.0,...,359.0,246.0,52.0,51.0,197.0,3451.0,46.0,6.0,341.0,PF


## Data Types

In [6]:
# List each column's dtype. The counting stats show up as float64 rather than
# int — presumably because the outer merge produced a row of NaN values and
# pandas upcasts integer columns that gain NaN (verify against the raw frame).
print(raw_dataset.dtypes)

Player     object
Team       object
Age       float64
GP        float64
W         float64
L         float64
Min       float64
PTS       float64
FGM       float64
FGA       float64
FG%       float64
3PM       float64
3PA       float64
3P%       float64
FTM       float64
FTA       float64
FT%       float64
OREB      float64
DREB      float64
REB       float64
AST       float64
TOV       float64
STL       float64
BLK       float64
PF        float64
FP        float64
DD2       float64
TD3       float64
+/-       float64
POS        object
dtype: object


## Check for records with null values

In [7]:
# Every column has exactly 1 null, except Player (0 nulls) and POS (6 nulls).
print(raw_dataset.isnull().sum())

Player    0
Team      1
Age       1
GP        1
W         1
L         1
Min       1
PTS       1
FGM       1
FGA       1
FG%       1
3PM       1
3PA       1
3P%       1
FTM       1
FTA       1
FT%       1
OREB      1
DREB      1
REB       1
AST       1
TOV       1
STL       1
BLK       1
PF        1
FP        1
DD2       1
TD3       1
+/-       1
POS       6
dtype: int64


In [8]:
# Show the record(s) that have no team, i.e. no stats came through the merge.
raw_dataset.loc[raw_dataset['Team'].isna()]

Unnamed: 0,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,POS
539,Nikola Jović,,,,,,,,,,...,,,,,,,,,,F


In [9]:
# Show the record(s) that have no position after the merge.
raw_dataset.loc[raw_dataset['POS'].isna()]

Unnamed: 0,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,POS
426,Nikola Jovic,MIA,19.0,15.0,7.0,8.0,204.5,82.0,28.0,69.0,...,10.0,10.0,7.0,2.0,19.0,151.0,0.0,0.0,-16.0,
534,Alondes Williams,BKN,23.0,1.0,1.0,0.0,5.3,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,-1.0,0.0,0.0,-5.0,
535,Deonte Burton,SAC,29.0,2.0,1.0,1.0,6.5,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,
536,Frank Jackson,UTA,24.0,1.0,0.0,1.0,5.0,0.0,0.0,3.0,...,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-2.0,
537,Michael Foster Jr.,PHI,20.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,
538,Sterling Brown,LAL,28.0,4.0,2.0,2.0,24.4,0.0,0.0,4.0,...,2.0,0.0,3.0,0.0,4.0,22.0,0.0,0.0,-4.0,


In [10]:
# Rewrite the accented spelling of the player's name to its plain-ASCII form.
# The row is located by value (ASCII-folded name) rather than by the hard-coded
# position [539, 0], so the fix still lands on the right record if the row order
# or column order of the merged frame ever changes.
player_ascii = (
    raw_dataset['Player']
    .str.normalize('NFKD')                  # split accented letters into base + mark
    .str.encode('ascii', errors='ignore')   # drop the combining marks
    .str.decode('ascii')
)
raw_dataset.loc[player_ascii == 'Nikola Jovic', 'Player'] = 'Nikola Jovic'

## Find the Duplicated Record(s)

In [11]:
# Confirm the rename: row 539, first column (Player).
raw_dataset.iloc[[539],[0]]

Unnamed: 0,Player
539,Nikola Jovic


In [12]:
# Inspect row 426: the same player's record that carries the stats but has no POS.
raw_dataset.iloc[426]

Player    Nikola Jovic
Team               MIA
Age               19.0
GP                15.0
W                  7.0
L                  8.0
Min              204.5
PTS               82.0
FGM               28.0
FGA               69.0
FG%               40.6
3PM                8.0
3PA               35.0
3P%               22.9
FTM               18.0
FTA               19.0
FT%               94.7
OREB               9.0
DREB              22.0
REB               31.0
AST               10.0
TOV               10.0
STL                7.0
BLK                2.0
PF                19.0
FP               151.0
DD2                0.0
TD3                0.0
+/-              -16.0
POS                NaN
Name: 426, dtype: object

## Move the 'POS' column to the second position in the dataset

In [13]:
# Detach the 'POS' column from the frame; it is re-inserted in the next cell.
second_col = raw_dataset.pop(item='POS')

In [14]:
# Put 'POS' back as the second column (position 1, right after 'Player').
raw_dataset.insert(loc=1, column='POS', value=second_col)

In [15]:
# Look at the last seven rows to check the new column order.
raw_dataset.tail(7)

Unnamed: 0,Player,POS,Team,Age,GP,W,L,Min,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
533,Stanley Umude,G,DET,24.0,1.0,0.0,1.0,2.1,2.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,3.0
534,Alondes Williams,,BKN,23.0,1.0,1.0,0.0,5.3,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,1.0,-1.0,0.0,0.0,-5.0
535,Deonte Burton,,SAC,29.0,2.0,1.0,1.0,6.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
536,Frank Jackson,,UTA,24.0,1.0,0.0,1.0,5.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-2.0
537,Michael Foster Jr.,,PHI,20.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
538,Sterling Brown,,LAL,28.0,4.0,2.0,2.0,24.4,0.0,0.0,...,8.0,2.0,0.0,3.0,0.0,4.0,22.0,0.0,0.0,-4.0
539,Nikola Jovic,F,,,,,,,,,...,,,,,,,,,,


In [16]:
# Copy the position ('F', taken from the stat-less duplicate record) onto the
# player's stats row. The row is selected by name and missing POS instead of
# the hard-coded label 426, so the fix does not depend on row order.
needs_position = (raw_dataset['Player'] == 'Nikola Jovic') & raw_dataset['POS'].isna()
raw_dataset.loc[needs_position, 'POS'] = 'F'

## Drop the duplicated record(s).

In [17]:
# Drop the duplicate record that carries no stats: a player name that occurs
# more than once AND has no Team. Selecting by condition replaces the
# hard-coded label 539; the surviving rows keep their original index labels.
is_statless_duplicate = (
    raw_dataset['Player'].duplicated(keep=False) & raw_dataset['Team'].isna()
)
dataset = raw_dataset.loc[~is_statless_duplicate]

In [18]:
# Last seven rows of the de-duplicated frame.
dataset.tail(7)

Unnamed: 0,Player,POS,Team,Age,GP,W,L,Min,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
532,Chris Silva,F,DAL,26.0,1.0,1.0,0.0,3.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
533,Stanley Umude,G,DET,24.0,1.0,0.0,1.0,2.1,2.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,3.0
534,Alondes Williams,,BKN,23.0,1.0,1.0,0.0,5.3,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,1.0,-1.0,0.0,0.0,-5.0
535,Deonte Burton,,SAC,29.0,2.0,1.0,1.0,6.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
536,Frank Jackson,,UTA,24.0,1.0,0.0,1.0,5.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-2.0
537,Michael Foster Jr.,,PHI,20.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
538,Sterling Brown,,LAL,28.0,4.0,2.0,2.0,24.4,0.0,0.0,...,8.0,2.0,0.0,3.0,0.0,4.0,22.0,0.0,0.0,-4.0


## Replace NaN positions with the string 'N/A'

In [19]:
# Last seven rows of the frame before the missing positions are filled in.
raw_dataset.tail(7)

Unnamed: 0,Player,POS,Team,Age,GP,W,L,Min,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
533,Stanley Umude,G,DET,24.0,1.0,0.0,1.0,2.1,2.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,3.0
534,Alondes Williams,,BKN,23.0,1.0,1.0,0.0,5.3,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,1.0,-1.0,0.0,0.0,-5.0
535,Deonte Burton,,SAC,29.0,2.0,1.0,1.0,6.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
536,Frank Jackson,,UTA,24.0,1.0,0.0,1.0,5.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-2.0
537,Michael Foster Jr.,,PHI,20.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
538,Sterling Brown,,LAL,28.0,4.0,2.0,2.0,24.4,0.0,0.0,...,8.0,2.0,0.0,3.0,0.0,4.0,22.0,0.0,0.0,-4.0
539,Nikola Jovic,F,,,,,,,,,...,,,,,,,,,,


In [20]:
# Replace every remaining missing position with the placeholder string 'N/A'.
# A column-wide fillna replaces the loop over the hard-coded labels 534-538, so
# the cell keeps working if the set of players without a position changes.
# NOTE: pandas.read_csv treats the text 'N/A' as missing by default, so anyone
# loading the exported CSV needs keep_default_na=False to get the string back.
dataset = dataset.fillna({'POS': 'N/A'})
# As in the original cell, raw_dataset ends up bound to the same cleaned frame.
raw_dataset = dataset

In [21]:
# Last seven rows of the cleaned frame.
dataset.tail(7)

Unnamed: 0,Player,POS,Team,Age,GP,W,L,Min,PTS,FGM,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
532,Chris Silva,F,DAL,26.0,1.0,1.0,0.0,3.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
533,Stanley Umude,G,DET,24.0,1.0,0.0,1.0,2.1,2.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,3.0
534,Alondes Williams,,BKN,23.0,1.0,1.0,0.0,5.3,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,1.0,-1.0,0.0,0.0,-5.0
535,Deonte Burton,,SAC,29.0,2.0,1.0,1.0,6.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
536,Frank Jackson,,UTA,24.0,1.0,0.0,1.0,5.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-2.0
537,Michael Foster Jr.,,PHI,20.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
538,Sterling Brown,,LAL,28.0,4.0,2.0,2.0,24.4,0.0,0.0,...,8.0,2.0,0.0,3.0,0.0,4.0,22.0,0.0,0.0,-4.0


## Export the cleaned dataset.

In [22]:
# Write the cleaned table to disk without the index column.
dataset.to_csv("2023_nba_player_stats.csv", index=False)