# Data cleaning

In [380]:
import pandas as pd
df = pd.read_csv("nba.csv")
df = df.dropna()

In [381]:
# Convert dtypes
df["Name"] = df["Name"].astype("string")
df["Team"] = df["Team"].astype("category")
df["Position"] = df["Position"].astype("category")
df["College"] = df["College"].astype("category")

# Split the height (e.g., "6-2")
l1 = df["Height"].dropna().str.split("-")

cm_list = []

for x in l1:
    feet = int(x[0])
    inch = int(x[1])
    cm = 2.54 * (12 * feet + inch)
    cm_list.append(cm)

df["Height"] = cm_list
df["Height"] = df["Height"].astype(float)

for x in df.columns:
    print(x, " : ", (df[x].dtype))

Name  :  string
Team  :  category
Number  :  float64
Position  :  category
Age  :  float64
Height  :  float64
Weight  :  float64
College  :  category
Salary  :  float64


In [382]:
# Detect missing values

# df.loc[0, "Height"] = None        creating one Nan value#
df[df.isna().any(axis=1)]


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary


In [383]:
# Drop missing rows
df.dropna()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,187.96,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,198.12,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,195.58,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,203.20,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,213.36,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,203.20,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,198.12,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,208.28,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,190.50,203.0,Butler,2433333.0


In [384]:
# - Fill missing using mean, 0, "Unknown".  
df = pd.read_csv("nba.csv")
# df.fillna(0, inplace=True)
df = df.fillna(df.mean(numeric_only=True))
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.000000,PG,25.000000,6-2,180.000000,Texas,7.730337e+06
1,Jae Crowder,Boston Celtics,99.000000,SF,25.000000,6-6,235.000000,Marquette,6.796117e+06
2,John Holland,Boston Celtics,30.000000,SG,27.000000,6-5,205.000000,Boston University,4.842684e+06
3,R.J. Hunter,Boston Celtics,28.000000,SG,22.000000,6-5,185.000000,Georgia State,1.148640e+06
4,Jonas Jerebko,Boston Celtics,8.000000,PF,29.000000,6-10,231.000000,,5.000000e+06
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.000000,PG,26.000000,6-3,203.000000,Butler,2.433333e+06
454,Raul Neto,Utah Jazz,25.000000,PG,24.000000,6-1,179.000000,,9.000000e+05
455,Tibor Pleiss,Utah Jazz,21.000000,C,26.000000,7-3,256.000000,,2.900000e+06
456,Jeff Withey,Utah Jazz,24.000000,C,26.000000,7-0,231.000000,Kansas,9.472760e+05


In [385]:
# - Extract username from email, year from date, first name from full name.
import random

# random day/month
df["DOB"] = pd.to_datetime({
    "year": 2025 - df["Age"].astype(int),
    "month": [random.randint(1, 12) for _ in range(len(df))],
    "day": [random.randint(1, 28) for _ in range(len(df))],  # 1–28 to avoid invalid dates
})

len(df)
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,DOB
0,Avery Bradley,Boston Celtics,0.000000,PG,25.000000,6-2,180.000000,Texas,7.730337e+06,2000-09-20
1,Jae Crowder,Boston Celtics,99.000000,SF,25.000000,6-6,235.000000,Marquette,6.796117e+06,2000-12-27
2,John Holland,Boston Celtics,30.000000,SG,27.000000,6-5,205.000000,Boston University,4.842684e+06,1998-05-12
3,R.J. Hunter,Boston Celtics,28.000000,SG,22.000000,6-5,185.000000,Georgia State,1.148640e+06,2003-07-16
4,Jonas Jerebko,Boston Celtics,8.000000,PF,29.000000,6-10,231.000000,,5.000000e+06,1996-10-27
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.000000,PG,26.000000,6-3,203.000000,Butler,2.433333e+06,1999-03-14
454,Raul Neto,Utah Jazz,25.000000,PG,24.000000,6-1,179.000000,,9.000000e+05,2001-04-08
455,Tibor Pleiss,Utah Jazz,21.000000,C,26.000000,7-3,256.000000,,2.900000e+06,1999-04-22
456,Jeff Withey,Utah Jazz,24.000000,C,26.000000,7-0,231.000000,Kansas,9.472760e+05,1999-09-04


In [393]:
#  Fix mixed-format date column.  
df.loc[458, "DOB"] = "meow"
df["DOB"] = pd.to_datetime(df["DOB"], errors = "coerce", dayfirst = True)
df["Age"] = pd.to_numeric(df["Age"], errors = "coerce")
df

  df.loc[458, "DOB"] = "meow"


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,DOB
0,Avery Bradley,Boston Celtics,0.000000,PG,25.000000,6-2,180.000000,Texas,7.730337e+06,2000-09-20
1,Jae Crowder,Boston Celtics,99.000000,SF,25.000000,6-6,235.000000,Marquette,6.796117e+06,2000-12-27
2,John Holland,Boston Celtics,30.000000,SG,27.000000,6-5,205.000000,Boston University,4.842684e+06,1998-05-12
3,R.J. Hunter,Boston Celtics,28.000000,SG,22.000000,6-5,185.000000,Georgia State,1.148640e+06,2003-07-16
4,Jonas Jerebko,Boston Celtics,8.000000,PF,29.000000,6-10,231.000000,,5.000000e+06,1996-10-27
...,...,...,...,...,...,...,...,...,...,...
454,Raul Neto,Utah Jazz,25.000000,PG,24.000000,6-1,179.000000,,9.000000e+05,2001-04-08
455,Tibor Pleiss,Utah Jazz,21.000000,C,26.000000,7-3,256.000000,,2.900000e+06,1999-04-22
456,Jeff Withey,Utah Jazz,24.000000,C,26.000000,7-0,231.000000,Kansas,9.472760e+05,1999-09-04
457,,,17.678337,,26.938731,,221.522976,,4.842684e+06,1999-08-05
