In [1]:
# Imports

import pandas as pd
import numpy as np
import os
import pprint

%matplotlib inline
import matplotlib as plt
import seaborn as sns

In [2]:
# function to see how many nulls are in the columns

def null_counts(df):
    """
    function to count the number of null values in each df column
    """
    missing = df.isna().sum()
    return missing

In [3]:
# function to determine percentage of NaN values in each column

def null_percentages(df):
    """
    returns the percentage of NaN values in each column to help determine
    whether or not the column in question should be kept or dropped
    """
    
    perct = round((df.isna().sum()) / len(df) * 100), 2
    return perct

In [4]:
# function to combine both 'null_counts' and 'null_percentages'

def null_feedback(df):
    """
    function that tells the user the info on the DataFrame's NaNs
    """
    x = null_counts(df)
    y = null_percentages(df)
    print(f"Total number of NaNs by column: \n{x}.")
    print(f"Percentage of columns that are NaN: \n{y}.")
    return 

In [44]:
# bring in the 'player_data' csv

player_data_df = pd.read_csv("/Users/DataScience/Rimshotz/alley-oop-nba-stats/player_data.csv")

player_data_shape = player_data_df.shape

a = player_data_df.isnull().sum().sum()

print("Player Data Information")
print(f"Consisting of {player_data_shape[0]} rows and {player_data_shape[1]} columns")
print(f"It has loads of data, but also has {a} missing values.") 
player_data_df

Player Data Information
Consisting of 4550 rows and 8 columns
It has loads of data, but also has 341 missing values.


Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University
...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",


**Let's find out where those missing values are located**

In [6]:
player_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 8 columns):
name          4550 non-null object
year_start    4550 non-null int64
year_end      4550 non-null int64
position      4549 non-null object
height        4549 non-null object
weight        4544 non-null float64
birth_date    4519 non-null object
college       4248 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 284.5+ KB


**Initial observations from data import:**

- Strings where they're expected ("name" and "college" columns), but also where NOT expected ("height" and "birthdate")

- "height" column is written so that someone who is 6-feet, 9-inches tall is documented as '6-9.'  We will have to not only change the data in that column to int/float, but also convert all those heights to inches.  Choosing inches because the representation as it is on import is NOT in metric (m/cm)

- There are some NaNs in the dataset that must be dealt with.  Recall: 'impute' is to assign a value; 'encode' is to give it a 0 or 1 

In [7]:
player_data_df.shape

(4550, 8)

In [8]:
null_feedback(player_data_df)

Total number of NaNs by column: 
name            0
year_start      0
year_end        0
position        1
height          1
weight          6
birth_date     31
college       302
dtype: int64.
Percentage of columns that are NaN: 
(name          0.0
year_start    0.0
year_end      0.0
position      0.0
height        0.0
weight        0.0
birth_date    1.0
college       7.0
dtype: float64, 2).


In [9]:
a = null_counts(player_data_df)
b = null_percentages(player_data_df)

print(f"The Player Data DataFrame has a total of {a} nulls, for a total null pct of {b}.")

The Player Data DataFrame has a total of name            0
year_start      0
year_end        0
position        1
height          1
weight          6
birth_date     31
college       302
dtype: int64 nulls, for a total null pct of (name          0.0
year_start    0.0
year_end      0.0
position      0.0
height        0.0
weight        0.0
birth_date    1.0
college       7.0
dtype: float64, 2).


In [10]:
# from datetime import datetime

# def birth_date(df):
#     """
#     function to loop through all the strings in 'birth_date' and convert
#     them into more data-analysis-friendly datetime formats
#     """
#     for birth_date in player_data_df.birth_date:
#         datetime_object = datetime.strptime('Jan 1 2005', '%b %d %Y')
#     df = birth_date_df
#     return birth_date_df


# player_data_df["birth_date"] = pd.date_range(pd.Timestamp(player_data_df.birth_date),
#                                             periods=1)

# player_data.strftime("%B %d, %Y")

In [11]:
# replace 'height' column with inches

# player_data_df['height'] = player_data_df.height.str.split("-").apply(lambda x: int(x[0]) * 12 + int(x[1]))

# def true_height(strings):
#     for string in strings:
#         new_str = string.split('-')
#         inches1 = new_str[0]
#         inches2 = new_str[1]

#         inches1 = int(inches1)*12
#         inches2 = int(inches2)

#     return inches1 + inches2

# player_data_df['height'] = player_data_df.height.apply(true_height)

# remove the dash from heights
# player_data_df["height"] = player_data_df["height"].str.replace("-", " ")

# player_data_df["height"]=player_data_df["height"].replace(["-"], "")
player_data_df

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University
...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",


In [12]:
# find the NaN value in the df
player_data_df[player_data_df["height"].isnull()].index.tolist()

[2142]

In [13]:
# show me row 2142

print(player_data_df.loc[[2142]])

             name  year_start  year_end position height  weight    birth_date  \
2142  George Karl        1974      1978      NaN    NaN     NaN  May 12, 1952   

                           college  
2142  University of North Carolina  


**Okay, so we know people lie about their weight, but having a 'NaN' value in the weight column is *killing* my statistical analysis, so substituting George Karl's NaN weight with what I find his playing weight to be while with the San Antonio Spurs from 1974-1978**

- [Team Roster Source](https://en.wikipedia.org/wiki/George_Karl)

- [San Antonio Spurs Roster, 1974](https://www.statscrew.com/basketball/roster/t-SAA/y-1974)

In [14]:
# replace the NaN value under 'weight' with Karl's playing weight, according to source
player_data_df["weight"] = player_data_df["weight"].fillna(185)

In [15]:
# check to make sure that row's weight data matches
print(player_data_df.loc[[2142]])

             name  year_start  year_end position height  weight    birth_date  \
2142  George Karl        1974      1978      NaN    NaN   185.0  May 12, 1952   

                           college  
2142  University of North Carolina  


**Checks out.  But since we're here, let's go ahead and adjust his position and height NaN values as well**

In [16]:
# according to the Spurs roster (cited above), George Karl was 6'2" 
player_data_df["height"] = player_data_df["height"].fillna("6-2")

In [17]:
# using the same Spurs roster, Karl's position is listed as guard (G)
player_data_df["position"] = player_data_df["position"].fillna("G")

**Checking to see that George Karl's row is now complete:**

In [18]:
print(player_data_df.loc[[2142]])

             name  year_start  year_end position height  weight    birth_date  \
2142  George Karl        1974      1978        G    6-2   185.0  May 12, 1952   

                           college  
2142  University of North Carolina  


In [19]:
null_counts(player_data_df)

name            0
year_start      0
year_end        0
position        0
height          0
weight          0
birth_date     31
college       302
dtype: int64

**^^Checks out.  Now 'height' has to be converted from a string object to an integer**

- Process:

>From the original 'height' column, isolate both feet and inches
        - recall that since the whole column is a string, '6-10' is has an index of 3, so if we're going to isolate the inches, we have to use ':4' as the index range.

>convert the feet and inches measurements into a new column called 'total_inches' 
>Convert 'foot-inch' measurements to inches

In [20]:
# isolate 'feet' measurements into new column

player_data_df["feet"] = player_data_df.height.str[:1] 

In [21]:
# isolate inches into new column

player_data_df["inches"] = player_data_df.height.str[2:4]

In [22]:
# check to see if the columns have been added

player_data_df

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,feet,inches
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,6,10
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,6,9
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",7,2
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,6,1
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,6,6
...,...,...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",,6,11
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,7,1
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University,6,1
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",,7,1


In [23]:
# check to see the datatypes of the two new columns
player_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 10 columns):
name          4550 non-null object
year_start    4550 non-null int64
year_end      4550 non-null int64
position      4550 non-null object
height        4550 non-null object
weight        4550 non-null float64
birth_date    4519 non-null object
college       4248 non-null object
feet          4550 non-null object
inches        4550 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 355.6+ KB


In [24]:
# convert the 'feet' column to integer for later math

player_data_df["feet"] = player_data_df["feet"].astype(int)

In [25]:
# convert the 'inches' column to integer for later math

player_data_df["inches"] = player_data_df["inches"].astype(int)

In [26]:
# convert feet to inches

player_data_df["feet_in_inches"] = (player_data_df["feet"] * 12).to_frame("feet")

player_data_df

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,feet,inches,feet_in_inches
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,6,10,72
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,6,9,72
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",7,2,84
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,6,1,72
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,6,6,72
...,...,...,...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",,6,11,72
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,7,1,84
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University,6,1,72
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",,7,1,84


In [27]:
# new column adding 'feet_in_inches' and 'inches' columns

player_data_df["total_inches"] = (player_data_df["feet_in_inches"] + player_data_df["inches"])

player_data_df

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,feet,inches,feet_in_inches,total_inches
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,6,10,72,82
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,6,9,72,81
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",7,2,84,86
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,6,1,72,73
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,6,6,72,78
...,...,...,...,...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",,6,11,72,83
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,7,1,84,85
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University,6,1,72,73
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",,7,1,84,85


**Now that we have the height of each player in total inches, we can replace the values in 'height' with the values in 'total inches'**

In [28]:
player_data_df = player_data_df.assign(height=player_data_df["total_inches"])

player_data_df

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,feet,inches,feet_in_inches,total_inches
0,Alaa Abdelnaby,1991,1995,F-C,82,240.0,"June 24, 1968",Duke University,6,10,72,82
1,Zaid Abdul-Aziz,1969,1978,C-F,81,235.0,"April 7, 1946",Iowa State University,6,9,72,81
2,Kareem Abdul-Jabbar,1970,1989,C,86,225.0,"April 16, 1947","University of California, Los Angeles",7,2,84,86
3,Mahmoud Abdul-Rauf,1991,2001,G,73,162.0,"March 9, 1969",Louisiana State University,6,1,72,73
4,Tariq Abdul-Wahad,1998,2003,F,78,223.0,"November 3, 1974",San Jose State University,6,6,72,78
...,...,...,...,...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,83,250.0,"January 4, 1997",,6,11,72,83
4546,Jim Zoet,1983,1983,C,85,240.0,"December 20, 1953",Kent State University,7,1,84,85
4547,Bill Zopf,1971,1971,G,73,170.0,"June 7, 1948",Duquesne University,6,1,72,73
4548,Ivica Zubac,2017,2018,C,85,265.0,"March 18, 1997",,7,1,84,85


**To clean things up, we can now drop the columns 'feet', 'inches', 'feet_in_inches', and 'total_inches'.  After that, rename 'height' column to 'height_inches'** 

In [29]:
# drop the columns

player_data_df = player_data_df.drop(["feet", "inches", "feet_in_inches", 
                                      "total_inches"], axis=1)

In [30]:
# rename 'height' column

player_data_df = player_data_df.rename(columns={"height" : "height_inches"})

In [31]:
player_data_df

Unnamed: 0,name,year_start,year_end,position,height_inches,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,82,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,81,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,86,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,73,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,78,223.0,"November 3, 1974",San Jose State University
...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,83,250.0,"January 4, 1997",
4546,Jim Zoet,1983,1983,C,85,240.0,"December 20, 1953",Kent State University
4547,Bill Zopf,1971,1971,G,73,170.0,"June 7, 1948",Duquesne University
4548,Ivica Zubac,2017,2018,C,85,265.0,"March 18, 1997",


In [32]:
player_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 8 columns):
name             4550 non-null object
year_start       4550 non-null int64
year_end         4550 non-null int64
position         4550 non-null object
height_inches    4550 non-null int64
weight           4550 non-null float64
birth_date       4519 non-null object
college          4248 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 284.5+ KB


**Success!  The original heights have not only been converted from stuff like '6-10' to inches, but also the datatype of that column is now int64, and we can start the analysis.**

In [33]:
# bring in the 'player' csv

players_df = pd.read_csv("/Users/DataScience/Rimshotz/alley-oop-nba-stats/Players.csv")

b = players_df.isnull().sum().sum()

players_shape = players_df.shape
print("Player Data Information")
print(f"Consisting of {players_shape[0]} rows and {players_shape[1]} columns")
print(f"Not all data is present.  We still have {b} missing values.")
players_df

Player Data Information
Consisting of 3922 rows and 8 columns
Not all data is present.  We still have 1306 missing values.


Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky
...,...,...,...,...,...,...,...,...
3917,3917,Troy Williams,198.0,97.0,South Carolina State University,1969.0,Columbia,South Carolina
3918,3918,Kyle Wiltjer,208.0,108.0,Gonzaga University,1992.0,Portland,Oregon
3919,3919,Stephen Zimmerman,213.0,108.0,"University of Nevada, Las Vegas",1996.0,Hendersonville,Tennessee
3920,3920,Paul Zipser,203.0,97.0,,1994.0,Heidelberg,Germany


In [34]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3922 entries, 0 to 3921
Data columns (total 8 columns):
Unnamed: 0     3922 non-null int64
Player         3921 non-null object
height         3921 non-null float64
weight         3921 non-null float64
collage        3573 non-null object
born           3921 non-null float64
birth_city     3452 non-null object
birth_state    3439 non-null object
dtypes: float64(3), int64(1), object(4)
memory usage: 245.2+ KB


**Right off the bat, there's a misspelling: 'collage' should be 'college'...**

In [35]:
# renaming "collage" column to "college" - nobody plays basketball in a collage...

players_df = players_df.rename(columns={"collage" : "college"})

players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3922 entries, 0 to 3921
Data columns (total 8 columns):
Unnamed: 0     3922 non-null int64
Player         3921 non-null object
height         3921 non-null float64
weight         3921 non-null float64
college        3573 non-null object
born           3921 non-null float64
birth_city     3452 non-null object
birth_state    3439 non-null object
dtypes: float64(3), int64(1), object(4)
memory usage: 245.2+ KB


**String datatypes are where they're expected; floats/ints as well**

In [36]:
# bring in the 'seasons_stats' csv

seasons_stats_df = pd.read_csv("/Users/DataScience/Rimshotz/alley-oop-nba-stats/Seasons_Stats.csv")

c = seasons_stats_df.isnull().sum().sum()

seasons_stats_shape = seasons_stats_df.shape
print("Statistical Information by Season")
print(f"Consisting of {seasons_stats_shape[0]} rows and {seasons_stats_shape[1]} columns")
print(f"While informative, still needs work: we are missing {c} values.")
seasons_stats_df

Statistical Information by Season
Consisting of 24691 rows and 53 columns
While informative, still needs work: we are missing 154919 values.


Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24686,24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,...,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,...,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,...,0.600,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,...,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0


**Recall: the three DataFrames at which we're looking are:**

- player_data_df;

- players_df; and 

- seasons_stats_df

In [37]:
null_percentages(player_data_df)

(name             0.0
 year_start       0.0
 year_end         0.0
 position         0.0
 height_inches    0.0
 weight           0.0
 birth_date       1.0
 college          7.0
 dtype: float64, 2)

- "College" category has 7% nulls, and of all the data in "birth_date", only 1% is null.

- That 7% comes out to , I will investigate these missing values - don't seem to be hard to find.

In [38]:
null_percentages(players_df)

(Unnamed: 0      0.0
 Player          0.0
 height          0.0
 weight          0.0
 college         9.0
 born            0.0
 birth_city     12.0
 birth_state    12.0
 dtype: float64, 2)

- Like 'player_data_df', there are is some information missing.  I can hopefully find that missing data, and will definitely need to correct the spelling for 'collage' column

In [39]:
null_percentages(seasons_stats_df)

(Unnamed: 0      0.0
 Year            0.0
 Player          0.0
 Pos             0.0
 Age             0.0
 Tm              0.0
 G               0.0
 GS             26.0
 MP              2.0
 PER             2.0
 TS%             1.0
 3PAr           24.0
 FTr             1.0
 ORB%           16.0
 DRB%           16.0
 TRB%           13.0
 AST%            9.0
 STL%           16.0
 BLK%           16.0
 TOV%           21.0
 USG%           20.0
 blanl         100.0
 OWS             0.0
 DWS             0.0
 WS              0.0
 WS/48           2.0
 blank2        100.0
 OBPM           16.0
 DBPM           16.0
 BPM            16.0
 VORP           16.0
 FG              0.0
 FGA             0.0
 FG%             1.0
 3P             23.0
 3PA            23.0
 3P%            38.0
 2P              0.0
 2PA             0.0
 2P%             1.0
 eFG%            1.0
 FT              0.0
 FTA             0.0
 FT%             4.0
 ORB            16.0
 DRB            16.0
 TRB             2.0
 AST         

- All sorts of missing data here.  This is more statistical - not biographical - data, and other methods will have to be employed to determine what gets dropped, what gets imputed, and what gets encoded.

- Also: this will need a data dictionary, as all columns are abbreviations.  Since we're doing that (finding out what the abbreviations are), it may be a good idea to rename those columns with what we find.m


In [40]:
# def rename_abbreviated_columns(df):
#     """
#     Function to rename all the abbreviated column names into plain English.
#     All renamed columns are based on the dataset's accompanying glossary.
#     """
    
# rename columns inplace
seasons_stats_df.rename(
        columns={
            "Year": "year",
            "Player": "player",
            "Pos": "position",
            "Age": "age",
            "Tm": "team",
            "G": "games",
            "GS": "games_started",
            "MP": "minutes_played",
            "PER": "player_efficiency",
            "TS%": "true_shooting_%",
            "3PAr": "three_pt_tries",
            "FTr": "free_throws",
            "ORB%": "off_rebound_%",
            "DRB%": "def_rebound_%",
            "TRB%": "total_rebound_%",
            "AST%": "assist_%",
            "STL%": "steal_%",
            "BLK%": "block_%",
            "TOV%" : "turnover_%",
            "USG%": "usage_%",
            "blanl": "blank1",
            "OWS": "offensive_win_shares",
            "DWS": "defensive_win_shates",
            "WS": "win_shares",
            "WS/48": "win_shares_per_48min",
            "OBPM": "off_box_plus_minus",
            "DBPM": "def_box_plus_minus",
            "BPM": "box_plus_minus",
            "VORP": "value_over_replacement",
            "FG" : "field_goals", 
            "FGA" : "field_goal_attempts",
            "FG%" : "field_goal_%", 
            "3P" : "3_pointers", 
            "3PA" : "3_point_tries",
            "3P%" : "3_point_%",
            "2P" : "2_pointers",
            "2PA" : "2_point_tries",
            "2P%" : "2_point_%",
            "eFG%" : "effective_field_goal_%",
            "FT" : "free_throws",
            "FTA" : "free_throw_attempts",
            "FT%" : "free_throw_%",
            "ORB" : "off_rebounds",
            "DRB" : "def_rebounds",
            "TRB" : "total_rebounds",
            "AST" : "assists",
            "STL" : "steals",
            "BLK" : "blocks",
            "TOV" : "turnovers",
            "PF" : "personal_fouls",
            "PTS" : "points",
        },
        inplace=True,
        )

In [41]:
seasons_stats_df

Unnamed: 0.1,Unnamed: 0,year,player,position,age,team,games,games_started,minutes_played,player_efficiency,...,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24686,24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,...,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,...,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,...,0.600,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,...,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0


**Based on what we observed earlier, the following columns can be dropped because they lack necessary data for exploration:**

| Column To Be Dropped | Reason            |
|:--------------------:|:-----------------:|
| "blank1"             | all values are NaN| 
| "blank2"             | all values are NaN| 

In [42]:
# drop 'blank1' and 'blank2' columns - 100% of the values are missing

seasons_stats_df = seasons_stats_df.drop(["blank1", "blank2"], axis=1)
seasons_stats_df

Unnamed: 0.1,Unnamed: 0,year,player,position,age,team,games,games_started,minutes_played,player_efficiency,...,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24686,24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,...,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,...,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,...,0.600,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,...,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0


**What was originally 53 columns is now only 51 - drop successful.**

In [43]:
# Attempted to count total NaN at each row in seasons_stats_df 
# By running the following code:

# for i in range(len(seasons_stats_df.index)) : 
#     print(" Total NaN in row", i + 1, ":", 
#           seasons_stats_df.iloc[i].isnull().sum()) 

# Realized after running this, it was going through and listing all NaNs
# found in each of the 24,691 rows individually.  And that, well, I don't have another 
# millions years to live.