In [1]:
import pandas as pd
import numpy as np
import os
import pprint

%matplotlib inline
import matplotlib as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import nba_acquire

np.random.seed(123)

In [2]:
pd.__version__

'0.25.3'

## 'player_data_df' work

In [3]:
player_data_df = nba_acquire.player_data()

player_data_df.head()

Player Data Information
Consisting of 4550 rows and 8 columns
It has loads of data, but also has 341 missing values.


Unnamed: 0,name,year_start,year_end,position,height_inches,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,82,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,81,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,86,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,73,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,78,223.0,"November 3, 1974",San Jose State University


In [4]:
player_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4550 entries, 0 to 4549
Data columns (total 8 columns):
name             4550 non-null object
year_start       4550 non-null int64
year_end         4550 non-null int64
position         4550 non-null object
height_inches    4550 non-null int64
weight           4550 non-null float64
birth_date       4550 non-null object
college          4550 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 284.5+ KB


In [5]:
# because of merge later, we need to verify no columns are dupes
player_data_df.columns.duplicated()

array([False, False, False, False, False, False, False, False])

In [6]:
# change 'name' column to 'player' column so we can merge later
player_data_df = player_data_df.rename(columns={"name" : "player"})

In [7]:
# Seeing if there are any duplicate players in the 4,550 listed
player_data_df.index.nunique()

4550

In [8]:
player_data_df.player.nunique()

4500

**4,550 rows of data, but 4,500 unique names.  Could be repeats - let's check to see what these duplicate player names look like:**

In [9]:
# assign the column data to a variable
names = player_data_df["player"]

# checks to see if there a name is duplicated in the player column, 
# then sorts_values on "player"
player_data_df[names.isin(names[names.duplicated()])].sort_values("player")

Unnamed: 0,player,year_start,year_end,position,height_inches,weight,birth_date,college
423,Bill Bradley,1968,1968,G,71,165.0,"June 16, 1941",No college listed
424,Bill Bradley,1968,1977,F-G,77,205.0,"July 28, 1943",Princeton University
1069,Bob Duffy,1963,1965,G,75,185.0,"September 26, 1940",Colgate University
1068,Bob Duffy,1947,1947,F,76,175.0,"July 5, 1922",Tulane University
2062,Bobby Jones,1975,1986,F,81,210.0,"December 18, 1951",University of North Carolina
...,...,...,...,...,...,...,...,...
1938,Tony Jackson,1981,1981,G,72,170.0,"January 17, 1958",Florida State University
2786,Tony Mitchell,2014,2014,F,78,216.0,"August 7, 1989",University of Alabama
2787,Tony Mitchell,2014,2014,F,80,235.0,"April 7, 1992",University of North Texas
3522,Walker Russell,2012,2012,G,72,170.0,"October 6, 1982",Jacksonville State University


In [10]:
names.value_counts()

Charles Jones      3
Charles Smith      3
George Johnson     3
David Lee          2
Gary Payton        2
                  ..
Jordan Hamilton    1
Joe Colone         1
Tony Windis        1
Bill Downey        1
Bo Erias           1
Name: player, Length: 4500, dtype: int64

**Looks like the reason there are only 4,500 unique player names in the 4,550 rows of data is because there are 50 instances of duplicated names.  Thanks to the other data in the DataFrame, we can determine the details of each player as we do analysis.**

## Now for 'players_df' work

In [11]:
players_df = nba_acquire.data_of_players()

players_df.head()

Player Data Information
Consisting of 3922 rows and 8 columns
Not all data is present.  We still have 1306 missing values.


Unnamed: 0,Player,height,weight,college,born,birth_city,birth_state
0,Curly Armstrong,70,169,Indiana University,1918,Not Available,Not Available
1,Cliff Barker,74,182,University of Kentucky,1921,Yorktown,Indiana
2,Leo Barnhorst,75,189,University of Notre Dame,1924,Not Available,Not Available
3,Ed Bartels,77,193,North Carolina State University,1925,Not Available,Not Available
4,Ralph Beard,70,173,University of Kentucky,1927,Hardinsburg,Kentucky


In [12]:
# change 'Player' column to 'player' so we can merge later
players_df = players_df.rename(columns={"Player" : "player"})

In [13]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3921 entries, 0 to 3921
Data columns (total 7 columns):
player         3921 non-null object
height         3921 non-null int64
weight         3921 non-null int64
college        3921 non-null object
born           3921 non-null int64
birth_city     3921 non-null object
birth_state    3921 non-null object
dtypes: int64(3), object(4)
memory usage: 245.1+ KB


In [14]:
# make sure no columns are duped for later merger
players_df.columns.duplicated()

array([False, False, False, False, False, False, False])

## Lastly, 'seasons_stats_df'

In [15]:
seasons_stats_df = nba_acquire.seasons_stats()

seasons_stats_df.head()

Statistical Information by Season
Consisting of 24691 rows and 53 columns
While informative, still needs work: we are missing 154919 values.


Unnamed: 0,year,player,position,age,team,games,games_started,minutes_played,player_efficiency,true_shooting_%,...,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,1950,Curly Armstrong,G-F,31,FTW,63,1,1,1.0,0.368,...,0.705,1,1,1,176,1,1,1,217,458
1,1950,Cliff Barker,SG,29,INO,49,1,1,1.0,0.435,...,0.708,1,1,1,109,1,1,1,99,279
2,1950,Leo Barnhorst,SF,25,CHS,67,1,1,1.0,0.394,...,0.698,1,1,1,140,1,1,1,192,438
3,1950,Ed Bartels,F,24,TOT,15,1,1,1.0,0.312,...,0.559,1,1,1,20,1,1,1,29,63
4,1950,Ed Bartels,F,24,DNN,13,1,1,1.0,0.308,...,0.548,1,1,1,20,1,1,1,27,59


In [16]:
seasons_stats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24624 entries, 0 to 24690
Data columns (total 49 columns):
year                      24624 non-null int64
player                    24624 non-null object
position                  24624 non-null object
age                       24624 non-null int64
team                      24624 non-null object
games                     24624 non-null int64
games_started             24624 non-null int64
minutes_played            24624 non-null int64
player_efficiency         24624 non-null float64
true_shooting_%           24624 non-null float64
three_pt_tries            24624 non-null float64
free_throws               24624 non-null float64
off_rebound_%             24624 non-null float64
def_rebound_%             24624 non-null float64
total_rebound_%           24624 non-null float64
assist_%                  24624 non-null float64
steal_%                   24624 non-null float64
block_%                   24624 non-null float64
turnover_%            

In [17]:
# checking for dupes for later merge
seasons_stats_df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

**We have a duplicated column at column 39.  For a successful dataframe merger, we need to get rid of that column.**

In [18]:
# assigns new seasons_stats_df to be all the columns that
# are NOT (~) duplicated

# if we don't do this now, when we import the final_df 
# from this notebook, we will get the following error:
# "ValueError: Plan shapes are not aligned"

# that error tells us there is a duplicate column in one 
# of the dataframes

seasons_stats_df = seasons_stats_df.loc[:,~seasons_stats_df.columns.duplicated()]

In [19]:
seasons_stats_df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

**Now that the duplicated column has been eliminated, we can address the fact that 'seasons_stats_df' contains a lot of floats where we could have integers.  Again: hate the '.0' if there's no need for one**

In [20]:
# using dictionary to convert specific columns 
convert_dict = {"2_point_tries" : int, 
                "2_pointers" : int,
                "3_point_tries" : int,
                "3_pointers" : int, 
                "age" : int,
                "assists" : int, 
                "blocks" : int, 
                "def_rebounds" : int, 
                "field_goal_attempts" : int, 
                "field_goals" : int,
                "free_throws" : int, 
                "free_throw_attempts" : int, 
                "games" : int, 
                "games_started" : int, 
                "off_rebounds" : int, 
                "personal_fouls" : int, 
                "points" : int, 
                "steals" : int,
                "three_pt_tries" : int,
                "total_rebounds" : int, 
                "turnovers" : int, 
                "year" : int,
               } 

# turn off column limit so I can see the data in all columns:
pd.options.display.max_columns = None
  
seasons_stats_df = seasons_stats_df.astype(convert_dict) 
print(seasons_stats_df.dtypes.tolist()) 

# # save the columns as a variable
# float_cols = ["2_point_tries", "2_pointers", "3_point_tries", "3_pointers", 
#              "age", "assists", "blocks", "def_rebounds", "field_goal_attempts",
#              "field_goals", "free_throw_attempts", "games", "games_started",
#              "off_rebounds", "personal_fouls", "points", "steals", 
#              "total_rebounds", "turnovers", "year",]

# # convert those columns to integers
# seasons_stats_df = seasons_stats_df[float_cols].astype(int) 

# # check to see the datatypes have changed to what I wanted
# seasons_stats_df.head()

[dtype('int64'), dtype('O'), dtype('O'), dtype('int64'), dtype('O'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'), dtype('int64'), dtype('float64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64')]


In [21]:
seasons_stats_df.head()

Unnamed: 0,year,player,position,age,team,games,games_started,minutes_played,player_efficiency,true_shooting_%,three_pt_tries,free_throws,off_rebound_%,def_rebound_%,total_rebound_%,assist_%,steal_%,block_%,turnover_%,usage_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,off_box_plus_minus,def_box_plus_minus,box_plus_minus,value_over_replacement,field_goals,field_goal_attempts,field_goal_%,3_pointers,3_point_tries,3_point_%,2_pointers,2_point_tries,2_point_%,effective_field_goal_%,free_throw_attempts,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points
0,1950,Curly Armstrong,G-F,31,FTW,63,1,1,1.0,0.368,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.0,1.0,1.0,1.0,1.0,144,516,0.279,1,1,1.0,144,516,0.279,0.279,241,0.705,1,1,1,176,1,1,1,217,458
1,1950,Cliff Barker,SG,29,INO,49,1,1,1.0,0.435,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.0,1.0,1.0,1.0,1.0,102,274,0.372,1,1,1.0,102,274,0.372,0.372,106,0.708,1,1,1,109,1,1,1,99,279
2,1950,Leo Barnhorst,SF,25,CHS,67,1,1,1.0,0.394,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.0,1.0,1.0,1.0,1.0,174,499,0.349,1,1,1.0,174,499,0.349,0.349,129,0.698,1,1,1,140,1,1,1,192,438
3,1950,Ed Bartels,F,24,TOT,15,1,1,1.0,0.312,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.0,1.0,1.0,1.0,1.0,22,86,0.256,1,1,1.0,22,86,0.256,0.256,34,0.559,1,1,1,20,1,1,1,29,63
4,1950,Ed Bartels,F,24,DNN,13,1,1,1.0,0.308,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.0,1.0,1.0,1.0,1.0,21,82,0.256,1,1,1.0,21,82,0.256,0.256,31,0.548,1,1,1,20,1,1,1,27,59


**Because a lot of this info is repetitive and we can do a better analysis of the data in such a manner, I am combining the three dataframes into one**

- The backbone will be the seasons_stats_df, as it is by far the most comprehensive.  However, 'player_data_df' and 'players_df' both have additional information, like each player's height (in inches), weight (in pounds), and the college the attended (if they went to college)

- When doing analysis we can append the data as needed

**First, though, we need to develop a data dictionary to figure out what all this is.**

**player_efficiency -** per its inventor (ESPN columnist John Hollinger), it "sums up all a player's positive accomplishments, subtracts the negative accomplishments, and returns a per-minute rating of a player's performance."  Basically, the formula for calculating it is:
>*positive deed - negative deed = per minute rating of player's performance*

**true_shooting_% -** a measure of shooting efficiency taking into account 3-pointers, 2-pointers, and free throws.  Calculated using *points / (2 X shooting attempts)*

**rebound -** when someone misses a shot, whoever grabs the ball after the missed shot gets the 'rebound.'  If the person is on the same team as the one who took the shot, that's called an 'offensive rebound.'  If on the other team, a 'defensive rebound.' 

**off_rebound_% -** an estimated percentage of the total amount of rebounds a player got per game that maintained possession for their team 

**def_rebound_% -** an estimated percentage of the total amount of rebounds a player got per game that turned possession in favor of their team. 

**asst_% -** an assist is when one player passes to another who makes the basket after receiving the pass.  They receiver is entitled to one dribble after getting the pass - more than that, and the player who passed them the ball loses the assist

**usage_% -** an estimate of the percentage of team plays involving the individual player during his time on the court

**win_shares -** an estimate of the number of team wins to which the individual player contributed.  It includes the number of points the player contributed, as well as that player's time of possession compared to the average time of possession for the rest of the league

- **offensive_win_shares -** 

- **defensive_win_shares -**

- **win_shares_per_48min -**

**box_plus_minus _**

- **off_box_plus_minus -**

- **def_box_plus_minus -**

**value_over_replacement -**

**field_goals -**

**effective_field_goal_% -**

In [22]:
# seasons_stats_df["height_inches"] = np.where(seasons_stats_df["player"] == player_data_df["name"],
#                                              'True','False')

# yielded the following error:
# ValueError: Can only compare identically-labeled Series objects

**^^Have to change the 'name' column in player_data_df to 'player' so that we can compare columns in different dataframes**

In [23]:
# player_data_df = player_data_df.rename(columns={"name" : "player"})

# print(player_data_df.head())

**Done.  Now let's try comparing the two df's again:**

In [24]:
# seasons_stats_df["height_inches"] = np.where(seasons_stats_df["player"] == player_data_df["player"],
#                                               'True','False')

# Nope, same error:
# ValueError: Can only compare identically-labeled Series objects

In [25]:
# df4 = player_data_df.loc[player_data_df["player"] == seasons_stats_df["player"]].values

# df4.head()

# Nope.  Same error:
#ValueError: Can only compare identically-labeled Series objects

**Looks like the problem is that whenever I try to work with these dataframes, the indexes don't match, and so I need to do that:**

In [26]:
# for player_data_df:
# player_data_df = player_data_df.set_index("player")

# player_data_df.head()

# didn't do any good

In [27]:
# for seasons_stats_df:
# seasons_stats_df = seasons_stats_df.set_index("player")

# seasons_stats_df.head()

# didn't do any good

**Okay, so indexes are set, now let's try that code again:**

In [28]:
# seasons_stats_df["height_inches"] = np.where(seasons_stats_df["player"] == player_data_df["player"],
#                                               'True','False')

# Nope. Got this error:
# KeyError: 'player'

In [29]:
# seasons_stats_df["height_inches"] = np.where(seasons_stats_df.index == player_data_df.index,
#                                               'True','False')

# Negatory.  Got this error:
# ValueError: Lengths must match to compare

**Okay, so running into some problems.  Need to consider the following:**

- since the indexes are different (substantially, given player_data_df only has 4,550 rows and seasons_stats_df has 24,691 rows), I can't just merge the player_data_df column 'height_inches' onto the seasons_stats_df wherever the names match.  

- I have to search 'seasons_stats_df' for player matches

In [30]:
# checking to make sure this is the info in 'player' column of 'player_data_df'
print(player_data_df.index)

RangeIndex(start=0, stop=4550, step=1)


In [31]:
# try merging

final_df = pd.merge(seasons_stats_df, player_data_df, on=["player"], how="left")

final_df.head()

# Individual_df = pd.merge(Person_df, Target_df2, on=['Country'], how='left')
# Individual_df['TargetID'] = Individual_df['Name'] + df3['Emplid'].astype(str) + ((df3.groupby('Emplid').cumcount() + 1).astype(str).str.zfill(2))
# Individual_df = Individual_df[['TargetID', 'Category', 'Target']]
# print Individual_df

Unnamed: 0,year,player,position_x,age,team,games,games_started,minutes_played,player_efficiency,true_shooting_%,three_pt_tries,free_throws,off_rebound_%,def_rebound_%,total_rebound_%,assist_%,steal_%,block_%,turnover_%,usage_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,off_box_plus_minus,def_box_plus_minus,box_plus_minus,value_over_replacement,field_goals,field_goal_attempts,field_goal_%,3_pointers,3_point_tries,3_point_%,2_pointers,2_point_tries,2_point_%,effective_field_goal_%,free_throw_attempts,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,year_start,year_end,position_y,height_inches,weight,birth_date,college
0,1950,Curly Armstrong,G-F,31,FTW,63,1,1,1.0,0.368,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.0,1.0,1.0,1.0,1.0,144,516,0.279,1,1,1.0,144,516,0.279,0.279,241,0.705,1,1,1,176,1,1,1,217,458,1949.0,1951.0,G-F,71.0,170.0,"November 1, 1918",Indiana University
1,1950,Cliff Barker,SG,29,INO,49,1,1,1.0,0.435,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.0,1.0,1.0,1.0,1.0,102,274,0.372,1,1,1.0,102,274,0.372,0.372,106,0.708,1,1,1,109,1,1,1,99,279,1950.0,1952.0,G,74.0,185.0,"January 15, 1921",University of Kentucky
2,1950,Leo Barnhorst,SF,25,CHS,67,1,1,1.0,0.394,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.0,1.0,1.0,1.0,1.0,174,499,0.349,1,1,1.0,174,499,0.349,0.349,129,0.698,1,1,1,140,1,1,1,192,438,1950.0,1954.0,F-G,76.0,190.0,"May 11, 1924",University of Notre Dame
3,1950,Ed Bartels,F,24,TOT,15,1,1,1.0,0.312,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.0,1.0,1.0,1.0,1.0,22,86,0.256,1,1,1.0,22,86,0.256,0.256,34,0.559,1,1,1,20,1,1,1,29,63,1950.0,1951.0,F,77.0,195.0,"October 8, 1925",North Carolina State University
4,1950,Ed Bartels,F,24,DNN,13,1,1,1.0,0.308,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.0,1.0,1.0,1.0,1.0,21,82,0.256,1,1,1.0,21,82,0.256,0.256,31,0.548,1,1,1,20,1,1,1,27,59,1950.0,1951.0,F,77.0,195.0,"October 8, 1925",North Carolina State University


In [32]:
final_df.columns

Index(['year', 'player', 'position_x', 'age', 'team', 'games', 'games_started',
       'minutes_played', 'player_efficiency', 'true_shooting_%',
       'three_pt_tries', 'free_throws', 'off_rebound_%', 'def_rebound_%',
       'total_rebound_%', 'assist_%', 'steal_%', 'block_%', 'turnover_%',
       'usage_%', 'offensive_win_shares', 'defensive_win_shares', 'win_shares',
       'win_shares_per_48min', 'off_box_plus_minus', 'def_box_plus_minus',
       'box_plus_minus', 'value_over_replacement', 'field_goals',
       'field_goal_attempts', 'field_goal_%', '3_pointers', '3_point_tries',
       '3_point_%', '2_pointers', '2_point_tries', '2_point_%',
       'effective_field_goal_%', 'free_throw_attempts', 'free_throw_%',
       'off_rebounds', 'def_rebounds', 'total_rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'personal_fouls', 'points', 'year_start',
       'year_end', 'position_y', 'height_inches', 'weight', 'birth_date',
       'college'],
      dtype='object')

**Lots of info here, but we could also use the information from the players_df regarding where these players were born.  Would be helpful for some type of analysis where we're asked 'In what region are most NBA players born?'**

In [33]:
# merge the two dataframes
# final_df = pd.concat([final_df, players_df])

# gave me the following error:
# AttributeError: 'NoneType' object has no attribute 'is_extension'

**Based on some investigation, this error(^^) is due to duplicate column names.  Looks like I need to drop the names from players_df has in common with final_df**

In [34]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3921 entries, 0 to 3921
Data columns (total 7 columns):
player         3921 non-null object
height         3921 non-null int64
weight         3921 non-null int64
college        3921 non-null object
born           3921 non-null int64
birth_city     3921 non-null object
birth_state    3921 non-null object
dtypes: int64(3), object(4)
memory usage: 245.1+ KB


In [35]:
# players_df = players_df.drop(["player", "height", "weight", "college",], axis=1)
# players_df.head()

**Now that the similar column names have been dropped, let's try the merge again:**

In [36]:
final_df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

**One duplicate column = "free throws".  Let's check the values so we can decide which to drop:**

In [37]:
# checks to see which columns are NOT duplicates (indicated by ~ for 'not')
final_df = final_df.loc[:,~final_df.columns.duplicated()]
final_df

Unnamed: 0,year,player,position_x,age,team,games,games_started,minutes_played,player_efficiency,true_shooting_%,three_pt_tries,free_throws,off_rebound_%,def_rebound_%,total_rebound_%,assist_%,steal_%,block_%,turnover_%,usage_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,off_box_plus_minus,def_box_plus_minus,box_plus_minus,value_over_replacement,field_goals,field_goal_attempts,field_goal_%,3_pointers,3_point_tries,3_point_%,2_pointers,2_point_tries,2_point_%,effective_field_goal_%,free_throw_attempts,free_throw_%,off_rebounds,def_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,year_start,year_end,position_y,height_inches,weight,birth_date,college
0,1950,Curly Armstrong,G-F,31,FTW,63,1,1,1.0,0.368,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.000,1.0,1.0,1.0,1.0,144,516,0.279,1,1,1.000,144,516,0.279,0.279,241,0.705,1,1,1,176,1,1,1,217,458,1949.0,1951.0,G-F,71.0,170.0,"November 1, 1918",Indiana University
1,1950,Cliff Barker,SG,29,INO,49,1,1,1.0,0.435,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.000,1.0,1.0,1.0,1.0,102,274,0.372,1,1,1.000,102,274,0.372,0.372,106,0.708,1,1,1,109,1,1,1,99,279,1950.0,1952.0,G,74.0,185.0,"January 15, 1921",University of Kentucky
2,1950,Leo Barnhorst,SF,25,CHS,67,1,1,1.0,0.394,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.000,1.0,1.0,1.0,1.0,174,499,0.349,1,1,1.000,174,499,0.349,0.349,129,0.698,1,1,1,140,1,1,1,192,438,1950.0,1954.0,F-G,76.0,190.0,"May 11, 1924",University of Notre Dame
3,1950,Ed Bartels,F,24,TOT,15,1,1,1.0,0.312,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.000,1.0,1.0,1.0,1.0,22,86,0.256,1,1,1.000,22,86,0.256,0.256,34,0.559,1,1,1,20,1,1,1,29,63,1950.0,1951.0,F,77.0,195.0,"October 8, 1925",North Carolina State University
4,1950,Ed Bartels,F,24,DNN,13,1,1,1.0,0.308,1,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.000,1.0,1.0,1.0,1.0,21,82,0.256,1,1,1.000,21,82,0.256,0.256,31,0.548,1,1,1,20,1,1,1,27,59,1950.0,1951.0,F,77.0,195.0,"October 8, 1925",North Carolina State University
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25225,2017,Cody Zeller,PF,24,CHO,62,58,1725,16.7,0.604,0,0,8.6,17.3,12.9,9.1,1.8,3.0,10.9,15.5,3.4,2.2,5.6,0.157,-0.2,2.3,2.1,1.8,253,443,0.571,0,1,0.000,253,442,0.572,0.571,196,0.679,135,270,405,99,62,58,65,189,639,2014.0,2018.0,C-F,84.0,240.0,"October 5, 1992",Indiana University
25226,2017,Tyler Zeller,C,27,BOS,51,5,525,13.0,0.508,0,0,9.2,17.0,13.2,12.2,0.7,3.3,10.2,16.5,0.5,0.6,1.0,0.094,-3.2,0.8,-2.5,-0.1,78,158,0.494,0,1,0.000,78,157,0.497,0.494,39,0.564,43,81,124,42,7,21,20,61,178,2013.0,2018.0,F-C,84.0,253.0,"January 17, 1990",University of North Carolina
25227,2017,Stephen Zimmerman,C,20,ORL,19,0,108,7.3,0.346,0,0,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,-0.1,0.1,0.0,-0.005,-7.8,0.4,-7.3,-0.1,10,31,0.323,0,0,1.000,10,31,0.323,0.323,5,0.600,11,24,35,4,2,5,3,17,23,2017.0,2017.0,C,84.0,240.0,"September 9, 1996","University of Nevada, Las Vegas"
25228,2017,Paul Zipser,SF,22,CHI,44,18,843,6.9,0.503,0,0,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,-0.3,0.8,0.5,0.030,-3.6,-0.1,-3.7,-0.4,88,221,0.398,33,99,0.333,55,122,0.451,0.473,40,0.775,15,110,125,36,15,16,40,78,240,2017.0,2018.0,G-F,80.0,215.0,"February 18, 1994",No college listed


In [38]:
final_df.columns

Index(['year', 'player', 'position_x', 'age', 'team', 'games', 'games_started',
       'minutes_played', 'player_efficiency', 'true_shooting_%',
       'three_pt_tries', 'free_throws', 'off_rebound_%', 'def_rebound_%',
       'total_rebound_%', 'assist_%', 'steal_%', 'block_%', 'turnover_%',
       'usage_%', 'offensive_win_shares', 'defensive_win_shares', 'win_shares',
       'win_shares_per_48min', 'off_box_plus_minus', 'def_box_plus_minus',
       'box_plus_minus', 'value_over_replacement', 'field_goals',
       'field_goal_attempts', 'field_goal_%', '3_pointers', '3_point_tries',
       '3_point_%', '2_pointers', '2_point_tries', '2_point_%',
       'effective_field_goal_%', 'free_throw_attempts', 'free_throw_%',
       'off_rebounds', 'def_rebounds', 'total_rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'personal_fouls', 'points', 'year_start',
       'year_end', 'position_y', 'height_inches', 'weight', 'birth_date',
       'college'],
      dtype='object')

**Okay, looks like the second instance of 'free_throws' was removed, keeping the data from the first.  On to the next df, 'players_df':**

In [39]:
players_df.columns.duplicated()

array([False, False, False, False, False, False, False])

**Okay, no duplicates anywhere.  NOW let's try merging the two df's into a final one.**

In [40]:
final_df = pd.concat([final_df, players_df])
# ^^ works, but need raised the following error on import for exploration:
# "ValueError: Plan shapes are not aligned"

# if concat doesn't work:
# trying merge instead of concat
# final_df = pd.merge(final_df, player_data_df, on=["player"], how="left")

# turn off column limit so I can see the data in all columns:
pd.options.display.max_columns = None

final_df.head()

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
0,0.279,516.0,144.0,1.0,1.0,1.0,31.0,1.0,176.0,,"November 1, 1918",,1.0,1.0,,1.0,Indiana University,1.0,1.0,1.0,3.6,0.279,0.279,516.0,144.0,0.705,241.0,0.0,63.0,1.0,,71.0,1.0,1.0,1.0,1.0,-0.1,217.0,Curly Armstrong,1.0,458.0,G-F,G-F,1.0,1.0,FTW,1.0,1.0,1.0,0.368,1.0,1.0,1.0,1.0,170.0,3.5,1.0,1950.0,1951.0,1949.0
1,0.372,274.0,102.0,1.0,1.0,1.0,29.0,1.0,109.0,,"January 15, 1921",,1.0,1.0,,1.0,University of Kentucky,1.0,1.0,1.0,0.6,0.372,0.372,274.0,102.0,0.708,106.0,0.0,49.0,1.0,,74.0,1.0,1.0,1.0,1.0,1.6,99.0,Cliff Barker,1.0,279.0,SG,G,1.0,1.0,INO,1.0,1.0,1.0,0.435,1.0,1.0,1.0,1.0,185.0,2.2,1.0,1950.0,1952.0,1950.0
2,0.349,499.0,174.0,1.0,1.0,1.0,25.0,1.0,140.0,,"May 11, 1924",,1.0,1.0,,1.0,University of Notre Dame,1.0,1.0,1.0,2.8,0.349,0.349,499.0,174.0,0.698,129.0,0.0,67.0,1.0,,76.0,1.0,1.0,1.0,1.0,0.9,192.0,Leo Barnhorst,1.0,438.0,SF,F-G,1.0,1.0,CHS,1.0,1.0,1.0,0.394,1.0,1.0,1.0,1.0,190.0,3.6,1.0,1950.0,1954.0,1950.0
3,0.256,86.0,22.0,1.0,1.0,1.0,24.0,1.0,20.0,,"October 8, 1925",,1.0,1.0,,1.0,North Carolina State University,1.0,1.0,1.0,-0.1,0.256,0.256,86.0,22.0,0.559,34.0,0.0,15.0,1.0,,77.0,1.0,1.0,1.0,1.0,-0.5,29.0,Ed Bartels,1.0,63.0,F,F,1.0,1.0,TOT,1.0,1.0,1.0,0.312,1.0,1.0,1.0,1.0,195.0,-0.6,1.0,1950.0,1951.0,1950.0
4,0.256,82.0,21.0,1.0,1.0,1.0,24.0,1.0,20.0,,"October 8, 1925",,1.0,1.0,,1.0,North Carolina State University,1.0,1.0,1.0,-0.1,0.256,0.256,82.0,21.0,0.548,31.0,0.0,13.0,1.0,,77.0,1.0,1.0,1.0,1.0,-0.5,27.0,Ed Bartels,1.0,59.0,F,F,1.0,1.0,DNN,1.0,1.0,1.0,0.308,1.0,1.0,1.0,1.0,195.0,-0.6,1.0,1950.0,1951.0,1950.0


In [41]:
final_df.columns

Index(['2_point_%', '2_point_tries', '2_pointers', '3_point_%',
       '3_point_tries', '3_pointers', 'age', 'assist_%', 'assists',
       'birth_city', 'birth_date', 'birth_state', 'block_%', 'blocks', 'born',
       'box_plus_minus', 'college', 'def_box_plus_minus', 'def_rebound_%',
       'def_rebounds', 'defensive_win_shares', 'effective_field_goal_%',
       'field_goal_%', 'field_goal_attempts', 'field_goals', 'free_throw_%',
       'free_throw_attempts', 'free_throws', 'games', 'games_started',
       'height', 'height_inches', 'minutes_played', 'off_box_plus_minus',
       'off_rebound_%', 'off_rebounds', 'offensive_win_shares',
       'personal_fouls', 'player', 'player_efficiency', 'points', 'position_x',
       'position_y', 'steal_%', 'steals', 'team', 'three_pt_tries',
       'total_rebound_%', 'total_rebounds', 'true_shooting_%', 'turnover_%',
       'turnovers', 'usage_%', 'value_over_replacement', 'weight',
       'win_shares', 'win_shares_per_48min', 'year', 'year_end'

**Whenever you merge dataframes, you risk duplicating keys, and that results in a doubling of rows.  For example, if I wanted to look at a single index, I would code <span style="color:blue">'df.loc[[index 15]]'</span> (15 was picked at random for example's sake) and get:***

|Output|
|:---------------------------------------:|
|15     Data.    Data.    Data.    Data.  |
|15     Data.    Data.    Data.    Data.  |

**Solution is to drop the duplicates to the left and right of the merge**

In [42]:
# drop dupes on left
final_df.drop_duplicates(subset="player", inplace=True)

# drop dupes on right
# final_df.drop_duplicates(subset=right_key)

In [43]:
# check
final_df.loc[[54]]

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
54,0.327,275.0,90.0,1.0,1.0,1.0,23.0,1.0,38.0,,"September 10, 1926",,1.0,1.0,,1.0,Muhlenberg College,1.0,1.0,1.0,0.7,0.327,0.327,275.0,90.0,0.689,106.0,0.0,45.0,1.0,,74.0,1.0,1.0,1.0,1.0,0.3,107.0,Harry Donovan,1.0,253.0,G,G,1.0,1.0,NYK,1.0,1.0,1.0,0.393,1.0,1.0,1.0,1.0,180.0,1.0,1.0,1950.0,1950.0,1950.0


In [44]:
# double check
final_df.loc[[103]]

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
103,0.385,65.0,25.0,1.0,1.0,1.0,25.0,1.0,15.0,,"May 17, 1924",,1.0,1.0,,1.0,Mount Union College,1.0,1.0,1.0,-0.1,0.385,0.385,65.0,25.0,0.545,11.0,0.0,13.0,1.0,,75.0,1.0,1.0,1.0,1.0,0.1,13.0,Bill Herman,1.0,56.0,G,G,1.0,1.0,DNN,1.0,1.0,1.0,0.401,1.0,1.0,1.0,1.0,170.0,0.1,1.0,1950.0,1950.0,1950.0


In [45]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3921 entries, 0 to 25229
Data columns (total 60 columns):
2_point_%                 3921 non-null float64
2_point_tries             3921 non-null float64
2_pointers                3921 non-null float64
3_point_%                 3921 non-null float64
3_point_tries             3921 non-null float64
3_pointers                3921 non-null float64
age                       3921 non-null float64
assist_%                  3921 non-null float64
assists                   3921 non-null float64
birth_city                0 non-null object
birth_date                3765 non-null object
birth_state               0 non-null object
block_%                   3921 non-null float64
blocks                    3921 non-null float64
born                      0 non-null float64
box_plus_minus            3921 non-null float64
college                   3765 non-null object
def_box_plus_minus        3921 non-null float64
def_rebound_%             3921 non-null f

In [46]:
# # using dictionary to convert specific columns 
# final_convert = {"2_point_tries" : int, 
#                 "2_pointers" : int,
#                 "3_point_tries" : int,
#                 "3_pointers" : int, 
#                 "age" : int,
#                 "assists" : int, 
#                 "blocks" : int,
#                 "born" : int, 
#                 "def_rebounds" : int, 
#                 "field_goal_attempts" : int, 
#                 "field_goals" : int,
#                 "free_throws" : int, 
#                 "free_throw_attempts" : int, 
#                 "games" : int, 
#                 "games_started" : int, 
#                 "height_inches" : int,
#                 "minutes_played" : int, 
#                 "off_rebounds" : int, 
#                 "personal_fouls" : int, 
#                 "points" : int, 
#                 "steals" : int,
#                 "three_pt_tries" : int,
#                 "total_rebounds" : int, 
#                 "turnovers" : int,
#                 "weight" : int,
#                 "year" : int,
#                 "year_end" : int, 
#                 "year_start" : int,
#                } 

# # turn off column limit so I can see the data in all columns:
# pd.options.display.max_columns = None
  
# final_df = final_df.astype(final_convert) 

# final_df.head()

## Above code gave me the following error:
## ValueError: Cannot convert non-finite values (NA or inf) to integer

**Looks like before I change all these datatypes to integers, I need to get rid of the NaN values.**

- Plan is to throw a 1 into all the NaNs - imputing averages wouldn't make sense, and using '0' would throw off the maths

In [47]:
final_df = final_df.fillna(1)

final_df.head(3)

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
0,0.279,516.0,144.0,1.0,1.0,1.0,31.0,1.0,176.0,1,"November 1, 1918",1,1.0,1.0,1.0,1.0,Indiana University,1.0,1.0,1.0,3.6,0.279,0.279,516.0,144.0,0.705,241.0,0.0,63.0,1.0,1.0,71.0,1.0,1.0,1.0,1.0,-0.1,217.0,Curly Armstrong,1.0,458.0,G-F,G-F,1.0,1.0,FTW,1.0,1.0,1.0,0.368,1.0,1.0,1.0,1.0,170.0,3.5,1.0,1950.0,1951.0,1949.0
1,0.372,274.0,102.0,1.0,1.0,1.0,29.0,1.0,109.0,1,"January 15, 1921",1,1.0,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,0.6,0.372,0.372,274.0,102.0,0.708,106.0,0.0,49.0,1.0,1.0,74.0,1.0,1.0,1.0,1.0,1.6,99.0,Cliff Barker,1.0,279.0,SG,G,1.0,1.0,INO,1.0,1.0,1.0,0.435,1.0,1.0,1.0,1.0,185.0,2.2,1.0,1950.0,1952.0,1950.0
2,0.349,499.0,174.0,1.0,1.0,1.0,25.0,1.0,140.0,1,"May 11, 1924",1,1.0,1.0,1.0,1.0,University of Notre Dame,1.0,1.0,1.0,2.8,0.349,0.349,499.0,174.0,0.698,129.0,0.0,67.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,0.9,192.0,Leo Barnhorst,1.0,438.0,SF,F-G,1.0,1.0,CHS,1.0,1.0,1.0,0.394,1.0,1.0,1.0,1.0,190.0,3.6,1.0,1950.0,1954.0,1950.0


In [48]:
# check to see if any columns are duplicated
final_df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

In [49]:
# Now we can try the lengthy column datatype change code from above:

final_convert = {"2_point_tries" : int, 
                "2_pointers" : int,
                "3_point_tries" : int,
                "3_pointers" : int, 
                "age" : int,
                "assists" : int, 
                "blocks" : int,
                "born" : int, 
                "def_rebounds" : int, 
                "field_goal_attempts" : int, 
                "field_goals" : int,
                "free_throws" : int, 
                "free_throw_attempts" : int, 
                "games" : int, 
                "games_started" : int, 
                "height_inches" : int,
                "minutes_played" : int, 
                "off_rebounds" : int, 
                "personal_fouls" : int, 
                "points" : int, 
                "steals" : int,
                "three_pt_tries" : int,
                "total_rebounds" : int, 
                "turnovers" : int,
                "weight" : int,
                "year" : int,
                "year_end" : int, 
                "year_start" : int,
               } 
# ^^ works, but let me try something else where I don't have to assign anything
# to a variable, because I'm getting the following problem

# vv try this?
# final_df = final_df.astype({"2_point_tries" : int, 
#                 "2_pointers" : int,
#                 "3_point_tries" : int,
#                 "3_pointers" : int, 
#                 "age" : int,
#                 "assists" : int, 
#                 "blocks" : int,
#                 "born" : int, 
#                 "def_rebounds" : int, 
#                 "field_goal_attempts" : int, 
#                 "field_goals" : int,
#                 "free_throws" : int, 
#                 "free_throw_attempts" : int, 
#                 "games" : int, 
#                 "games_started" : int, 
#                 "height_inches" : int,
#                 "minutes_played" : int, 
#                 "off_rebounds" : int, 
#                 "personal_fouls" : int, 
#                 "points" : int, 
#                 "steals" : int,
#                 "three_pt_tries" : int,
#                 "total_rebounds" : int, 
#                 "turnovers" : int,
#                 "weight" : int,
#                 "year" : int,
#                 "year_end" : int, 
#                 "year_start_x" : int,})

final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3921 entries, 0 to 25229
Data columns (total 60 columns):
2_point_%                 3921 non-null float64
2_point_tries             3921 non-null float64
2_pointers                3921 non-null float64
3_point_%                 3921 non-null float64
3_point_tries             3921 non-null float64
3_pointers                3921 non-null float64
age                       3921 non-null float64
assist_%                  3921 non-null float64
assists                   3921 non-null float64
birth_city                3921 non-null int64
birth_date                3921 non-null object
birth_state               3921 non-null int64
block_%                   3921 non-null float64
blocks                    3921 non-null float64
born                      3921 non-null float64
box_plus_minus            3921 non-null float64
college                   3921 non-null object
def_box_plus_minus        3921 non-null float64
def_rebound_%             3921 non

In [50]:
final_df.head(3)

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
0,0.279,516.0,144.0,1.0,1.0,1.0,31.0,1.0,176.0,1,"November 1, 1918",1,1.0,1.0,1.0,1.0,Indiana University,1.0,1.0,1.0,3.6,0.279,0.279,516.0,144.0,0.705,241.0,0.0,63.0,1.0,1.0,71.0,1.0,1.0,1.0,1.0,-0.1,217.0,Curly Armstrong,1.0,458.0,G-F,G-F,1.0,1.0,FTW,1.0,1.0,1.0,0.368,1.0,1.0,1.0,1.0,170.0,3.5,1.0,1950.0,1951.0,1949.0
1,0.372,274.0,102.0,1.0,1.0,1.0,29.0,1.0,109.0,1,"January 15, 1921",1,1.0,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,0.6,0.372,0.372,274.0,102.0,0.708,106.0,0.0,49.0,1.0,1.0,74.0,1.0,1.0,1.0,1.0,1.6,99.0,Cliff Barker,1.0,279.0,SG,G,1.0,1.0,INO,1.0,1.0,1.0,0.435,1.0,1.0,1.0,1.0,185.0,2.2,1.0,1950.0,1952.0,1950.0
2,0.349,499.0,174.0,1.0,1.0,1.0,25.0,1.0,140.0,1,"May 11, 1924",1,1.0,1.0,1.0,1.0,University of Notre Dame,1.0,1.0,1.0,2.8,0.349,0.349,499.0,174.0,0.698,129.0,0.0,67.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,0.9,192.0,Leo Barnhorst,1.0,438.0,SF,F-G,1.0,1.0,CHS,1.0,1.0,1.0,0.394,1.0,1.0,1.0,1.0,190.0,3.6,1.0,1950.0,1954.0,1950.0


In [51]:
# turn off column limit so I can see the data in all columns:
pd.options.display.max_columns = None


final_df.head()

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,born,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position_x,position_y,steal_%,steals,team,three_pt_tries,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year,year_end,year_start
0,0.279,516.0,144.0,1.0,1.0,1.0,31.0,1.0,176.0,1,"November 1, 1918",1,1.0,1.0,1.0,1.0,Indiana University,1.0,1.0,1.0,3.6,0.279,0.279,516.0,144.0,0.705,241.0,0.0,63.0,1.0,1.0,71.0,1.0,1.0,1.0,1.0,-0.1,217.0,Curly Armstrong,1.0,458.0,G-F,G-F,1.0,1.0,FTW,1.0,1.0,1.0,0.368,1.0,1.0,1.0,1.0,170.0,3.5,1.0,1950.0,1951.0,1949.0
1,0.372,274.0,102.0,1.0,1.0,1.0,29.0,1.0,109.0,1,"January 15, 1921",1,1.0,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,0.6,0.372,0.372,274.0,102.0,0.708,106.0,0.0,49.0,1.0,1.0,74.0,1.0,1.0,1.0,1.0,1.6,99.0,Cliff Barker,1.0,279.0,SG,G,1.0,1.0,INO,1.0,1.0,1.0,0.435,1.0,1.0,1.0,1.0,185.0,2.2,1.0,1950.0,1952.0,1950.0
2,0.349,499.0,174.0,1.0,1.0,1.0,25.0,1.0,140.0,1,"May 11, 1924",1,1.0,1.0,1.0,1.0,University of Notre Dame,1.0,1.0,1.0,2.8,0.349,0.349,499.0,174.0,0.698,129.0,0.0,67.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,0.9,192.0,Leo Barnhorst,1.0,438.0,SF,F-G,1.0,1.0,CHS,1.0,1.0,1.0,0.394,1.0,1.0,1.0,1.0,190.0,3.6,1.0,1950.0,1954.0,1950.0
3,0.256,86.0,22.0,1.0,1.0,1.0,24.0,1.0,20.0,1,"October 8, 1925",1,1.0,1.0,1.0,1.0,North Carolina State University,1.0,1.0,1.0,-0.1,0.256,0.256,86.0,22.0,0.559,34.0,0.0,15.0,1.0,1.0,77.0,1.0,1.0,1.0,1.0,-0.5,29.0,Ed Bartels,1.0,63.0,F,F,1.0,1.0,TOT,1.0,1.0,1.0,0.312,1.0,1.0,1.0,1.0,195.0,-0.6,1.0,1950.0,1951.0,1950.0
6,0.363,936.0,340.0,1.0,1.0,1.0,22.0,1.0,233.0,1,"December 2, 1927",1,1.0,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,1.2,0.363,0.363,936.0,340.0,0.762,282.0,0.0,60.0,1.0,1.0,70.0,1.0,1.0,1.0,1.0,3.6,132.0,Ralph Beard,1.0,895.0,G,G,1.0,1.0,INO,1.0,1.0,1.0,0.422,1.0,1.0,1.0,1.0,175.0,4.8,1.0,1950.0,1951.0,1950.0


**SUCCESS!!!  DFs merged...**

- **After a quick overview, we can see the following:**
    
    - there are several columns that seem very similar: 'position_x' and 'position_y', 'birth_date' and 'born', and 'year,' 'year_start', and 'year_end'

    

In [66]:
# taking a look at 'position_x' and 'position_y'
final_df[["position_x", "position_y"]]

Unnamed: 0,position_x,position_y
0,G-F,G-F
1,SG,G
2,SF,F-G
3,F,F
6,G,G
...,...,...
25211,SF,F
25214,PF,F
25227,C,C
25228,SF,G-F


In [67]:
# taking a look at 'birth_date' and 'born'
final_df[["birth_date", "born"]].head(10)

Unnamed: 0,birth_date,born
0,"November 1, 1918",1.0
1,"January 15, 1921",1.0
2,"May 11, 1924",1.0
3,"October 8, 1925",1.0
6,"December 2, 1927",1.0
7,"November 22, 1926",1.0
8,"June 15, 1921",1.0
11,"February 25, 1924",1.0
12,"June 29, 1927",1.0
13,"March 11, 1927",1.0


In [68]:
# taking a look at 'year,' 'year_start,' and 'year_end'
final_df[["year", "year_start", "year_end"]].head(10)

Unnamed: 0,year,year_start,year_end
0,1950.0,1949.0,1951.0
1,1950.0,1950.0,1952.0
2,1950.0,1950.0,1954.0
3,1950.0,1950.0,1951.0
6,1950.0,1950.0,1951.0
7,1950.0,1950.0,1950.0
8,1950.0,1949.0,1952.0
11,1950.0,1950.0,1953.0
12,1950.0,1949.0,1950.0
13,1950.0,1950.0,1954.0


In [69]:
# taking a look at '3_point_tries,' 'three_pt_tries'
final_df[["3_point_tries", "three_pt_tries"]].head(10)

Unnamed: 0,3_point_tries,three_pt_tries
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
6,1.0,1.0
7,1.0,1.0
8,1.0,1.0
11,1.0,1.0
12,1.0,1.0
13,1.0,1.0


**What to do with the information above:**

- While similar (and nearly identical), 'position_x' is more specific than 'position_y,' so dropping 'position_y' column from dataframe

- Because it is more specific, 'birth_date' will be chosen over 'born'

- 'Year' is the season in which the stat took place.  Given that 'year_start' and 'year_end' have no clear information (index 0, for example has 'year_start' as 1949 and 'year_end' as 1951 but is talking about the 1950 season), in an attempt to clear things up, dropping both 'year_start' and 'year_end' because the particular seasons are what we're studying

- '3_point_tries' and 'three_pt_tries' are identical columns

In [70]:
# drop 'position_y', 'born', 'year_start', 'year_end', 'three_pt_tries'
final_df = final_df.drop(["position_y", "born", "year_start", "year_end", 
                         "three_pt_tries"], axis=1)

final_df.columns

Index(['2_point_%', '2_point_tries', '2_pointers', '3_point_%',
       '3_point_tries', '3_pointers', 'age', 'assist_%', 'assists',
       'birth_city', 'birth_date', 'birth_state', 'block_%', 'blocks',
       'box_plus_minus', 'college', 'def_box_plus_minus', 'def_rebound_%',
       'def_rebounds', 'defensive_win_shares', 'effective_field_goal_%',
       'field_goal_%', 'field_goal_attempts', 'field_goals', 'free_throw_%',
       'free_throw_attempts', 'free_throws', 'games', 'games_started',
       'height', 'height_inches', 'minutes_played', 'off_box_plus_minus',
       'off_rebound_%', 'off_rebounds', 'offensive_win_shares',
       'personal_fouls', 'player', 'player_efficiency', 'points', 'position_x',
       'steal_%', 'steals', 'team', 'total_rebound_%', 'total_rebounds',
       'true_shooting_%', 'turnover_%', 'turnovers', 'usage_%',
       'value_over_replacement', 'weight', 'win_shares',
       'win_shares_per_48min', 'year'],
      dtype='object')

In [71]:
# looks good, but now we need to rename 'position_x' to just 'position'
final_df = final_df.rename(columns = {"position_x" : "position"})

final_df.head()

Unnamed: 0,2_point_%,2_point_tries,2_pointers,3_point_%,3_point_tries,3_pointers,age,assist_%,assists,birth_city,birth_date,birth_state,block_%,blocks,box_plus_minus,college,def_box_plus_minus,def_rebound_%,def_rebounds,defensive_win_shares,effective_field_goal_%,field_goal_%,field_goal_attempts,field_goals,free_throw_%,free_throw_attempts,free_throws,games,games_started,height,height_inches,minutes_played,off_box_plus_minus,off_rebound_%,off_rebounds,offensive_win_shares,personal_fouls,player,player_efficiency,points,position,steal_%,steals,team,total_rebound_%,total_rebounds,true_shooting_%,turnover_%,turnovers,usage_%,value_over_replacement,weight,win_shares,win_shares_per_48min,year
0,0.279,516.0,144.0,1.0,1.0,1.0,31.0,1.0,176.0,1,"November 1, 1918",1,1.0,1.0,1.0,Indiana University,1.0,1.0,1.0,3.6,0.279,0.279,516.0,144.0,0.705,241.0,0.0,63.0,1.0,1.0,71.0,1.0,1.0,1.0,1.0,-0.1,217.0,Curly Armstrong,1.0,458.0,G-F,1.0,1.0,FTW,1.0,1.0,0.368,1.0,1.0,1.0,1.0,170.0,3.5,1.0,1950.0
1,0.372,274.0,102.0,1.0,1.0,1.0,29.0,1.0,109.0,1,"January 15, 1921",1,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,0.6,0.372,0.372,274.0,102.0,0.708,106.0,0.0,49.0,1.0,1.0,74.0,1.0,1.0,1.0,1.0,1.6,99.0,Cliff Barker,1.0,279.0,SG,1.0,1.0,INO,1.0,1.0,0.435,1.0,1.0,1.0,1.0,185.0,2.2,1.0,1950.0
2,0.349,499.0,174.0,1.0,1.0,1.0,25.0,1.0,140.0,1,"May 11, 1924",1,1.0,1.0,1.0,University of Notre Dame,1.0,1.0,1.0,2.8,0.349,0.349,499.0,174.0,0.698,129.0,0.0,67.0,1.0,1.0,76.0,1.0,1.0,1.0,1.0,0.9,192.0,Leo Barnhorst,1.0,438.0,SF,1.0,1.0,CHS,1.0,1.0,0.394,1.0,1.0,1.0,1.0,190.0,3.6,1.0,1950.0
3,0.256,86.0,22.0,1.0,1.0,1.0,24.0,1.0,20.0,1,"October 8, 1925",1,1.0,1.0,1.0,North Carolina State University,1.0,1.0,1.0,-0.1,0.256,0.256,86.0,22.0,0.559,34.0,0.0,15.0,1.0,1.0,77.0,1.0,1.0,1.0,1.0,-0.5,29.0,Ed Bartels,1.0,63.0,F,1.0,1.0,TOT,1.0,1.0,0.312,1.0,1.0,1.0,1.0,195.0,-0.6,1.0,1950.0
6,0.363,936.0,340.0,1.0,1.0,1.0,22.0,1.0,233.0,1,"December 2, 1927",1,1.0,1.0,1.0,University of Kentucky,1.0,1.0,1.0,1.2,0.363,0.363,936.0,340.0,0.762,282.0,0.0,60.0,1.0,1.0,70.0,1.0,1.0,1.0,1.0,3.6,132.0,Ralph Beard,1.0,895.0,G,1.0,1.0,INO,1.0,1.0,0.422,1.0,1.0,1.0,1.0,175.0,4.8,1.0,1950.0


**Data looks great, but we need to rearrange those columns**

In [72]:
# rearranging the columns:
final_df = final_df[["player", "position", "age", "height_inches", "weight", "team", 
                     "birth_date", "birth_city", "birth_state", "year", "college", 
                     "games", "games_started", "minutes_played", "usage_%",
                     "points", "field_goals", "field_goal_attempts", 
                     "field_goal_%", "effective_field_goal_%",
                     "2_point_tries", "2_pointers", "2_point_%", 
                     "3_point_tries", "3_pointers", "3_point_%",
                     "free_throws", "free_throw_attempts", "free_throw_%", 
                     "true_shooting_%", "assists", "assist_%", 
                     "blocks", "block_%", "steals", 
                     "steal_%", "total_rebounds", "total_rebound_%", 
                     "off_rebounds", "off_rebound_%", "def_rebounds", "def_rebound_%",
                     "turnovers", "turnover_%", "offensive_win_shares",
                     "defensive_win_shares", "win_shares", "win_shares_per_48min",
                     "personal_fouls", "player_efficiency", "off_box_plus_minus", 
                     "def_box_plus_minus", "box_plus_minus", "value_over_replacement",
                     ]]

In [73]:
final_df.head(3)

Unnamed: 0,player,position,age,height_inches,weight,team,birth_date,birth_city,birth_state,year,college,games,games_started,minutes_played,usage_%,points,field_goals,field_goal_attempts,field_goal_%,effective_field_goal_%,2_point_tries,2_pointers,2_point_%,3_point_tries,3_pointers,3_point_%,free_throws,free_throw_attempts,free_throw_%,true_shooting_%,assists,assist_%,blocks,block_%,steals,steal_%,total_rebounds,total_rebound_%,off_rebounds,off_rebound_%,def_rebounds,def_rebound_%,turnovers,turnover_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,personal_fouls,player_efficiency,off_box_plus_minus,def_box_plus_minus,box_plus_minus,value_over_replacement
0,Curly Armstrong,G-F,31.0,71.0,170.0,FTW,"November 1, 1918",1,1,1950.0,Indiana University,63.0,1.0,1.0,1.0,458.0,144.0,516.0,0.279,0.279,516.0,144.0,0.279,1.0,1.0,1.0,0.0,241.0,0.705,0.368,176.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.0,217.0,1.0,1.0,1.0,1.0,1.0
1,Cliff Barker,SG,29.0,74.0,185.0,INO,"January 15, 1921",1,1,1950.0,University of Kentucky,49.0,1.0,1.0,1.0,279.0,102.0,274.0,0.372,0.372,274.0,102.0,0.372,1.0,1.0,1.0,0.0,106.0,0.708,0.435,109.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.0,99.0,1.0,1.0,1.0,1.0,1.0
2,Leo Barnhorst,SF,25.0,76.0,190.0,CHS,"May 11, 1924",1,1,1950.0,University of Notre Dame,67.0,1.0,1.0,1.0,438.0,174.0,499.0,0.349,0.349,499.0,174.0,0.349,1.0,1.0,1.0,0.0,129.0,0.698,0.394,140.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.0,192.0,1.0,1.0,1.0,1.0,1.0


In [74]:
# renaming more columns for clarity
final_df = final_df.rename(columns={
    "weight": "weight_lbs", 
    "birth_date" : "date_of_birth",
    "year" : "season",
    "points" : "total_points",
    "field_goals" : "field_goals_made",
    "2_pointers" : "2_pointers_made",
    "2_pointers_tries" : "2_pointers_made",
    "3_pointers" : "3_pointers_made",
    "3_point_tries" : "3_pointers_tried",
    "off_plus_minus" : "value_on_offense", 
    "def_box_plus_minus" : "value_on_defense",
    "def_box_plus_minus" : "total_value", 
    "value_over_replacement" : "value_over_bench_sub"},)

final_df.head()

Unnamed: 0,player,position,age,height_inches,weight_lbs,team,date_of_birth,birth_city,birth_state,season,college,games,games_started,minutes_played,usage_%,total_points,field_goals_made,field_goal_attempts,field_goal_%,effective_field_goal_%,2_point_tries,2_pointers_made,2_point_%,3_pointers_tried,3_pointers_made,3_point_%,free_throws,free_throw_attempts,free_throw_%,true_shooting_%,assists,assist_%,blocks,block_%,steals,steal_%,total_rebounds,total_rebound_%,off_rebounds,off_rebound_%,def_rebounds,def_rebound_%,turnovers,turnover_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,personal_fouls,player_efficiency,off_box_plus_minus,total_value,box_plus_minus,value_over_bench_sub
0,Curly Armstrong,G-F,31.0,71.0,170.0,FTW,"November 1, 1918",1,1,1950.0,Indiana University,63.0,1.0,1.0,1.0,458.0,144.0,516.0,0.279,0.279,516.0,144.0,0.279,1.0,1.0,1.0,0.0,241.0,0.705,0.368,176.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.0,217.0,1.0,1.0,1.0,1.0,1.0
1,Cliff Barker,SG,29.0,74.0,185.0,INO,"January 15, 1921",1,1,1950.0,University of Kentucky,49.0,1.0,1.0,1.0,279.0,102.0,274.0,0.372,0.372,274.0,102.0,0.372,1.0,1.0,1.0,0.0,106.0,0.708,0.435,109.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.0,99.0,1.0,1.0,1.0,1.0,1.0
2,Leo Barnhorst,SF,25.0,76.0,190.0,CHS,"May 11, 1924",1,1,1950.0,University of Notre Dame,67.0,1.0,1.0,1.0,438.0,174.0,499.0,0.349,0.349,499.0,174.0,0.349,1.0,1.0,1.0,0.0,129.0,0.698,0.394,140.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.0,192.0,1.0,1.0,1.0,1.0,1.0
3,Ed Bartels,F,24.0,77.0,195.0,TOT,"October 8, 1925",1,1,1950.0,North Carolina State University,15.0,1.0,1.0,1.0,63.0,22.0,86.0,0.256,0.256,86.0,22.0,0.256,1.0,1.0,1.0,0.0,34.0,0.559,0.312,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.5,-0.1,-0.6,1.0,29.0,1.0,1.0,1.0,1.0,1.0
6,Ralph Beard,G,22.0,70.0,175.0,INO,"December 2, 1927",1,1,1950.0,University of Kentucky,60.0,1.0,1.0,1.0,895.0,340.0,936.0,0.363,0.363,936.0,340.0,0.363,1.0,1.0,1.0,0.0,282.0,0.762,0.422,233.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.6,1.2,4.8,1.0,132.0,1.0,1.0,1.0,1.0,1.0


In [77]:
# setting the df index to 'player' so we can access the info using
# the player's name

final_df = final_df.set_index("player")

final_df.head(3)

Unnamed: 0_level_0,position,age,height_inches,weight_lbs,team,date_of_birth,birth_city,birth_state,season,college,games,games_started,minutes_played,usage_%,total_points,field_goals_made,field_goal_attempts,field_goal_%,effective_field_goal_%,2_point_tries,2_pointers_made,2_point_%,3_pointers_tried,3_pointers_made,3_point_%,free_throws,free_throw_attempts,free_throw_%,true_shooting_%,assists,assist_%,blocks,block_%,steals,steal_%,total_rebounds,total_rebound_%,off_rebounds,off_rebound_%,def_rebounds,def_rebound_%,turnovers,turnover_%,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48min,personal_fouls,player_efficiency,off_box_plus_minus,total_value,box_plus_minus,value_over_bench_sub
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
Curly Armstrong,G-F,31.0,71.0,170.0,FTW,"November 1, 1918",1,1,1950.0,Indiana University,63.0,1.0,1.0,1.0,458.0,144.0,516.0,0.279,0.279,516.0,144.0,0.279,1.0,1.0,1.0,0.0,241.0,0.705,0.368,176.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.1,3.6,3.5,1.0,217.0,1.0,1.0,1.0,1.0,1.0
Cliff Barker,SG,29.0,74.0,185.0,INO,"January 15, 1921",1,1,1950.0,University of Kentucky,49.0,1.0,1.0,1.0,279.0,102.0,274.0,0.372,0.372,274.0,102.0,0.372,1.0,1.0,1.0,0.0,106.0,0.708,0.435,109.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.6,0.6,2.2,1.0,99.0,1.0,1.0,1.0,1.0,1.0
Leo Barnhorst,SF,25.0,76.0,190.0,CHS,"May 11, 1924",1,1,1950.0,University of Notre Dame,67.0,1.0,1.0,1.0,438.0,174.0,499.0,0.349,0.349,499.0,174.0,0.349,1.0,1.0,1.0,0.0,129.0,0.698,0.394,140.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,2.8,3.6,1.0,192.0,1.0,1.0,1.0,1.0,1.0


In [78]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3921 entries, Curly Armstrong to Ivica Zubac
Data columns (total 53 columns):
position                  3921 non-null object
age                       3921 non-null float64
height_inches             3921 non-null float64
weight_lbs                3921 non-null float64
team                      3921 non-null object
date_of_birth             3921 non-null object
birth_city                3921 non-null int64
birth_state               3921 non-null int64
season                    3921 non-null float64
college                   3921 non-null object
games                     3921 non-null float64
games_started             3921 non-null float64
minutes_played            3921 non-null float64
usage_%                   3921 non-null float64
total_points              3921 non-null float64
field_goals_made          3921 non-null float64
field_goal_attempts       3921 non-null float64
field_goal_%              3921 non-null float64
effective_field_goal_

**Df's are merged, columns are easy-to-read and understand, and nulls are handled**

In [None]:
final_df.shape

In [None]:
seasons_stats_df.shape

In [None]:
players_df.shape

In [None]:
player_data_df.shape

In [None]:
# Add 'birth_city' and 'birth_state' from players_df to final_df

# assign column names to variables: 
# city = players_df["birth_city"]
# state = players_df["birth_state"]

# merge these columns onto the final_df:
# final_df = final_df.join(city, state)
# final_df.head()

# generated an error stating, basically, I can't do that with these two dfs because
# they have different numbers of rows.  Good thing, too, b/c I just realized that 
# joining everything would not match the birth city and states to the correct players.

In [None]:
# Got some nulls in the last 7 columns.  Taking a look at them:
# final_df[final_df.isnull().any(axis=1)]

In [None]:
# **Deciding what to do with duplicates...**

# merging dataframes involving similar data is bound to include duplicates

# we can't just drop them because duplicate positions, teams, etc are everywhere

# will look for duplicate player names and use the remaining info to decide 
# whether or not to drop that row