# #  Scraping NBA Championship Data from Wikipedia

Calling pandas

In [239]:
# call pandas
import pandas as pd # 'pd' is a nickname

Grabbing list of NBA champions from Wikipedia

In [240]:
from IPython.display import IFrame  
wikiLink="https://en.wikipedia.org/wiki/List_of_NBA_champions" 
IFrame(wikiLink, width=700, height=300)

In [241]:
!pip install html5lib
!pip install beautifulsoup4
!pip install lxml



In [242]:
wikiTables=pd.read_html(wikiLink, # link
                        header=0, # where is the header?
                        flavor='bs4', # helper to translate html
                        attrs={'class': 'wikitable sortable'}) # attributes to identify element(s)

In [243]:
type(wikiTables)

list

In [244]:
len(wikiTables)

3

In [245]:
wikiTables[1]

Unnamed: 0,Team,Win,Loss,Apps,Pct,Year(s) won,Year(s) lost
0,Los Angeles Lakers[i],17,15,32,.531,"1949, 1950, 1952, 1953, 1954, 1972, 1980, 1982...","1959, 1962, 1963, 1965, 1966, 1968, 1969, 1970..."
1,Boston Celtics,17,5,22,.773,"1957, 1959, 1960, 1961, 1962, 1963, 1964, 1965...","1958, 1985, 1987, 2010, 2022"
2,Golden State Warriors[ii],7,5,12,.583,"1947, 1956, 1975, 2015, 2017, 2018, 2022","1948, 1964, 1967, 2016, 2019"
3,Chicago Bulls,6,0,6,1.000,"1991, 1992, 1993, 1996, 1997, 1998",—
4,San Antonio Spurs,5,1,6,.833,"1999, 2003, 2005, 2007, 2014",2013
5,Philadelphia 76ers[iii],3,6,9,.333,"1955, 1967, 1983","1950, 1954, 1977, 1980, 1982, 2001"
6,Detroit Pistons[iv],3,4,7,.429,"1989, 1990, 2004","1955, 1956, 1988, 2005"
7,Miami Heat,3,3,6,.500,"2006, 2012, 2013","2011, 2014, 2020"
8,New York Knicks,2,6,8,.250,"1970, 1973","1951, 1952, 1953, 1972, 1994, 1999"
9,Houston Rockets,2,2,4,.500,"1994, 1995","1981, 1986"


In [246]:
DFwiki=wikiTables[1].copy()

## Data Cleaning

Keeping columns as needed

In [247]:
DFwiki.columns.to_list()

['Team', 'Win', 'Loss', 'Apps', 'Pct', 'Year(s) won', 'Year(s) lost']

In [248]:
# gonna drop the years won and years lost
bye=[5,6]
DFwiki.drop(columns=DFwiki.columns[bye],inplace=True)
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,Pct
0,Los Angeles Lakers[i],17,15,32,.531
1,Boston Celtics,17,5,22,.773
2,Golden State Warriors[ii],7,5,12,.583
3,Chicago Bulls,6,0,6,1.000
4,San Antonio Spurs,5,1,6,.833
5,Philadelphia 76ers[iii],3,6,9,.333
6,Detroit Pistons[iv],3,4,7,.429
7,Miami Heat,3,3,6,.500
8,New York Knicks,2,6,8,.250
9,Houston Rockets,2,2,4,.500


Check column names, clean em up

In [249]:
DFwiki.columns.to_list()

['Team', 'Win', 'Loss', 'Apps', 'Pct']

In [250]:
DFwiki.columns.str.replace(pat="\W",# the string to replace
                           repl="", # the value to replace with
                           regex=True) # is 'pat' above a regex?
# replacing "\W" with "" gets rid of any symbols that are not A-Z or 0-9 (spaces and non-std characters)

Index(['Team', 'Win', 'Loss', 'Apps', 'Pct'], dtype='object')

In [251]:
#let's check with .to_list()
DFwiki.columns.str.replace("\W","",regex=True).to_list()

['Team', 'Win', 'Loss', 'Apps', 'Pct']

In [252]:
DFwiki.columns=DFwiki.columns.str.replace("\W","",regex=True).to_list()

In [253]:
DFwiki.iloc[:,:4]

Unnamed: 0,Team,Win,Loss,Apps
0,Los Angeles Lakers[i],17,15,32
1,Boston Celtics,17,5,22
2,Golden State Warriors[ii],7,5,12
3,Chicago Bulls,6,0,6
4,San Antonio Spurs,5,1,6
5,Philadelphia 76ers[iii],3,6,9
6,Detroit Pistons[iv],3,4,7
7,Miami Heat,3,3,6
8,New York Knicks,2,6,8
9,Houston Rockets,2,2,4


Cleaning up columns to get rid of leading trailing spaces

In [254]:
# create function for multiple use:
stripSeveral=lambda x: x.str.strip() # x is a will be a series
# lamda is just the name that python gave
# every time we use stripSeveral, we'll use the function x.str.strip()

#apply function just created
DFwiki.iloc[:,:1].apply(stripSeveral)

Unnamed: 0,Team
0,Los Angeles Lakers[i]
1,Boston Celtics
2,Golden State Warriors[ii]
3,Chicago Bulls
4,San Antonio Spurs
5,Philadelphia 76ers[iii]
6,Detroit Pistons[iv]
7,Miami Heat
8,New York Knicks
9,Houston Rockets


Actually making the changes now

In [255]:
# let's actually make the changes!
DFwiki.iloc[:,:1]=DFwiki.iloc[:,:1].apply(stripSeveral)
#overwrites dataframe

Preparing a frequency table to detect possible errors:

In [256]:
DFwiki.Team.value_counts()
# I think this looks good tbh

Atlanta Hawks[v]                                       1
Sacramento Kings[ix]                                   1
Memphis Grizzlies                                      1
Oklahoma City Thunder[vii]                             1
Los Angeles Lakers[i]                                  1
Indiana Pacers                                         1
Houston Rockets                                        1
New York Knicks                                        1
Boston Celtics                                         1
Baltimore Bullets (original) (folded in 1954)[viii]    1
Portland Trail Blazers                                 1
Dallas Mavericks                                       1
Chicago Bulls                                          1
Detroit Pistons[iv]                                    1
Chicago Stags (folded in 1950)                         1
Cleveland Cavaliers                                    1
Milwaukee Bucks                                        1
Orlando Magic                  

In [257]:
# Saving frequency table
tableCounts=DFwiki.Team.value_counts()
tableCounts

Atlanta Hawks[v]                                       1
Sacramento Kings[ix]                                   1
Memphis Grizzlies                                      1
Oklahoma City Thunder[vii]                             1
Los Angeles Lakers[i]                                  1
Indiana Pacers                                         1
Houston Rockets                                        1
New York Knicks                                        1
Boston Celtics                                         1
Baltimore Bullets (original) (folded in 1954)[viii]    1
Portland Trail Blazers                                 1
Dallas Mavericks                                       1
Chicago Bulls                                          1
Detroit Pistons[iv]                                    1
Chicago Stags (folded in 1950)                         1
Cleveland Cavaliers                                    1
Milwaukee Bucks                                        1
Orlando Magic                  

In [258]:
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,Pct
0,Los Angeles Lakers[i],17,15,32,.531
1,Boston Celtics,17,5,22,.773
2,Golden State Warriors[ii],7,5,12,.583
3,Chicago Bulls,6,0,6,1.000
4,San Antonio Spurs,5,1,6,.833
5,Philadelphia 76ers[iii],3,6,9,.333
6,Detroit Pistons[iv],3,4,7,.429
7,Miami Heat,3,3,6,.500
8,New York Knicks,2,6,8,.250
9,Houston Rockets,2,2,4,.500


Resetting the index

In [259]:
DFwiki.reset_index(drop=True, inplace=True) # drop=True: don't want the old index; inplace=True: do it now
#
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,Pct
0,Los Angeles Lakers[i],17,15,32,.531
1,Boston Celtics,17,5,22,.773
2,Golden State Warriors[ii],7,5,12,.583
3,Chicago Bulls,6,0,6,1.000
4,San Antonio Spurs,5,1,6,.833
5,Philadelphia 76ers[iii],3,6,9,.333
6,Detroit Pistons[iv],3,4,7,.429
7,Miami Heat,3,3,6,.500
8,New York Knicks,2,6,8,.250
9,Houston Rockets,2,2,4,.500


In [260]:
# need to get rid of dashes in rows 27-32
winCounts=DFwiki.Win.value_counts()
winCounts

1     9
0     7
—     6
2     3
3     3
17    2
7     1
6     1
5     1
Name: Win, dtype: int64

In [261]:
import numpy as np
DFwiki=DFwiki.replace('—', np.nan)
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,Pct
0,Los Angeles Lakers[i],17.0,15.0,32.0,0.531
1,Boston Celtics,17.0,5.0,22.0,0.773
2,Golden State Warriors[ii],7.0,5.0,12.0,0.583
3,Chicago Bulls,6.0,0.0,6.0,1.0
4,San Antonio Spurs,5.0,1.0,6.0,0.833
5,Philadelphia 76ers[iii],3.0,6.0,9.0,0.333
6,Detroit Pistons[iv],3.0,4.0,7.0,0.429
7,Miami Heat,3.0,3.0,6.0,0.5
8,New York Knicks,2.0,6.0,8.0,0.25
9,Houston Rockets,2.0,2.0,4.0,0.5


## Formating the data

In [262]:
DFwiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Team    33 non-null     object
 1   Win     27 non-null     object
 2   Loss    27 non-null     object
 3   Apps    27 non-null     object
 4   Pct     27 non-null     object
dtypes: object(5)
memory usage: 1.4+ KB


In [263]:
DFwiki.columns.str.lower()

Index(['team', 'win', 'loss', 'apps', 'pct'], dtype='object')

In [264]:
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,Pct
0,Los Angeles Lakers[i],17.0,15.0,32.0,0.531
1,Boston Celtics,17.0,5.0,22.0,0.773
2,Golden State Warriors[ii],7.0,5.0,12.0,0.583
3,Chicago Bulls,6.0,0.0,6.0,1.0
4,San Antonio Spurs,5.0,1.0,6.0,0.833
5,Philadelphia 76ers[iii],3.0,6.0,9.0,0.333
6,Detroit Pistons[iv],3.0,4.0,7.0,0.429
7,Miami Heat,3.0,3.0,6.0,0.5
8,New York Knicks,2.0,6.0,8.0,0.25
9,Houston Rockets,2.0,2.0,4.0,0.5


I'm going to rename the columns to specify these are championship stats

In [265]:
DFwiki=DFwiki.rename(columns={'Pct':'ChampPct'})
DFwiki

Unnamed: 0,Team,Win,Loss,Apps,ChampPct
0,Los Angeles Lakers[i],17.0,15.0,32.0,0.531
1,Boston Celtics,17.0,5.0,22.0,0.773
2,Golden State Warriors[ii],7.0,5.0,12.0,0.583
3,Chicago Bulls,6.0,0.0,6.0,1.0
4,San Antonio Spurs,5.0,1.0,6.0,0.833
5,Philadelphia 76ers[iii],3.0,6.0,9.0,0.333
6,Detroit Pistons[iv],3.0,4.0,7.0,0.429
7,Miami Heat,3.0,3.0,6.0,0.5
8,New York Knicks,2.0,6.0,8.0,0.25
9,Houston Rockets,2.0,2.0,4.0,0.5


In [266]:
# need to do something about these dashes for 27-32
# I think I can just make them all zeroes?

Turning wins, losses, appearances, and percents into integers

In [267]:
DFwiki['Win'] = pd.to_numeric(DFwiki['Win'])

In [268]:
DFwiki.info()
# I think this worked?
# turned wins into a float64
# gonna do this to the rest of the columns using a lambda function

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Team      33 non-null     object 
 1   Win       27 non-null     float64
 2   Loss      27 non-null     object 
 3   Apps      27 non-null     object 
 4   ChampPct  27 non-null     object 
dtypes: float64(1), object(4)
memory usage: 1.4+ KB


In [269]:
# customize function
severalToNum=lambda x:pd.to_numeric(x,errors='coerce')
#pd.to_numeric,
#apply function:

where=DFwiki.columns[1:]

DFwiki.loc[:,where]=DFwiki.loc[:,where].apply(severalToNum)

In [270]:
DFwiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Team      33 non-null     object 
 1   Win       27 non-null     float64
 2   Loss      27 non-null     float64
 3   Apps      27 non-null     float64
 4   ChampPct  27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


I think this is what we want. At this point we want to save this work locally in a file:

In [271]:
# saving as a csv and as a pickle file
DFwiki.to_csv("nbachamps.csv",index=False)
DFwiki.to_pickle("nbachamps.pkl")

## Scraping NBA Regular Season Streak Data from Wikipedia

Grabbing table of regular season wins from Wikipedia

In [272]:
from IPython.display import IFrame  
streakLink="https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_longest_winning_streaks" 
IFrame(streakLink, width=700, height=300)

In [273]:
streakTables=pd.read_html(streakLink, # link
                        header=0, # where is the header?
                        flavor='bs4', # helper to translate html
                        attrs={'class': 'wikitable sortable'}) # attributes to identify element(s)

In [274]:
streakTables[0]

Unnamed: 0,Rank,Games,Team,Season(s),Seasonrecord(s),Date,Score,Opponent,Date.1,Score.1,Opponent.1
0,Rank,Games,Team,Season(s),Seasonrecord(s),Beginning (first victory),Beginning (first victory),Beginning (first victory),End (first defeat),End (first defeat),End (first defeat)
1,1,33,Los Angeles Lakers,1971–72*,69–13,"November 5, 1971",110–106,Baltimore Bullets,"January 9, 1972",104–120,Milwaukee Bucks
2,2,28(4 + 24),Golden State Warriors^,2014–15*2015–16,67–1573–9,"April 9, 2015",116–105,Portland Trail Blazers,"December 12, 2015",95–108,Milwaukee Bucks
3,3,27,Miami Heat,2012–13*,66–16,"February 3, 2013",100–85,Toronto Raptors,"March 27, 2013",97–101,Chicago Bulls
4,4,22,Houston Rockets,2007–08,55–27,"January 29, 2008",111–107,Golden State Warriors,"March 18, 2008",74–94,Boston Celtics
5,5 (tie),20(5 + 15),Washington Capitols^,1947–481948–49,28–2038–22,"March 13, 1948",86–69,St. Louis Bombers,"December 7, 1948",78–94,Indianapolis Jets
6,5 (tie),20,Milwaukee Bucks,1970–71*,66–16,"February 6, 1971",111–85,San Francisco Warriors,"March 9, 1971",103–110,Chicago Bulls
7,7 (tie),19,Los Angeles Lakers,1999–00*,67–15,"February 4, 2000",113–67,Utah Jazz,"March 16, 2000",102–109,Washington Wizards
8,7 (tie),19,Boston Celtics,2008–09,62–20,"November 15, 2008",102–97,Milwaukee Bucks,"December 25, 2008",83–92,Los Angeles Lakers
9,7 (tie),19,San Antonio Spurs,2013–14*,62–20,"February 26, 2014",120–110,Detroit Pistons,"April 3, 2014",94–106,Oklahoma City Thunder


In [275]:
DFstreak=streakTables[0].copy()

In [276]:
DFstreak.columns.to_list()

['Rank',
 'Games',
 'Team',
 'Season(s)',
 'Seasonrecord(s)',
 'Date',
 'Score',
 'Opponent',
 'Date.1',
 'Score.1',
 'Opponent.1']

In [277]:
byeStreak=[0,3,4,5,6,7,8,9,10]
DFstreak.drop(columns=DFstreak.columns[byeStreak],inplace=True)
DFstreak

Unnamed: 0,Games,Team
0,Games,Team
1,33,Los Angeles Lakers
2,28(4 + 24),Golden State Warriors^
3,27,Miami Heat
4,22,Houston Rockets
5,20(5 + 15),Washington Capitols^
6,20,Milwaukee Bucks
7,19,Los Angeles Lakers
8,19,Boston Celtics
9,19,San Antonio Spurs


In [278]:
DFstreak.columns.to_list()

['Games', 'Team']

Getting rid of the (x + y) following certain values in the 'Games' column
These indicate win streaks that span multiple seasons

In [279]:
DFstreak['Games']=DFstreak['Games'].str.slice(stop=2).to_list()
DFstreak['Games']

0     Ga
1     33
2     28
3     27
4     22
5     20
6     20
7     19
8     19
9     19
10    19
11    18
12    18
13    18
14    18
15    18
16    18
17    18
18    17
19    17
20    17
21    17
22    17
23    17
24    17
25    16
26    16
27    16
28    16
29    16
30    16
31    16
32    16
33    16
Name: Games, dtype: object

Stripping leading and trailing spaces

In [280]:
# applying changes straight up


We know we have column names in the rows that need to be removed

In [281]:
wrong_index=[0]
DFstreak.drop(wrong_index,axis='index')

Unnamed: 0,Games,Team
1,33,Los Angeles Lakers
2,28,Golden State Warriors^
3,27,Miami Heat
4,22,Houston Rockets
5,20,Washington Capitols^
6,20,Milwaukee Bucks
7,19,Los Angeles Lakers
8,19,Boston Celtics
9,19,San Antonio Spurs
10,19,Atlanta Hawks


In [282]:
# applying changes
DFstreak=DFstreak.drop(wrong_index,axis='index')

In [283]:
DFstreak

Unnamed: 0,Games,Team
1,33,Los Angeles Lakers
2,28,Golden State Warriors^
3,27,Miami Heat
4,22,Houston Rockets
5,20,Washington Capitols^
6,20,Milwaukee Bucks
7,19,Los Angeles Lakers
8,19,Boston Celtics
9,19,San Antonio Spurs
10,19,Atlanta Hawks


So I realized that the unit of analysis for this table is in fact not the team so I'm no longer sure I can use it in my dataset.

Resetting index

In [284]:
DFstreak.reset_index(drop=True, inplace=True)
DFstreak

Unnamed: 0,Games,Team
0,33,Los Angeles Lakers
1,28,Golden State Warriors^
2,27,Miami Heat
3,22,Houston Rockets
4,20,Washington Capitols^
5,20,Milwaukee Bucks
6,19,Los Angeles Lakers
7,19,Boston Celtics
8,19,San Antonio Spurs
9,19,Atlanta Hawks


## Formating win streak data

In [285]:
DFstreak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Games   33 non-null     object
 1   Team    33 non-null     object
dtypes: object(2)
memory usage: 656.0+ bytes


Changing Games (number of games in the win streak) to an integer

In [286]:
DFstreak.Games.dtype

dtype('O')

In [287]:
DFstreak['Games'] = pd.to_numeric(DFstreak['Games'], errors='coerce')
DFstreak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Games   33 non-null     int64 
 1   Team    33 non-null     object
dtypes: int64(1), object(1)
memory usage: 656.0+ bytes


Now let's save this one:

In [288]:
# two options
DFstreak.to_csv("nbawinstreaks.csv",index=False)
DFstreak.to_pickle("nbawinstreaks.pkl")

## One more time scraping MVP data by team from Wikipedia

In [289]:
from IPython.display import IFrame  
mvpLink="https://en.wikipedia.org/wiki/NBA_Most_Valuable_Player_Award#Teams" 
IFrame(mvpLink, width=900, height=500)

In [290]:
mvpTables=pd.read_html(mvpLink, # link
                        header=0, # where is the header?
                        flavor='bs4', # helper to translate html
                        attrs={'class': 'wikitable'}) # inspected this page to find out that this is just a wikitable, not a wikitable sortable

In [291]:
mvpTables[3] 

Unnamed: 0,Awards,Teams,Years
0,10,Boston Celtics,"1957, 1958, 1961, 1962, 1963, 1965, 1973, 1984..."
1,8,Los Angeles Lakers,"1976, 1977, 1980, 1987, 1989, 1990, 2000, 2008"
2,6,Philadelphia 76ers,"1966, 1967, 1968, 1981, 1983, 2001"
3,6,Chicago Bulls,"1988, 1991, 1992, 1996, 1998, 2011"
4,5,Milwaukee Bucks,"1971, 1972, 1974, 2019, 2020"
5,4,Houston Rockets,"1979, 1982, 1994, 2018"
6,3,San Antonio Spurs,"1995, 2002, 2003"
7,3,Phoenix Suns,"1993, 2005, 2006"
8,3,Philadelphia/Golden State Warriors,"1960, 2015, 2016"
9,2,St. Louis Hawks (now Atlanta Hawks),"1956, 1959"


In [292]:
DFMvp=mvpTables[3].copy()
DFMvp

Unnamed: 0,Awards,Teams,Years
0,10,Boston Celtics,"1957, 1958, 1961, 1962, 1963, 1965, 1973, 1984..."
1,8,Los Angeles Lakers,"1976, 1977, 1980, 1987, 1989, 1990, 2000, 2008"
2,6,Philadelphia 76ers,"1966, 1967, 1968, 1981, 1983, 2001"
3,6,Chicago Bulls,"1988, 1991, 1992, 1996, 1998, 2011"
4,5,Milwaukee Bucks,"1971, 1972, 1974, 2019, 2020"
5,4,Houston Rockets,"1979, 1982, 1994, 2018"
6,3,San Antonio Spurs,"1995, 2002, 2003"
7,3,Phoenix Suns,"1993, 2005, 2006"
8,3,Philadelphia/Golden State Warriors,"1960, 2015, 2016"
9,2,St. Louis Hawks (now Atlanta Hawks),"1956, 1959"


In [293]:
# dropping years
byeMVP=[2]
DFMvp.drop(columns=DFMvp.columns[byeMVP],inplace=True)
DFMvp

Unnamed: 0,Awards,Teams
0,10,Boston Celtics
1,8,Los Angeles Lakers
2,6,Philadelphia 76ers
3,6,Chicago Bulls
4,5,Milwaukee Bucks
5,4,Houston Rockets
6,3,San Antonio Spurs
7,3,Phoenix Suns
8,3,Philadelphia/Golden State Warriors
9,2,St. Louis Hawks (now Atlanta Hawks)


In [294]:
DFMvp.reset_index(drop=True, inplace=True)

In [295]:
DFMvp.columns.to_list()

['Awards', 'Teams']

Gonna rename "Teams" to Team

In [296]:
DFMvp=DFMvp.rename(columns={'Teams':'Team'})
DFMvp

Unnamed: 0,Awards,Team
0,10,Boston Celtics
1,8,Los Angeles Lakers
2,6,Philadelphia 76ers
3,6,Chicago Bulls
4,5,Milwaukee Bucks
5,4,Houston Rockets
6,3,San Antonio Spurs
7,3,Phoenix Suns
8,3,Philadelphia/Golden State Warriors
9,2,St. Louis Hawks (now Atlanta Hawks)


In [297]:
DFMvp.columns.to_list()

['Awards', 'Team']

Turning number of awards into a numerical variable

In [298]:
DFMvp['Awards'] = pd.to_numeric(DFMvp['Awards'], errors='coerce')
DFMvp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Awards  30 non-null     int64 
 1   Team    30 non-null     object
dtypes: int64(1), object(1)
memory usage: 608.0+ bytes


Saving results as csv and as a pickle

In [299]:
# two options
DFMvp.to_csv("nba_mvps.csv",index=False)
DFMvp.to_pickle("nba_mvps.pkl")

## Scraping Overall W-L Data by Team from Wikipedia

In [300]:
from IPython.display import IFrame  
winLink="https://en.wikipedia.org/wiki/List_of_all-time_NBA_win%E2%80%93loss_records" 
IFrame(winLink, width=900, height=500)

In [301]:
import pandas as pd

winTables=pd.read_html(winLink, # link
                        header=0, # where is the header?
                        flavor='bs4', # helper to translate html
                        attrs={'class': 'wikitable sortable'}) # attributes to identify element(s)

In [302]:
DFwin=winTables[0].copy()

In [303]:
DFwin

Unnamed: 0,Rank,Team,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}GP,Won,Lost,Pct.,First NBA season,Division,Notes
0,1,San Antonio Spurs,3703,2261,1442,.611,1976–77,Southwest,
1,2,Los Angeles Lakers,5840,3460,2380,.592,1948–49,Pacific,[b]
2,3,Boston Celtics,5950,3513,2437,.590,1946–47,Atlantic,
3,4,Utah Jazz,3868,2109,1759,.545,1974–75,Northwest,[c]
4,5,Phoenix Suns,4361,2335,2026,.535,1968–69,Pacific,
5,6,Portland Trail Blazers,4198,2238,1960,.533,1970–71,Northwest,
6,7,Oklahoma City Thunder,4442,2373,2069,.534,1967–68,Northwest,[d]
7,8,Miami Heat,2721,1431,1290,.526,1988–89,Southeast,
8,9,Milwaukee Bucks,4361,2282,2079,.523,1968–69,Central,
9,10,Philadelphia 76ers,5777,3000,2777,.519,1949–50,Atlantic,[e]


In [304]:
DFwin.columns.to_list()

['Rank',
 'Team',
 '.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}GP',
 'Won',
 'Lost',
 'Pct.',
 'First NBA season',
 'Division',
 'Notes']

In [305]:
byeWin=[0,6,8]
DFwin.drop(columns=DFwin.columns[byeWin],inplace=True)
DFwin

Unnamed: 0,Team,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}GP,Won,Lost,Pct.,Division
0,San Antonio Spurs,3703,2261,1442,.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,.592,Pacific
2,Boston Celtics,5950,3513,2437,.590,Atlantic
3,Utah Jazz,3868,2109,1759,.545,Northwest
4,Phoenix Suns,4361,2335,2026,.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,.534,Northwest
7,Miami Heat,2721,1431,1290,.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,.523,Central
9,Philadelphia 76ers,5777,3000,2777,.519,Atlantic


Checking column names

In [306]:
DFwin.columns.to_list()

['Team',
 '.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}GP',
 'Won',
 'Lost',
 'Pct.',
 'Division']

Renaming columns: Indicating these are regular season stats, not championship stats
Renaming GP column [2] , which indicates total number of regular season games played

In [307]:
DFwin=DFwin.rename(columns={'.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}GP':'GP2022'})
DFwin

Unnamed: 0,Team,GP2022,Won,Lost,Pct.,Division
0,San Antonio Spurs,3703,2261,1442,.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,.592,Pacific
2,Boston Celtics,5950,3513,2437,.590,Atlantic
3,Utah Jazz,3868,2109,1759,.545,Northwest
4,Phoenix Suns,4361,2335,2026,.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,.534,Northwest
7,Miami Heat,2721,1431,1290,.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,.523,Central
9,Philadelphia 76ers,5777,3000,2777,.519,Atlantic


In [308]:
DFwin=DFwin.rename(columns={'Won':'TotalWins'})
DFwin

Unnamed: 0,Team,GP2022,TotalWins,Lost,Pct.,Division
0,San Antonio Spurs,3703,2261,1442,.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,.592,Pacific
2,Boston Celtics,5950,3513,2437,.590,Atlantic
3,Utah Jazz,3868,2109,1759,.545,Northwest
4,Phoenix Suns,4361,2335,2026,.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,.534,Northwest
7,Miami Heat,2721,1431,1290,.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,.523,Central
9,Philadelphia 76ers,5777,3000,2777,.519,Atlantic


In [309]:
DFwin=DFwin.rename(columns={'Lost':'TotalLosses'})
DFwin

Unnamed: 0,Team,GP2022,TotalWins,TotalLosses,Pct.,Division
0,San Antonio Spurs,3703,2261,1442,.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,.592,Pacific
2,Boston Celtics,5950,3513,2437,.590,Atlantic
3,Utah Jazz,3868,2109,1759,.545,Northwest
4,Phoenix Suns,4361,2335,2026,.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,.534,Northwest
7,Miami Heat,2721,1431,1290,.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,.523,Central
9,Philadelphia 76ers,5777,3000,2777,.519,Atlantic


In [310]:
DFwin=DFwin.rename(columns={'Pct.':'RegSeasonPct'})
DFwin

Unnamed: 0,Team,GP2022,TotalWins,TotalLosses,RegSeasonPct,Division
0,San Antonio Spurs,3703,2261,1442,.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,.592,Pacific
2,Boston Celtics,5950,3513,2437,.590,Atlantic
3,Utah Jazz,3868,2109,1759,.545,Northwest
4,Phoenix Suns,4361,2335,2026,.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,.534,Northwest
7,Miami Heat,2721,1431,1290,.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,.523,Central
9,Philadelphia 76ers,5777,3000,2777,.519,Atlantic


In [311]:
DFwin.iloc[:,:0]=DFwin.iloc[:,:0].apply(stripSeveral)

Have to get rid of row 30, as it captured subtext as part of the table

In [312]:
wrong_row=[30]
DFwin=DFwin.drop(wrong_row,axis='index')
DFwin

Unnamed: 0,Team,GP2022,TotalWins,TotalLosses,RegSeasonPct,Division
0,San Antonio Spurs,3703,2261,1442,0.611,Southwest
1,Los Angeles Lakers,5840,3460,2380,0.592,Pacific
2,Boston Celtics,5950,3513,2437,0.59,Atlantic
3,Utah Jazz,3868,2109,1759,0.545,Northwest
4,Phoenix Suns,4361,2335,2026,0.535,Pacific
5,Portland Trail Blazers,4198,2238,1960,0.533,Northwest
6,Oklahoma City Thunder,4442,2373,2069,0.534,Northwest
7,Miami Heat,2721,1431,1290,0.526,Southeast
8,Milwaukee Bucks,4361,2282,2079,0.523,Central
9,Philadelphia 76ers,5777,3000,2777,0.519,Atlantic


Now we're ready to format the regular season data

In [313]:
DFwin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Team          30 non-null     object
 1   GP2022        30 non-null     object
 2   TotalWins     30 non-null     object
 3   TotalLosses   30 non-null     object
 4   RegSeasonPct  30 non-null     object
 5   Division      30 non-null     object
dtypes: object(6)
memory usage: 1.6+ KB


In [314]:
# import function for the categories to be set: for division cat
from pandas.api.types import CategoricalDtype

# prepare list order of levels (in ascending order when ordinal)
Divisions=["Atlantic","Central","Northwest","Pacific","Southeast", "Southwest"]

# create custom data type
NBAdivisions = CategoricalDtype(categories=Divisions, ordered=False) #ordered=False indicates a nominal var instead of an ordinal

# set the Dtype of the column (one column):
DFwin['Division']=DFwin.Division.astype(NBAdivisions)

In [315]:
DFwin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Team          30 non-null     object  
 1   GP2022        30 non-null     object  
 2   TotalWins     30 non-null     object  
 3   TotalLosses   30 non-null     object  
 4   RegSeasonPct  30 non-null     object  
 5   Division      30 non-null     category
dtypes: category(1), object(5)
memory usage: 1.7+ KB


Now turning the other categories into numeric variables (except Team)

In [316]:
severalToNum=lambda x:pd.to_numeric(x,errors='coerce')
#pd.to_numeric,
#apply function:

where=DFwin.columns[1:4]

DFwin.loc[:,where]=DFwin.loc[:,where].apply(severalToNum)

DFwin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Team          30 non-null     object  
 1   GP2022        30 non-null     int64   
 2   TotalWins     30 non-null     int64   
 3   TotalLosses   30 non-null     int64   
 4   RegSeasonPct  30 non-null     object  
 5   Division      30 non-null     category
dtypes: category(1), int64(3), object(2)
memory usage: 1.7+ KB


didn't capture RegSeasonPct because it's not an integer; have to turn into a float

In [317]:
DFwin.RegSeasonPct=pd.to_numeric(DFwin.RegSeasonPct,errors='coerce')

In [318]:
DFwin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Team          30 non-null     object  
 1   GP2022        30 non-null     int64   
 2   TotalWins     30 non-null     int64   
 3   TotalLosses   30 non-null     int64   
 4   RegSeasonPct  30 non-null     float64 
 5   Division      30 non-null     category
dtypes: category(1), float64(1), int64(3), object(1)
memory usage: 1.7+ KB


We like this. Let's save these data.

In [319]:
DFwin.to_csv("regseason.csv",index=False)
DFwin.to_pickle("regseason.pkl")