In [1]:
# Import pandas under its conventional alias and show the installed version.
import pandas as pd
pd.__version__

'1.4.2'

## Methods and Attributes between Series and DataFrames

In [6]:
# Read the NBA roster CSV into a DataFrame and display it as the cell output.
nba = pd.read_csv("nba.csv")
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [20]:
# Build one DataFrame and one small Series so the same members can be
# tried on both kinds of object.
nba = pd.read_csv("nba.csv")
s = pd.Series([1, 2, 3, 4, 5])
nba

# Leading rows: no argument, positional argument, keyword argument.
nba.head()
nba.head(7)
nba.head(n=1)

# Trailing rows, using the same three call styles.
nba.tail()
nba.tail(8)
nba.tail(n=1)

# Index labels.
s.index
nba.index

# Values held by each object.
s.values
nba.values

# Dimensions.
s.shape
nba.shape

# Data types: a Series exposes both spellings, a DataFrame only .dtypes.
s.dtype
s.dtypes
nba.dtypes
nba.dtypes.value_counts()

# DataFrame has no 'hasnans' attribute
s.hasnans
# nba.hasnans

# Series has no 'columns' attribute
# s.columns
nba.columns

# Axes of each object.
s.axes
nba.axes

# nba.info()

# Summary of the Series; as the last line, its printout is the cell's output.
s.info()

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

<class 'pandas.core.series.Series'>
RangeIndex: 3 entries, 0 to 2
Series name: None
Non-Null Count  Dtype
--------------  -----
3 non-null      int64
dtypes: int64(1)
memory usage: 152.0 bytes


## Differences between Shared Methods
- Some methods shared between `Series` and `DataFrame` function differently.
- On a `Series`, the `.sum` method will return the sum of all values.
- On a `DataFrame`, the `.sum()` method will default to a `Series` with the sums of each column
- The `axis` argument can be altered to modify the operations.
- The below commands will not work on a `Series`
- KEY TAKEAWAY: Parameters can affect how a method behaves on different objects in `pandas`

In [2]:
# Load the revenue data, using the "Date" column as the row index.
rev = pd.read_csv("revenue.csv", index_col="Date")
rev.head(3)

# On a Series, .sum() adds up all of the values.
s = pd.Series([1, 2, 3, 4, 5])
s.sum()

# On a DataFrame, the default call and the axis=0 / axis="index" spellings
# all produce one sum per column.
rev.sum()
rev.sum(axis=0)
rev.sum(axis="index")

# axis=1 / axis="columns" sums across the columns instead, giving one total
# per row (per Date).
rev.sum(axis=1)
rev.sum(axis="columns")

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

## Select One Column from a `DataFrame`
- Use dot syntax ( `.` ) or bracket syntax ( `[]` ) to select a single `Series` from a `DataFrame`.
- A `Series` object is returned when a single column is selected from a `DataFrame`
- Thus, regular `Series` methods can be called on the resulting object

In [3]:
# Reload the roster so this cell does not depend on earlier cells.
nba = pd.read_csv("nba.csv")

# Dot syntax: the column label is used as an attribute name.
nba.Name
type(nba.Name)

nba.Number
nba.Salary
# nba.salary  (left disabled; note the different capitalisation from "Salary")

# Bracket syntax: the column label is passed as a string.
nba["Name"]
type(nba["Name"])
nba["Number"]
nba["Salary"]

# The selected column supports Series methods such as head and tail.
nba["Name"].head(5)
nba["Name"].tail(3)

455    Tibor Pleiss
456     Jeff Withey
457             NaN
Name: Name, dtype: object

## Select Two or More Columns from a `DataFrame`
- Pass a list of column names to the square brackets after the `DataFrame`.

In [4]:
# Reload the roster so this cell does not depend on earlier cells.
nba = pd.read_csv("nba.csv")
nba.head(3)

# Passing a list of labels selects several columns, in the order listed.
nba[["Name", "Team"]]
nba[["Team", "Name"]] # Different order
# nba[["Team", "name"]].head(3) # All columns must exist
nba[["Number", "College"]].tail(3)
nba[["Salary", "Team", "Name"]].head(4)

# The list of labels can also be stored in a variable first.
select = ["Salary", "Team", "Name"]
nba[select].head(4)

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland
3,1148640.0,Boston Celtics,R.J. Hunter


## Add New Column

- Create a new column by referencing it with bracket syntax, then using the = sign to assign its values.
- Bracket syntax is thus used to reference AND to create.
- A single value is called a **scalar value**.
- The `.insert` method can be used to insert values into a specific column position
- The `.assign` method can be used to insert multiple columns
- The method takes an unlimited number of keyword arguments
- The "parameter" will be the name of the new column. No need for quotes, cannot have spaces.
- The argument will be what the column values will be.

In [5]:
# Reload the roster so this cell does not depend on earlier cells.
nba = pd.read_csv("nba.csv")
nba.tail(3)

# nba["Sport"] # Error - column does not exist
# Assigning a scalar to a new label creates the column with that value in every row.
nba["Sport"] = "Basketball" # Careful, you may overwrite
# nba.Sport = "Basketball" -- will not work
nba["League"] = "National Basketball Association"


In [6]:
# Start from a fresh copy of the roster.
nba = pd.read_csv("nba.csv")

# Place each new scalar-valued column at an explicit position.
nba.insert(loc=3, column="Sport", value="Basketball")
nba.insert(loc=7, column="League", value="National Basketball Association")

# Show the first row to check where the new columns landed.
nba.head(1)

Unnamed: 0,Name,Team,Number,Sport,Position,Age,Height,League,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,National Basketball Association,180.0,Texas,7730337.0


## Broadcasting Operations
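- The `add`, `sub`, `mul`, and `div` methods apply an arithmetic operation to every value in a `Series`; they are the method equivalents of the `+`, `-`, `*`, and `/` operators.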

In [7]:
# Reload the roster so this cell does not depend on earlier cells.
nba = pd.read_csv("nba.csv")
nba.tail(3)

# Addition: operator form, method form, then store the result as a new column.
nba["Age"] + 10
nba["Age"].add(10)
nba["Age in a Decade"] = nba["Age"] + 10

# Subtraction, shown the same way.
nba["Salary"] - 5000000
nba["Salary"].sub(5000000)
nba["New Salary"] = nba["Salary"].sub(5000000)

# Multiplication by 0.453592, stored as "Weight in Kilograms".
nba["Weight"] * 0.453592
nba["Weight"].mul(0.453592)
nba["Weight in Kilograms"] = nba["Weight"] * 0.453592

nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Age in a Decade,New Salary,Weight in Kilograms
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,35.0,2730337.0,81.64656
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0,1796117.0,106.59412
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,37.0,,92.98636
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0,-3851360.0,83.91452
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,39.0,0.0,104.779752


## REVIEW - The `value_counts` Method
- The `value_counts()` method is called on a `Series` object. It returns a frequency distribution.
- A frequency distribution includes each unique value in the `Series` and the number of times it appears.
- The returned object is a new `Series` where the values are the counts
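- To get counts across multiple columns, call `value_counts()` on a `DataFrame` selection of those columns, e.g. `players[["Team", "Position"]].value_counts()`, which counts each unique combination of values.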

In [8]:
# Load the roster under a separate name for this review section.
players = pd.read_csv("nba.csv")
# Frequency of each value in the selected column.
players["Team"].value_counts()
players["Position"].value_counts()
# Keep only the first entry of the Salary counts.
players["Salary"].value_counts().head(1)

947276.0    31
Name: Salary, dtype: int64

## Drop Rows with Null Values
- The `.dropna()` method deletes `DataFrame` rows with null (`NaN`) values
- The default arguments delete a row if **any** of the cells contain null values.
- Provide the argument **all** to the `how` parameter to delete only the rows in which every cell is null
- This tutorial requires **numpy** to generate null values

In [9]:
# Reload the roster so this cell does not depend on earlier cells.
nba = pd.read_csv("nba.csv")
nba.tail(3)

nba.dropna()                         # Delete rows that have any null values
nba.dropna(how = "any").tail(3)      # Same as above, with how = "any" passed explicitly

nba.dropna(how = "all")              # Delete rows with all null values

nba.dropna(subset = ["College"]).tail(3) # Delete rows that have a null value in the College column
nba.dropna(subset = ["College", "Salary"]).tail(3) # Delete rows that have a null value in the College or Salary column

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


## Fill in Null Values
- The `DataFrame` includes `.fillna()` method but it will work across all cells.
- Let's replace the null values in the **Salary** `Series` with 0 and the null values in the **College** `Series` with the string "Unknown".

In [10]:
# Start from a fresh copy of the roster.
nba = pd.read_csv("nba.csv")

# Fill every missing value with 0. The result is only displayed, not assigned,
# so nba itself is left unchanged by this line.
nba.fillna(0)

# Fill in the College missing values with "Unknown" and make the operation
# permanent by assigning the result back to the column.
# Do not use nba["College"].fillna("Unknown", inplace = True): that calls an
# in-place method on a column selection (chained assignment), which newer
# pandas versions warn about and which no longer updates nba under
# Copy-on-Write.
nba["College"] = nba["College"].fillna("Unknown")

# Fill in the Salary missing values with 0, made permanent the same way.
nba["Salary"] = nba["Salary"].fillna(0)

nba.head(6)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,Unknown,12000000.0


## The `astype` Method

In [11]:
# Load the roster and drop the rows in which every cell is null.
nba = pd.read_csv("nba.csv").dropna(how="all")

# Age: check for nulls, preview the int conversion with both spellings of the
# target type, then store it.
nba["Age"].hasnans
nba["Age"].astype("int")
nba["Age"].astype(int)
nba["Age"] = nba["Age"].astype("int")

nba["Salary"].hasnans # True
# nba["Salary"].astype(int)  # would raise here: NaN cannot be converted to an integer
# Replace the missing salaries first so the integer conversion below can succeed.
nba["Salary"] = nba["Salary"].fillna(0)

# With the nulls filled, the conversion is safe to preview.
nba["Salary"].astype(int)

# nba["Salary"].astype("int")
# nba["Salary"].astype(int)
nba["Salary"] = nba["Salary"].astype("int")
#  nba["Salary"].astype("float")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    float64 
 3   Position  457 non-null    category
 4   Age       457 non-null    category
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   373 non-null    object  
 8   Salary    457 non-null    int64   
dtypes: category(3), float64(2), int64(1), object(3)
memory usage: 28.5+ KB


## The astype Method II

In [18]:
# NOTE(review): this cell does not load nba itself; it relies on the nba
# DataFrame left in the kernel by an earlier cell.
# nba.dtypes
# nba.info()

# Compare the row count with the number of distinct values in each column
# before converting that column to the "category" dtype.
len(nba)
nba["Position"].nunique() # 5
nba["Position"] = nba["Position"].astype("category")
nba["Team"].nunique()
nba["Team"] = nba["Team"].astype("category")
# Print the summary so the resulting dtypes and memory usage can be inspected.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Name         457 non-null    object  
 1   Team         457 non-null    category
 2   Number       457 non-null    float64 
 3   Position     457 non-null    category
 4   Age          457 non-null    category
 5   Height       457 non-null    object  
 6   Weight       457 non-null    float64 
 7   College      373 non-null    object  
 8   Salary       457 non-null    int64   
 9   Salary Rank  457 non-null    int64   
dtypes: category(3), float64(2), int64(2), object(3)
memory usage: 32.1+ KB


## Sort a DataFrame with the sort_values Method I

In [22]:
# Start from a fresh copy of the roster.
nba = pd.read_csv("nba.csv")
nba.head(4)

# Sort by a text column: default call, explicit ascending, then descending.
nba.sort_values("Name")
nba.sort_values("Name", ascending=True)
nba.sort_values("Name", ascending=False)

# Sort by a numeric column and keep the first three rows of each ordering.
nba.sort_values("Age").head(3)
nba.sort_values("Age", ascending=False).head(3)

# Further variations, left disabled:
# nba.sort_values("Salary").head(3)
# nba.sort_values("Salary", ascending = False).head(3)
# nba.sort_values("Team", ascending = True, inplace = True)
# nba.sort_values("Salary", ascending = False, na_position = "first").head(5)
# nba.sort_values("Salary", ascending = False, na_position = "last").tail(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
237,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0
271,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
402,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
270,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,
386,Wilson Chandler,Denver Nuggets,21.0,SF,29.0,6-8,225.0,DePaul,10449438.0
25,Willie Reed,Brooklyn Nets,33.0,PF,26.0,6-10,220.0,Saint Louis,947276.0
141,Willie Cauley-Stein,Sacramento Kings,0.0,C,22.0,7-0,240.0,Kentucky,3398280.0
385,Will Barton,Denver Nuggets,5.0,SF,25.0,6-6,175.0,Memphis,3533333.0
233,Wesley Matthews,Dallas Mavericks,23.0,SG,29.0,6-5,220.0,Marquette,16407500.0
97,Wesley Johnson,Los Angeles Clippers,33.0,SF,28.0,6-7,215.0,Syracuse,1100602.0


## Sort a `DataFrame` with the `.sort_values()` Method, Part II

In [13]:
# Start from a fresh copy of the roster.
nba = pd.read_csv("nba.csv")
nba.head(3)

# Sort by several columns; the list order gives the sort priority.
nba.sort_values(["Team", "Name"]).head(3)

# A list passed to ascending sets the direction for each column separately.
nba.sort_values(["Team", "Name"], ascending=[True, False]).head(3)
nba.sort_values(["Team", "Salary"], ascending=[False, False]).head(3)

# A single boolean applies the same direction to every sort column.
nba.sort_values(["Position", "Salary"], ascending=False).head(3)

# Apply the last ordering to nba itself, then show the reordered frame.
nba.sort_values(["Position", "Salary"], ascending=False, inplace=True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
349,Dwyane Wade,Miami Heat,3.0,SG,34.0,6-4,220.0,Marquette,20000000.0
153,Jimmy Butler,Chicago Bulls,21.0,SG,26.0,6-7,220.0,Marquette,16407500.0
233,Wesley Matthews,Dallas Mavericks,23.0,SG,29.0,6-5,220.0,Marquette,16407500.0


## Sort Index

In [24]:
# Start from a fresh copy of the roster.
nba = pd.read_csv("nba.csv")
nba.head(3)

# Reorder the rows by value so the index labels are no longer in order.
nba = nba.sort_values(["Team", "Name"])
nba.head(3)

# Order by index label: default call, explicit ascending, then descending.
nba.sort_index().head(3)
nba.sort_index(ascending=True).head(3)
nba.sort_index(ascending=False).head(3)

# Two ways to keep the result: sort in place, or assign the sorted frame back.
nba.sort_index(ascending=False, inplace=True)
nba = nba.sort_index(ascending=True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


## Rank Values with the `.rank` Method

In [30]:
# Load the roster, drop the rows in which every cell is null, and turn Salary
# into integers (missing salaries become 0).
nba = pd.read_csv("nba.csv").dropna(how="all")
nba["Salary"] = nba["Salary"].fillna(0).astype("int")
nba.head(3)

# Rank the salaries: default call, explicit ascending, then descending.
nba["Salary"].rank()
nba["Salary"].rank(ascending=True) # default
nba["Salary"].rank(ascending=False)

# Cast the descending ranks to integers and store them as a new column.
nba["Salary"].rank(ascending=False).astype("int")
nba["Salary Rank"] = nba["Salary"].rank(ascending=False).astype("int")
nba.head(3)

# Sort by the **Salary** `Series` to confirm that the `rank()` method worked.
nba.sort_values("Salary", ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
...,...,...,...,...,...,...,...,...,...,...
353,Dorell Wright,Miami Heat,11.0,SF,30.0,6-9,205.0,,0,452
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,0,452
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,0,452
273,Alex Stepheson,Memphis Grizzlies,35.0,PF,28.0,6-10,270.0,USC,0,452
