In [64]:
import pandas as pd

### Pandas Series

In [65]:
# pd.Series(data, index)
groceries = pd.Series(data = [30, 6, 'Yes', 'No'], index = ['eggs', 'apples', 'milk', 'bread'])
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [66]:
groceries.shape

(4,)

In [67]:
groceries.ndim

1

In [68]:
groceries.size

4

In [69]:
groceries.values

array([30, 6, 'Yes', 'No'], dtype=object)

In [70]:
groceries.index

Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')

In [71]:
x = 'bananas' in groceries
x

False

In [72]:
y = 'bread' in groceries
y

True

In [73]:
groceries['eggs']

30

In [74]:
groceries[['milk', 'bread']]

milk     Yes
bread     No
dtype: object

In [75]:
groceries.loc[['eggs', 'apples']]

eggs      30
apples     6
dtype: object

In [76]:
 groceries[[0, 1]]

  groceries[[0, 1]]


eggs      30
apples     6
dtype: object

In [77]:
groceries[[-1]]

  groceries[[-1]]


bread    No
dtype: object

In [78]:
groceries[0]

  groceries[0]


30

In [79]:
groceries.iloc[[2, 3]]

milk     Yes
bread     No
dtype: object

In [80]:
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [81]:
groceries['eggs'] = 2

In [82]:
groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [83]:
# We can also delete items from a Pandas Series by using the .drop() method
groceries.drop('apples')

eggs       2
milk     Yes
bread     No
dtype: object

In [84]:
groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [85]:
groceries.drop('apples', inplace = True)

In [86]:
groceries

eggs       2
milk     Yes
bread     No
dtype: object

In [87]:
# https://pandas.pydata.org/pandas-docs/stable/reference/series.html#reindexing-selection-label-manipulation

In [88]:
fruits= pd.Series(data = [10, 6, 3,], index = ['apples', 'oranges', 'bananas'])
fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [89]:
fruits + 2

apples     12
oranges     8
bananas     5
dtype: int64

In [90]:
fruits - 2

apples     8
oranges    4
bananas    1
dtype: int64

In [91]:
fruits*  2

apples     20
oranges    12
bananas     6
dtype: int64

In [92]:
fruits / 2

apples     5.0
oranges    3.0
bananas    1.5
dtype: float64

In [93]:
import numpy as np

In [94]:
np.exp(fruits)

apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

In [95]:
np.sqrt(fruits)

apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

In [96]:
np.power(fruits,2)

apples     100
oranges     36
bananas      9
dtype: int64

In [97]:
x = fruits['bananas'] + 2
x

5

In [98]:
fruits.iloc[0] - 2

8

In [99]:
fruits[['apples', 'oranges']] * 2

apples     20
oranges    12
dtype: int64

In [100]:
fruits.loc[['apples', 'oranges']] / 2

apples     5.0
oranges    3.0
dtype: float64

In [101]:
groceries * 2

eggs          4
milk     YesYes
bread      NoNo
dtype: object

In [102]:
# https://pandas.pydata.org/pandas-docs/stable/reference/series.html#indexing-iteration
# https://pandas.pydata.org/pandas-docs/stable/reference/series.html#reindexing-selection-label-manipulation

### Data Frames

In [103]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}


In [104]:
shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [105]:
data = {'Bob' : pd.Series([245, 25, 55]),
        'Alice' : pd.Series([40, 110, 500, 45])}

df = pd.DataFrame(data)
df

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [106]:
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [107]:
shopping_carts.shape

(5, 2)

In [108]:
shopping_carts.ndim

2

In [109]:
shopping_carts.size

10

In [110]:
shopping_carts.values

array([[245., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 25.,  45.],
       [ 55.,  nan]])

In [111]:
shopping_carts.index

Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

In [112]:
shopping_carts.columns

Index(['Bob', 'Alice'], dtype='object')

In [113]:
items

{'Bob': bike     245
 pants     25
 watch     55
 dtype: int64,
 'Alice': book        40
 glasses    110
 bike       500
 pants       45
 dtype: int64}

In [114]:
bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])
bob_shopping_cart

Unnamed: 0,Bob
bike,245
pants,25
watch,55


In [115]:
sel_shopping_cart = pd.DataFrame(items, index = ['pants', 'book'])
sel_shopping_cart

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [116]:
alice_sel_shopping_cart = pd.DataFrame(items, index = ['glasses', 'bike'], columns = ['Alice'])
alice_sel_shopping_cart

Unnamed: 0,Alice
glasses,110
bike,500


In [117]:
data = {'Integers' : [1,2,3],
        'Floats' : [4.5, 8.2, 9.6]}
df = pd.DataFrame(data)
df

Unnamed: 0,Integers,Floats
0,1,4.5
1,2,8.2
2,3,9.6


In [118]:
data = {'Integers' : [1,2,3],
        'Floats' : [4.5, 8.2, 9.6]}
df = pd.DataFrame(data, index = ['label 1', 'label 2', 'label 3'])
df

Unnamed: 0,Integers,Floats
label 1,1,4.5
label 2,2,8.2
label 3,3,9.6


In [119]:
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35},
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]
store_items = pd.DataFrame(items2)
store_items

Unnamed: 0,bikes,pants,watches,glasses
0,20,30,35,
1,15,5,10,50.0


In [120]:
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35},
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2'])
store_items

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0


In [121]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#intro-to-data-structures
# https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#dataframe

In [122]:
# Access elements using labels
store_items[['bikes']]

Unnamed: 0,bikes
store 1,20
store 2,15


In [123]:
store_items[['bikes', 'pants']]

Unnamed: 0,bikes,pants
store 1,20,30
store 2,15,5


In [126]:
x = store_items.loc[['store 1']]
x

Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,


In [127]:
x = store_items['bikes']['store 2']
x

15

In [128]:
# Add a column to an existing DataFrame
store_items['shirts'] = [15,2]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts
store 1,20,30,35,,15
store 2,15,5,10,50.0,2


In [129]:
store_items['suits'] = store_items['pants'] + store_items['shirts']
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15,45
store 2,15,5,10,50.0,2,7


In [130]:
# Create a row to be added to the DataFrame
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]
new_store = pd.DataFrame(new_items, index = ['store 3'])
new_store

Unnamed: 0,bikes,pants,watches,glasses
store 3,20,30,35,4


In [131]:
store_items = pd.concat([store_items, new_store])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15.0,45.0
store 2,15,5,10,50.0,2.0,7.0
store 3,20,30,35,4.0,,


In [134]:
# Add new column that has data from the existing columns
store_items['new watches'] = store_items['watches'][1:]
store_items

Unnamed: 0,bikes,pants,watches,glasses,shirts,suits,new watches
store 1,20,30,35,,15.0,45.0,
store 2,15,5,10,50.0,2.0,7.0,10.0
store 3,20,30,35,4.0,,,35.0


In [135]:
# Add new column at a specific location
store_items.insert(4, 'shoes', [8,5,0])
store_items

Unnamed: 0,bikes,pants,watches,glasses,shoes,shirts,suits,new watches
store 1,20,30,35,,8,15.0,45.0,
store 2,15,5,10,50.0,5,2.0,7.0,10.0
store 3,20,30,35,4.0,0,,,35.0


In [137]:
# Delete one column from a DataFrame
store_items.pop('pants')
store_items

Unnamed: 0,bikes,watches,glasses,shoes,shirts,suits
store 1,20,35,,8,15.0,45.0
store 2,15,10,50.0,5,2.0,7.0
store 3,20,35,4.0,0,,


In [138]:
# Delete multiple columns from a DataFrame
store_items = store_items.drop(['watches', 'shoes'], axis = 1)
store_items

Unnamed: 0,bikes,glasses,shirts,suits
store 1,20,,15.0,45.0
store 2,15,50.0,2.0,7.0
store 3,20,4.0,,


In [139]:
# Delete rows from a DataFrame
store_items = store_items.drop(['store 2', 'store 1'], axis = 0)
store_items

Unnamed: 0,bikes,glasses,shirts,suits
store 3,20,4.0,,


In [140]:
store_items = store_items.rename(columns = {'bikes': 'hats'})
store_items

Unnamed: 0,hats,glasses,shirts,suits
store 3,20,4.0,,


In [141]:
store_items = store_items.rename(index = {'store 3': 'last store'})
store_items

Unnamed: 0,hats,glasses,shirts,suits
last store,20,4.0,,


In [143]:
store_items = store_items.set_index('hats')
store_items

Unnamed: 0_level_0,glasses,shirts,suits
hats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,4.0,,


### Dealing with NaN

In [144]:
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2', 'store 3'])
store_items

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [145]:
# Count the total NaN values
x =  store_items.isnull().sum().sum()
x

3

In [146]:
# Count the total non-NaN values
y = store_items.count()
y

bikes      3
pants      3
watches    3
shirts     2
shoes      3
suits      2
glasses    2
dtype: int64

In [147]:
# he .dropna(axis) method eliminates any rows with NaN values when axis = 0 is used and will eliminate any columns with NaN values when axis = 1 is used

In [148]:
# Tip: Remember, you learned that you can read axis = 0 as "down" and axis = 1 as "across" the given Numpy ndarray or Pandas dataframe object

In [151]:
# Drop rows having NaN values
store_items.dropna(axis = 0, inplace = False)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 2,15,5,10,2.0,5,7.0,50.0


In [152]:
# Drop columns having NaN values
store_items.dropna(axis = 1, inplace = False)

Unnamed: 0,bikes,pants,watches,shoes
store 1,20,30,35,8
store 2,15,5,10,5
store 3,20,30,35,10


### Substituting NaN Values

In [154]:
store_items.fillna(0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,0.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,0.0,10,0.0,4.0


In [155]:
store_items.fillna(method = 'ffill', axis = 0)

  store_items.fillna(method = 'ffill', axis = 0)


Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [156]:
store_items.fillna(method = 'ffill', axis = 1)

  store_items.fillna(method = 'ffill', axis = 1)


Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,35.0,10.0,10.0,4.0


In [157]:
store_items.fillna(method = 'backfill', axis = 0)

  store_items.fillna(method = 'backfill', axis = 0)


Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,50.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [159]:
store_items.fillna(method = 'backfill', axis = 0, inplace = False)

  store_items.fillna(method = 'backfill', axis = 0, inplace = False)


Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,50.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [160]:
store_items.interpolate(method = 'linear', axis = 0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [161]:
store_items.interpolate(method = 'linear', axis = 1)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,22.5,10.0,7.0,4.0


### Loading Data into a pandas DataFrame

In [165]:
google_stock = pd.read_csv('../data/goog.csv')
google_stock.shape

(3313, 7)

In [166]:
google_stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.805050,53.805050,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400
...,...,...,...,...,...,...,...
3308,2017-10-09,980.000000,985.424988,976.109985,977.000000,977.000000,891400
3309,2017-10-10,980.000000,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.250000,989.250000,989.250000,1693300
3311,2017-10-12,987.450012,994.119995,985.000000,987.830017,987.830017,1262400


In [167]:
google_stock.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.80505,53.80505,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200


In [168]:
google_stock.tail(4)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
3309,2017-10-10,980.0,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.25,989.25,989.25,1693300
3311,2017-10-12,987.450012,994.119995,985.0,987.830017,987.830017,1262400
3312,2017-10-13,992.0,997.210022,989.0,989.679993,989.679993,1157700


In [169]:
google_stock.isnull().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [170]:
google_stock.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,380.186092,383.49374,376.519309,380.072458,380.072458,8038476.0
std,223.81865,224.974534,222.473232,223.85378,223.85378,8399521.0
min,49.274517,50.541279,47.669952,49.681866,49.681866,7900.0
25%,226.556473,228.394516,224.003082,226.40744,226.40744,2584900.0
50%,293.312286,295.433502,289.929291,293.029114,293.029114,5281300.0
75%,536.650024,540.0,532.409973,536.690002,536.690002,10653700.0
max,992.0,997.210022,989.0,989.679993,989.679993,82768100.0


In [171]:
google_stock['Adj Close'].describe()

count    3313.000000
mean      380.072458
std       223.853780
min        49.681866
25%       226.407440
50%       293.029114
75%       536.690002
max       989.679993
Name: Adj Close, dtype: float64

In [172]:
google_stock.max()

Date         2017-10-13
Open              992.0
High         997.210022
Low               989.0
Close        989.679993
Adj Close    989.679993
Volume         82768100
dtype: object

In [173]:
google_stock['Close'].min()

49.681866

In [181]:
google_stock.pop('Date')
google_stock.mean()

Open         3.801861e+02
High         3.834937e+02
Low          3.765193e+02
Close        3.800725e+02
Adj Close    3.800725e+02
Volume       8.038476e+06
dtype: float64

In [179]:

google_stock.corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
Open,1.0,0.999904,0.999845,0.999745,0.999745,-0.564258
High,0.999904,1.0,0.999834,0.999868,0.999868,-0.562749
Low,0.999845,0.999834,1.0,0.999899,0.999899,-0.567007
Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Adj Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Volume,-0.564258,-0.562749,-0.567007,-0.564967,-0.564967,1.0


In [182]:
google_stock

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,50.178635,54.187561,49.925285,53.805050,53.805050,23005800
2,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,52.140873,53.651051,51.604362,52.657513,52.657513,9257400
...,...,...,...,...,...,...
3308,980.000000,985.424988,976.109985,977.000000,977.000000,891400
3309,980.000000,981.570007,966.080017,972.599976,972.599976,968400
3310,973.719971,990.710022,972.250000,989.250000,989.250000,1693300
3311,987.450012,994.119995,985.000000,987.830017,987.830017,1262400


In [183]:
data = pd.read_csv('../data/fake_company.csv')
data

Unnamed: 0,Year,Name,Department,Age,Salary
0,1990,Alice,HR,25,50000
1,1990,Bob,RD,30,48000
2,1990,Charlie,Admin,45,55000
3,1991,Dakota,HR,26,52000
4,1991,Elsa,RD,31,50000
5,1991,Frank,Admin,46,60000
6,1992,Grace,Admin,27,60000
7,1992,Hoffman,RD,32,52000
8,1992,Inaar,Admin,28,62000


In [184]:
data.groupby(['Year'])['Salary'].sum()

Year
1990    153000
1991    162000
1992    174000
Name: Salary, dtype: int64

In [185]:
data.groupby(['Year'])['Salary'].mean()

Year
1990    51000.0
1991    54000.0
1992    58000.0
Name: Salary, dtype: float64

In [186]:
data.groupby(['Name'])['Salary'].sum()

Name
Alice      50000
Bob        48000
Charlie    55000
Dakota     52000
Elsa       50000
Frank      60000
Grace      60000
Hoffman    52000
Inaar      62000
Name: Salary, dtype: int64

In [188]:
data.groupby(['Year', 'Department'])['Salary'].sum()

Year  Department
1990  Admin          55000
      HR             50000
      RD             48000
1991  Admin          60000
      HR             52000
      RD             50000
1992  Admin         122000
      RD             52000
Name: Salary, dtype: int64

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [None]:
# https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf