### 7.1 Handling Missing Data

In [1]:
import pandas as pd
import numpy as np
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [2]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [5]:
string_data.dropna()

1    artichoke
3      avocado
dtype: object

In [6]:
string_data.fillna('YHATZEE')

0      YHATZEE
1    artichoke
2      YHATZEE
3      avocado
dtype: object

In [7]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [8]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [9]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [15]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,1.367243,-1.16816,0.018488
1,0.306206,-1.082106,0.140235
2,-0.744698,1.135118,-2.613327
3,0.030054,0.24691,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [17]:
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,1.367243,,0.018488
1,0.306206,,0.140235
2,-0.744698,,-2.613327
3,0.030054,,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [18]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,1.367243,,
1,0.306206,,
2,-0.744698,,-2.613327
3,0.030054,,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [20]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.744698,,-2.613327
3,0.030054,,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [21]:
df.dropna(thresh=3, axis = 1) # Expected it drop column 1

Unnamed: 0,0,1,2
0,1.367243,,
1,0.306206,,
2,-0.744698,,-2.613327
3,0.030054,,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [22]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.367243,0.0,0.0
1,0.306206,0.0,0.0
2,-0.744698,0.0,-2.613327
3,0.030054,0.0,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [23]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,1.367243,0.5,0.0
1,0.306206,0.5,0.0
2,-0.744698,0.5,-2.613327
3,0.030054,0.5,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [24]:
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,1.367243,0.0,0.0
1,0.306206,0.0,0.0
2,-0.744698,0.0,-2.613327
3,0.030054,0.0,-0.831251
4,1.371924,0.075609,-0.529438
5,2.332816,-0.108371,-0.033417
6,-0.829454,0.97814,1.165417


In [25]:
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,0.072455,-1.493539
3,-0.36385,1.052333,0.531574
4,2.472778,-0.413855,1.494256
5,-1.248623,-0.58716,0.211174


In [26]:
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,,-1.493539
3,-0.36385,,0.531574
4,2.472778,,1.494256
5,-1.248623,,0.211174


In [27]:
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,,-1.493539
3,-0.36385,,0.531574
4,2.472778,,
5,-1.248623,,


In [28]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,-1.661954,-1.493539
3,-0.36385,-1.661954,0.531574
4,2.472778,-1.661954,0.531574
5,-1.248623,-1.661954,0.531574


In [29]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,-1.661954,-1.493539
3,-0.36385,-1.661954,0.531574
4,2.472778,,0.531574
5,-1.248623,,0.531574


In [30]:
data = pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [31]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [32]:
df

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,,-1.493539
3,-0.36385,,0.531574
4,2.472778,,
5,-1.248623,,


In [33]:
df.iloc[5,1] = 7.5
df

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,,-1.493539
3,-0.36385,,0.531574
4,2.472778,,
5,-1.248623,7.5,


In [34]:
df.bfill()

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,7.5,-1.493539
3,-0.36385,7.5,0.531574
4,2.472778,7.5,
5,-1.248623,7.5,


In [35]:
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,,-1.493539
3,-0.36385,7.5,0.531574
4,2.472778,7.5,
5,-1.248623,7.5,


In [36]:
df.ffill(limit=1).bfill(limit=1)

Unnamed: 0,0,1,2
0,-0.947046,0.101161,0.595884
1,-0.387724,-1.661954,0.047848
2,-0.125111,-1.661954,-1.493539
3,-0.36385,,0.531574
4,2.472778,7.5,0.531574
5,-1.248623,7.5,


### 7.2 Data Transformation

In [37]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [38]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [39]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [40]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [41]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [42]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [43]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
                              

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [44]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [45]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [46]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [47]:
lowercased.map(meat_to_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [48]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [49]:
data.replace(-999, np.nan)


0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [50]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [51]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [52]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [53]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [54]:
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [55]:
data.index = data.index.map(transform)
data.index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [56]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [57]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [58]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [59]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [60]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [61]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [62]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [63]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [64]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [65]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [66]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
cats

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [67]:
cats.value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
dtype: int64

In [68]:
data = np.random.rand(20)
data

array([0.68569301, 0.28390605, 0.4755384 , 0.43319142, 0.14628116,
       0.76615301, 0.23417688, 0.79266715, 0.66376817, 0.30428435,
       0.81917852, 0.77079438, 0.55575066, 0.59955049, 0.39327913,
       0.02144902, 0.49863131, 0.98777828, 0.40560664, 0.93940401])

In [69]:
pd.cut(data, 4, precision=2)

[(0.5, 0.75], (0.26, 0.5], (0.26, 0.5], (0.26, 0.5], (0.02, 0.26], ..., (0.02, 0.26], (0.26, 0.5], (0.75, 0.99], (0.26, 0.5], (0.75, 0.99]]
Length: 20
Categories (4, interval[float64]): [(0.02, 0.26] < (0.26, 0.5] < (0.5, 0.75] < (0.75, 0.99]]

In [70]:
data = np.random.randn(1000)
cats = pd.qcut(data,4)
cats

[(-3.464, -0.644], (0.0412, 0.689], (0.0412, 0.689], (-0.644, 0.0412], (-0.644, 0.0412], ..., (0.689, 3.034], (-0.644, 0.0412], (0.689, 3.034], (0.689, 3.034], (0.689, 3.034]]
Length: 1000
Categories (4, interval[float64]): [(-3.464, -0.644] < (-0.644, 0.0412] < (0.0412, 0.689] < (0.689, 3.034]]

In [71]:
pd.value_counts(cats)

(0.689, 3.034]      250
(0.0412, 0.689]     250
(-0.644, 0.0412]    250
(-3.464, -0.644]    250
dtype: int64

In [72]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-3.464, -1.189], (0.0412, 1.3], (0.0412, 1.3], (-1.189, 0.0412], (-1.189, 0.0412], ..., (0.0412, 1.3], (-1.189, 0.0412], (0.0412, 1.3], (1.3, 3.034], (0.0412, 1.3]]
Length: 1000
Categories (4, interval[float64]): [(-3.464, -1.189] < (-1.189, 0.0412] < (0.0412, 1.3] < (1.3, 3.034]]

## Problems from last week
### Joel R.

This is an API that might be cool to to use also in the next chapter when we use regular expressions. It can search for lyrics, using the artist/band and song 

The url is 
<a href="https://api.lyrics.ovh/v1/"> https://api.lyrics.ovh/v1/ </a>

First, we define the parameters of the search (band AND song)

```python
artist = "queen" #try to use full names
song = "bohemian rhapsody" #with spaces
# This is how the url would look:
url = 'https://api.lyrics.ovh/v1/' #an API that searches lyrics
url= url + artist.lower() + "/" + song.lower()
```

1. How would you interact with the API in python to get the lyrics?

In [73]:
import requests
artist = "Katastro"
song = "Scoreboard"
url = 'https://api.lyrics.ovh/v1/'
url = url + artist.lower() + '/' + song.lower()
url

'https://api.lyrics.ovh/v1/katastro/scoreboard'

In [74]:
query = requests.get(url)

2. What is returned?

In [75]:
query # Not on the site

<Response [404]>

In [76]:
artist = "Sublime"
song = "What I Got"
url = 'https://api.lyrics.ovh/v1/'
url = url + artist.lower() + '/' + song.lower()
url

'https://api.lyrics.ovh/v1/sublime/what i got'

In [77]:
query = requests.get(url)

In [78]:
query

<Response [200]>

3. what type of object is returned by the request?

In [79]:
response = query.json()
response

{'lyrics': "Early in the morning, risin' to the street\nLight me up that cigarette, and I'll strap shoes on my feet (Dee dee dee dee dee)\nGot to find the reason, reason things went wrong\nGot to find a reason why my money's all gone\n\nI got a dalmatian, and I can still get high\nI can play the guitar like a motherfucking riot, uh!\n(Loo da-doo da-doo doo)\n(A-loo da-doo doo doo doo doo da-doo doo)\n\nWell, life is (too short), so love the one you got\n'Cause you might get run over or you might get shot\nNever start no static I just get it off my chest\nNever had to battle with no bulletproof vest\n\nTake a small example, take a ti-di-ti-tip from me\nTake all of your money give it all to charity\nLove is what I got , it's within my reach\nAnd the Sublime style's still straight from Long Beach\n\nIt all comes back to you, you're bound to get what you deserve\nTry and test that, you're bound to get served\nLove's what I got, don't start a riot\nYou'll feel it when the dance gets hot\n\n

4. How can you unpack the lyrics into a string object?

In [80]:
lyrics = response['lyrics']

In [81]:
print(lyrics)

Early in the morning, risin' to the street
Light me up that cigarette, and I'll strap shoes on my feet (Dee dee dee dee dee)
Got to find the reason, reason things went wrong
Got to find a reason why my money's all gone

I got a dalmatian, and I can still get high
I can play the guitar like a motherfucking riot, uh!
(Loo da-doo da-doo doo)
(A-loo da-doo doo doo doo doo da-doo doo)

Well, life is (too short), so love the one you got
'Cause you might get run over or you might get shot
Never start no static I just get it off my chest
Never had to battle with no bulletproof vest

Take a small example, take a ti-di-ti-tip from me
Take all of your money give it all to charity
Love is what I got , it's within my reach
And the Sublime style's still straight from Long Beach

It all comes back to you, you're bound to get what you deserve
Try and test that, you're bound to get served
Love's what I got, don't start a riot
You'll feel it when the dance gets hot

'Cause lovin' is what I got, I said r

### Min-Yao

This question is related to 6.2 Binary Data Formats: Reading Microsoft Excel Files. We are going to look at the list of ballroom dance athletes on World Dancesport Federation. 
Here is the [link](https://www.worlddancesport.org/Athlete/List)

1. Please download all data from the website and make a dataFrame.

In [82]:
link = 'https://www.worlddancesport.org/Athlete/List'
tables = pd.read_html(link)
df = tables[0]
print(df.shape)
df

(25, 7)


Unnamed: 0,Name,Surname,Country,Category,Status,Min,Unnamed: 6
0,Aliaksandr,Samosiuk,BLR,Adult,Active,10055023,
1,Roberto,Destri,ITA,Senior IIIb,Active,10003739,
2,Bernhard,Fuss,GER,Senior IIIa,Active,10003849,
3,Sonja,Fuss,GER,Senior IIIa,Active,10003850,
4,Dieter,Keppeler,GER,Senior IVa,Active,10003823,
5,Manuela,Schraut-Keppeler,GER,Senior IIIb,Active,10003824,
6,Guido,Schubert,GER,Senior IIIa,Active,10006813,
7,Birte,Schubert,GER,Senior IIIa,Active,10006814,
8,Vitam,Kodelja,SLO,Senior IVb,Active,10003561,
9,Barbara,Kodelja,SLO,Senior IIIa,Active,10003562,


2. Please calculate the number of athletes from each country. Which countries are the top 3 countries have the highest number of athletes in dancesport?

In [83]:
df['Country'].value_counts()

GER    20
SLO     2
ITA     1
BLR     1
AUT     1
Name: Country, dtype: int64

3. Let's focus on the top1 country that you find from question 2. Make a new excel file with only the athletes from this country and please calculate the number of athletes from each age group in this country.

In [84]:
new_df = df[df['Country'] == 'GER']
new_df

Unnamed: 0,Name,Surname,Country,Category,Status,Min,Unnamed: 6
2,Bernhard,Fuss,GER,Senior IIIa,Active,10003849,
3,Sonja,Fuss,GER,Senior IIIa,Active,10003850,
4,Dieter,Keppeler,GER,Senior IVa,Active,10003823,
5,Manuela,Schraut-Keppeler,GER,Senior IIIb,Active,10003824,
6,Guido,Schubert,GER,Senior IIIa,Active,10006813,
7,Birte,Schubert,GER,Senior IIIa,Active,10006814,
11,Michael,Feld,GER,Senior IVa,Active,10003779,
12,Heide,Glaser,GER,Senior IIIb,Active,10003780,
13,Alexander,Valko,GER,Senior IIIb,Active,10003575,
14,Barbara,Valko,GER,Senior IVa,Active,10003576,


In [85]:
new_df.to_excel('examples/German_Athletes.xlsx')

In [86]:
new_df['Category'].value_counts()

Senior IIIb    8
Senior IIIa    7
Senior IVa     4
Senior IVb     1
Name: Category, dtype: int64

### Kae

I want to bake a cake for my friend's birthday. 
I found [this](http://www.recipepuppy.com/api/) API with lots of recipes, and I want to find a recipe for a **cake** with **nutella** in it. However, my friend is allergic to eggs, so I can't use any recipes that include eggs.

Please find recipes (and their URLs) of cakes that won't make my friend sick!

In [87]:
## Details on api found at http://www.recipepuppy.com/about/api/
url = 'http://www.recipepuppy.com/api/'
parameters = {'i' : 'nutella', 'q' : 'cake'}
response = requests.get(url, params = parameters)
response

<Response [200]>

In [88]:
data = response.json()
data

{'title': 'Recipe Puppy',
 'version': 0.1,
 'href': 'http://www.recipepuppy.com/',
 'results': [{'title': 'Ferrero Rocher Chocolate Cheeese Cake Minis Recipe',
   'href': 'http://www.grouprecipes.com/80642/ferrero-rocher-chocolate-cheeese-cake-minis.html',
   'ingredients': 'nutella',
   'thumbnail': 'http://img.recipepuppy.com/347493.jpg'},
  {'title': 'Nutella Bundt Cake Recipe',
   'href': 'http://www.grouprecipes.com/77607/nutella-bundt-cake.html',
   'ingredients': 'buttermilk, powdered sugar, eggs, hazelnut syrup, nutella',
   'thumbnail': ''},
  {'title': 'Ultimate Cravings Mini  Cake Recipe',
   'href': 'http://www.grouprecipes.com/21753/ultimate-cravings-mini-cake.html',
   'ingredients': 'flour, raspberry jam, peanut butter, nutella, brown sugar, chocolate liqueur, syrup',
   'thumbnail': 'http://img.recipepuppy.com/395741.jpg'},
  {'title': 'Gianduia Mousse Cake Recipe',
   'href': 'http://www.grouprecipes.com/31372/gianduia-mousse-cake.html',
   'ingredients': 'whipped crea

In [89]:
df = pd.DataFrame(data['results'], columns= ['title', 'ingredients', 'href'])
df

Unnamed: 0,title,ingredients,href
0,Ferrero Rocher Chocolate Cheeese Cake Minis Re...,nutella,http://www.grouprecipes.com/80642/ferrero-roch...
1,Nutella Bundt Cake Recipe,"buttermilk, powdered sugar, eggs, hazelnut syr...",http://www.grouprecipes.com/77607/nutella-bund...
2,Ultimate Cravings Mini Cake Recipe,"flour, raspberry jam, peanut butter, nutella, ...",http://www.grouprecipes.com/21753/ultimate-cra...
3,Gianduia Mousse Cake Recipe,"whipped cream, eggs, butter, nutella, sugar, h...",http://www.grouprecipes.com/31372/gianduia-mou...
4,Cream Cheese and Nutella Filled Pound Cake,"pound cake, cream cheese, nutella, powdered su...",http://www.recipezaar.com/Cream-Cheese-and-Nut...
5,Baked Cake Recipe,"eggs, nutella, butter, baking powder, salt, va...",http://www.grouprecipes.com/25302/baked-cake.html
6,Chocolate Strawberry Pound Cake,"pound cake, nutella, strawberries",http://recipe.aol.com/recipe/chocolate-strawbe...
7,Cinnamon Nutella Cake,"baking powder, butter, caster sugar, eggs, cin...",http://www.recipezaar.com/Cinnamon-Nutella-Cak...
8,Sky-high Layered Ice Cream Cake,"heavy cream, nutella, chocolate ice cream, str...",http://www.foodnetwork.com/recipes/saras-secre...
9,Chocolate-glazed Hazelnut Mousse Cake Recipe,"nutella, cocoa powder, cocoa powder, flour, ga...",http://www.grouprecipes.com/46881/chocolate-gl...


In [90]:
df[df['ingredients'].str.contains(" egg")!=True]

Unnamed: 0,title,ingredients,href
0,Ferrero Rocher Chocolate Cheeese Cake Minis Re...,nutella,http://www.grouprecipes.com/80642/ferrero-roch...
2,Ultimate Cravings Mini Cake Recipe,"flour, raspberry jam, peanut butter, nutella, ...",http://www.grouprecipes.com/21753/ultimate-cra...
4,Cream Cheese and Nutella Filled Pound Cake,"pound cake, cream cheese, nutella, powdered su...",http://www.recipezaar.com/Cream-Cheese-and-Nut...
5,Baked Cake Recipe,"eggs, nutella, butter, baking powder, salt, va...",http://www.grouprecipes.com/25302/baked-cake.html
6,Chocolate Strawberry Pound Cake,"pound cake, nutella, strawberries",http://recipe.aol.com/recipe/chocolate-strawbe...
8,Sky-high Layered Ice Cream Cake,"heavy cream, nutella, chocolate ice cream, str...",http://www.foodnetwork.com/recipes/saras-secre...
9,Chocolate-glazed Hazelnut Mousse Cake Recipe,"nutella, cocoa powder, cocoa powder, flour, ga...",http://www.grouprecipes.com/46881/chocolate-gl...


## Problem for this week
Practice with missing data
1. Create a DataFrame using
```python
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 10), columns= ['a','b','c','d','e','f','g','h','i','j'])
df.iloc[:7, 1:3] = NA
df.iloc[4:8, 5:8] = NA
df.iloc[:,9] = NA
df
```

In [91]:
df = pd.DataFrame(np.random.randn(10, 10), columns= ['a','b','c','d','e','f','g','h','i','j'])
df.iloc[:7, 1:3] = NA
df.iloc[4:8, 5:8] = NA
df.iloc[:,9] = NA
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,-0.485757,,,-1.925222,0.999875,-1.610897,2.829201,-0.36925,-0.194849,
1,-0.37235,,,-0.653888,0.031172,-0.245609,-0.980438,0.34723,-0.161316,
2,0.357593,,,-0.581407,0.327089,2.128128,-1.234195,-1.03366,-0.622067,
3,1.002534,,,-0.606274,-0.427468,0.18918,0.699466,0.836803,1.757403,
4,1.698762,,,0.226109,0.912496,,,,-0.636272,
5,1.660935,,,1.346899,0.573859,,,,-0.115911,
6,-1.36748,,,-0.254199,-0.544449,,,,0.751524,
7,-0.273865,1.346217,1.06788,-1.002703,0.19967,,,,-0.419357,
8,-0.213522,1.131751,-1.142564,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013,
9,-0.784597,0.920107,-0.664024,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756,


2. Remove all rows where at least half the columns have missing data

In [92]:
df.dropna(thresh = df.shape[0]/2)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,-0.485757,,,-1.925222,0.999875,-1.610897,2.829201,-0.36925,-0.194849,
1,-0.37235,,,-0.653888,0.031172,-0.245609,-0.980438,0.34723,-0.161316,
2,0.357593,,,-0.581407,0.327089,2.128128,-1.234195,-1.03366,-0.622067,
3,1.002534,,,-0.606274,-0.427468,0.18918,0.699466,0.836803,1.757403,
7,-0.273865,1.346217,1.06788,-1.002703,0.19967,,,,-0.419357,
8,-0.213522,1.131751,-1.142564,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013,
9,-0.784597,0.920107,-0.664024,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756,


3. Remove all columns where at least half the rows have missing data

In [93]:
df.dropna(axis = 1, thresh = df.shape[1]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,-0.485757,-1.925222,0.999875,-1.610897,2.829201,-0.36925,-0.194849
1,-0.37235,-0.653888,0.031172,-0.245609,-0.980438,0.34723,-0.161316
2,0.357593,-0.581407,0.327089,2.128128,-1.234195,-1.03366,-0.622067
3,1.002534,-0.606274,-0.427468,0.18918,0.699466,0.836803,1.757403
4,1.698762,0.226109,0.912496,,,,-0.636272
5,1.660935,1.346899,0.573859,,,,-0.115911
6,-1.36748,-0.254199,-0.544449,,,,0.751524
7,-0.273865,-1.002703,0.19967,,,,-0.419357
8,-0.213522,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013
9,-0.784597,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756


4. Combine problems 2 and 3. Does the order matter?

In [94]:
df.dropna(thresh = df.shape[0]/2).dropna(axis = 1, thresh = df.shape[1]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,-0.485757,-1.925222,0.999875,-1.610897,2.829201,-0.36925,-0.194849
1,-0.37235,-0.653888,0.031172,-0.245609,-0.980438,0.34723,-0.161316
2,0.357593,-0.581407,0.327089,2.128128,-1.234195,-1.03366,-0.622067
3,1.002534,-0.606274,-0.427468,0.18918,0.699466,0.836803,1.757403
7,-0.273865,-1.002703,0.19967,,,,-0.419357
8,-0.213522,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013
9,-0.784597,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756


In [95]:
df.dropna(axis = 1, thresh = df.shape[1]/2).dropna(thresh = df.shape[0]/2)

Unnamed: 0,a,d,e,f,g,h,i
0,-0.485757,-1.925222,0.999875,-1.610897,2.829201,-0.36925,-0.194849
1,-0.37235,-0.653888,0.031172,-0.245609,-0.980438,0.34723,-0.161316
2,0.357593,-0.581407,0.327089,2.128128,-1.234195,-1.03366,-0.622067
3,1.002534,-0.606274,-0.427468,0.18918,0.699466,0.836803,1.757403
8,-0.213522,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013
9,-0.784597,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756


5. Remove all rows which have NA in either column 'b' or 'f'

In [96]:
df.dropna(subset=['b','f'])

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
8,-0.213522,1.131751,-1.142564,1.595441,0.570184,0.485335,-1.236071,-0.561821,0.284013,
9,-0.784597,0.920107,-0.664024,-0.176445,0.292323,0.720189,0.845043,1.67386,-0.291756,
