# Trabajando con Strings

## Introducción a las operaciones de String con Pandas

In [None]:
import numpy as np
# Arithmetic on a NumPy array is applied element-wise, with no explicit loop.
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [None]:
# With a plain Python list, each string is capitalized one at a time
# inside a list comprehension.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [None]:
# The same comprehension breaks as soon as the list contains a missing value:
# None has no .capitalize() method.  The error is the point of this cell, so it
# is caught and printed instead of raised, which keeps "Restart & Run All"
# working for the rest of the notebook.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
try:
    [s.capitalize() for s in data]
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [None]:
# Guard against the missing value by hand: None entries are passed through
# unchanged and .capitalize() is only called on real strings.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
[name.capitalize() if name is not None else None for name in data]

['Peter', 'Paul', None, 'Mary', 'Guido']

In [None]:
import pandas as pd
# Wrap the list (including its None entry) in a pandas Series so the
# vectorized .str methods can be used on it.
names = pd.Series(data)
names

Unnamed: 0,0
0,peter
1,Paul
2,
3,MARY
4,gUIDO


In [None]:
# Vectorized capitalize through the .str accessor; no manual None check needed.
names.str.capitalize()

Unnamed: 0,0
0,Peter
1,Paul
2,
3,Mary
4,Guido


Métodos de string de pandas

|           |                |                |                |
|-----------|----------------|----------------|----------------|
|`len()`    | `lower()`      | `translate()`  | `islower()`    |
|`ljust()`  | `upper()`      | `startswith()` | `isupper()`    |
|`rjust()`  | `find()`       | `endswith()`   | `isnumeric()`  |
|`center()` | `rfind()`      | `isalnum()`    | `isdecimal()`  |
|`zfill()`  | `index()`      | `isalpha()`    | `split()`      |
|`strip()`  | `rindex()`     | `isdigit()`    | `rsplit()`     |
|`rstrip()` | `capitalize()` | `isspace()`    | `partition()`  |
|`lstrip()` | `swapcase()`   | `istitle()`    | `rpartition()` |

In [None]:
# Sample Series of full names ("First Last") used by the examples that follow.
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

In [None]:
# Lower-case every name; the result is a Series of strings.
monte.str.lower()

Unnamed: 0,0
0,graham chapman
1,john cleese
2,terry gilliam
3,eric idle
4,terry jones
5,michael palin


In [None]:
# Length of each name in characters; the result is a Series of numbers.
monte.str.len()

Unnamed: 0,0
0,14
1,11
2,13
3,9
4,11
5,13


In [None]:
# Boolean mask: True where the name begins with an upper-case 'T'.
monte.str.startswith('T')

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,True
5,False


In [None]:
# Split each name on whitespace; every element becomes a list of words.
monte.str.split()

Unnamed: 0,0
0,"[Graham, Chapman]"
1,"[John, Cleese]"
2,"[Terry, Gilliam]"
3,"[Eric, Idle]"
4,"[Terry, Jones]"
5,"[Michael, Palin]"


## Métodos usando expresiones regulares (regex)


| Method    | Description |
|-----------|-------------|
| `match`   | Calls `re.match` on each element, returning a Boolean. |
| `extract` | Calls `re.search` on each element, returning the groups of the first match as strings.|
| `findall` | Calls `re.findall` on each element |
| `replace` | Replaces occurrences of pattern with some other string|
| `contains`| Calls `re.search` on each element, returning a boolean |
| `count`   | Counts occurrences of pattern|
| `split`   | Equivalent to `str.split`, but accepts regexps |
| `rsplit`  | Equivalent to `str.rsplit`, but accepts regexps |

In [None]:
# Extract the first name: the leading run of ASCII letters in each element.
# The character class must be [A-Za-z]; the previous '[A-Za.a]' matched only
# upper-case letters, 'a' and '.', so it stopped after the initial.
monte.str.extract('([A-Za-z]+)', expand=False)

Unnamed: 0,0
0,G
1,J
2,T
3,E
4,T
5,M


In [None]:
# Keep only names whose first character is not an upper-case vowel and whose
# last character is not a lower-case vowel; ^ and $ anchor the pattern to the
# whole string, so non-matching names give an empty list.
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

Unnamed: 0,0
0,[Graham Chapman]
1,[]
2,[Terry Gilliam]
3,[]
4,[Terry Jones]
5,[Michael Palin]


## Otros métodos

| Method | Description |
|--------|-------------|
| `get` | Indexes each element |
| `slice` | Slices each element|
| `slice_replace` | Replaces slice in each element with the passed value|
| `cat`      | Concatenates strings|
| `repeat` | Repeats values |
| `normalize` | Returns Unicode form of strings |
| `pad` | Adds whitespace to left, right, or both sides of strings|
| `wrap` | Splits long strings into lines with length less than a given width|
| `join` | Joins strings in each element of the `Series` with the passed separator|
| `get_dummies` | Extracts dummy variables as a `DataFrame` |

In [None]:
# Show the original Series again as a reference for the next examples.
monte

Unnamed: 0,0
0,Graham Chapman
1,John Cleese
2,Terry Gilliam
3,Eric Idle
4,Terry Jones
5,Michael Palin


In [None]:
# First three characters of every name (method form of monte.str[0:3]).
monte.str.slice(0, 3)

Unnamed: 0,0
0,Gra
1,Joh
2,Ter
3,Eri
4,Ter
5,Mic


In [None]:
# Last name: split on whitespace, then take the final item of each list
# (method form of .str[-1]).
monte.str.split().str.get(-1)

Unnamed: 0,0
0,Chapman
1,Cleese
2,Gilliam
3,Idle
4,Jones
5,Palin


In [None]:
# Pair each name with a '|'-separated string of single-letter codes.
full_monte = pd.DataFrame({
    'name': monte,
    'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [None]:
# Expand the '|'-separated codes into one 0/1 indicator column per code.
full_monte['info'].str.get_dummies(sep='|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


# Ejemplo: Base de Datos de Recetas

In [None]:
repo = "https://raw.githubusercontent.com/jakevdp/open-recipe-data/master"
! mkdir data
!cd data && curl -O {repo}/recipeitems.json.gz
!gunzip data/recipeitems.json.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.3M  100 29.3M    0     0  42.6M      0 --:--:-- --:--:-- --:--:-- 42.5M


In [None]:
# lines=True makes pandas read the file as one JSON object per line.
recipes = pd.read_json('data/recipeitems.json', lines=True)
# (rows, columns) of the loaded table.
recipes.shape

(173278, 17)

In [None]:
# Preview the first rows to see which columns are available.
recipes.head()

Unnamed: 0,_id,name,ingredients,url,image,ts,cookTime,source,recipeYield,datePublished,prepTime,description,totalTime,creator,recipeCategory,dateModified,recipeInstructions
0,{'$oid': '5160756b96cc62079cc2db15'},Drop Biscuits and Sausage Gravy,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,http://thepioneerwoman.com/cooking/2013/03/dro...,http://static.thepioneerwoman.com/cooking/file...,{'$date': 1365276011104},PT30M,thepioneerwoman,12,2013-03-11,PT10M,"Late Saturday afternoon, after Marlboro Man ha...",,,,,
1,{'$oid': '5160756d96cc62079cc2db16'},Hot Roast Beef Sandwiches,12 whole Dinner Rolls Or Small Sandwich Buns (...,http://thepioneerwoman.com/cooking/2013/03/hot...,http://static.thepioneerwoman.com/cooking/file...,{'$date': 1365276013902},PT20M,thepioneerwoman,12,2013-03-13,PT20M,"When I was growing up, I participated in my Ep...",,,,,
2,{'$oid': '5160756f96cc6207a37ff777'},Morrocan Carrot and Chickpea Salad,Dressing:\n1 tablespoon cumin seeds\n1/3 cup /...,http://www.101cookbooks.com/archives/moroccan-...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365276015332},,101cookbooks,,2013-01-07,PT15M,A beauty of a carrot salad - tricked out with ...,,,,,
3,{'$oid': '5160757096cc62079cc2db17'},Mixed Berry Shortcake,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,http://thepioneerwoman.com/cooking/2013/03/mix...,http://static.thepioneerwoman.com/cooking/file...,{'$date': 1365276016700},PT15M,thepioneerwoman,8,2013-03-18,PT15M,It's Monday! It's a brand new week! The birds ...,,,,,
4,{'$oid': '5160757496cc6207a37ff778'},Pomegranate Yogurt Bowl,For each bowl: \na big dollop of Greek yogurt\...,http://www.101cookbooks.com/archives/pomegrana...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365276020318},,101cookbooks,Serves 1.,2013-01-20,PT5M,A simple breakfast bowl made with Greek yogurt...,,,,,


In [None]:
# Look at a single recipe (the first row by position) field by field.
recipes.iloc[0]

Unnamed: 0,0
_id,{'$oid': '5160756b96cc62079cc2db15'}
name,Drop Biscuits and Sausage Gravy
ingredients,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
url,http://thepioneerwoman.com/cooking/2013/03/dro...
image,http://static.thepioneerwoman.com/cooking/file...
ts,{'$date': 1365276011104}
cookTime,PT30M
source,thepioneerwoman
recipeYield,12
datePublished,2013-03-11


In [None]:
# Summary statistics of the ingredient text length (in characters) per recipe.
recipes['ingredients'].str.len().describe()

Unnamed: 0,ingredients
count,173278.0
mean,244.617926
std,146.705285
min,0.0
25%,147.0
50%,221.0
75%,314.0
max,9067.0


In [None]:
# Name of the recipe with the longest ingredient text.  idxmax() returns the
# index *label* of the maximum, which is what .loc expects; np.argmax returns a
# position, which only matches the label while the index is the default 0..n-1.
recipes.loc[recipes.ingredients.str.len().idxmax(), 'name']

'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [None]:
# Number of recipes whose description mentions "breakfast" or "Breakfast".
recipes['description'].str.contains('[Bb]reakfast').sum()

3524

In [None]:
# Number of recipes whose ingredients mention "cinnamon" or "Cinnamon".
recipes['ingredients'].str.contains('[Cc]innamon').sum()

np.int64(10526)

In [None]:
# NOTE(review): 'cinamon' (single "n") looks like a deliberate misspelling used
# to count typos in the data -- confirm before "fixing" the pattern.
recipes.ingredients.str.contains('[Cc]inamon').sum()

np.int64(11)

### Recomendador simple de recetas

In [None]:
# Spices to look for in each recipe's ingredient text.
spice_list = [
    'salt', 'pepper', 'oregano', 'sage', 'parsley',
    'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin',
]

In [None]:
import re
# Build one boolean column per spice: True where the ingredient text mentions it.
# The flag must be passed by keyword.  The second positional parameter of
# str.contains is `case`, so a positional re.IGNORECASE was read as a truthy
# `case` value and the search stayed case-sensitive.
spice_df = pd.DataFrame({
    spice: recipes.ingredients.str.contains(spice, flags=re.IGNORECASE)  # case-insensitive match
    for spice in spice_list})
spice_df.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [None]:
# Rows (recipes) flagged for both parsley and paprika.
selection = spice_df.query('parsley & paprika')
# Number of matching recipes.
selection.shape[0]

656

In [None]:
# Look up the names of the selected recipes by their index labels.
recipes.loc[selection.index, 'name']

Unnamed: 0,name
626,Roasted Eggplant and Pickled Beet Sandwiches
750,"Pardina lentils with white wine, Serrano ham a..."
802,Monkfish goujons with romesco sauce
847,Dukkah lamb cutlets with quinoa salad
1248,White Bean Edamame Salad with Bacon and Smoky ...
...,...
172510,Sicilian-style salmon with garlic mushrooms an...
172891,Homemade Spanish Romesco Sauce
172904,Grilled asparagus with vegetable crumble
172905,New potato and chorizo tortilla
