# 3.11 向量化字符串操作

## 3.11.1 Pandas字符串操作简介

In [12]:
import numpy as np

In [13]:
x = np.array([2, 3, 5, 7, 11, 13])

In [14]:
x

array([ 2,  3,  5,  7, 11, 13])

In [15]:
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [16]:
data = ['peter', 'Paul', 'MARy', 'gUIDO']

In [17]:
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [18]:
data = data + [None]

In [19]:
data

['peter', 'Paul', 'MARy', 'gUIDO', None]

In [20]:
[s.capitalize() for s in data]

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [21]:
import pandas as pd

In [22]:
names = pd.Series(data)

In [23]:
names

0    peter
1     Paul
2     MARy
3    gUIDO
4     None
dtype: object

In [24]:
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
4     None
dtype: object

In [25]:
names.str.isnumeric()

0    False
1    False
2    False
3    False
4     None
dtype: object

## 3.11.2 Pandas字符串方法列表

In [26]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Eric Idle', 'Terry Jones', 'Michael Palin'])

1. 与Python字符串方法相似的方法

In [27]:
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [28]:
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [29]:
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [30]:
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [31]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

2. 使用正则表达式的方法

In [32]:
monte.str.extract('([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [33]:
# 找出所有开头和结尾都是辅音字母的名字
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [34]:
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

3. 其他字符串方法

In [35]:
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [36]:
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [37]:
monte.str.split().str.get(0)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [38]:
full_monte = pd.DataFrame({'name': monte,
                          'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|D|C']})

In [39]:
full_monte

Unnamed: 0,info,name
0,B|C|D,Graham Chapman
1,B|D,John Cleese
2,A|C,Terry Gilliam
3,B|D,Eric Idle
4,B|C,Terry Jones
5,B|D|C,Michael Palin


In [40]:
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


## 3.11.3 案例：食谱数据库

In [41]:
try:
    recipes = pd.read_json('D:/Python3Space/BookStudy/book3/data/20170107-061401-recipeitems.json')
except ValueError as e:
    print("ValerError: ", e)

ValerError:  Trailing data


In [42]:
with open('D:/Python3Space/BookStudy/book3/data/20170107-061401-recipeitems.json') as f:
    line = f.readline()
pd.read_json(line).shape

(2, 12)

In [43]:
# 将文件内容读取成Python数组
with open('D:/Python3Space/BookStudy/book3/data/20170107-061401-recipeitems.json', 'r', encoding='utf-8') as f:
    # 提取每一行的内容
    data = (line.strip() for line in f)
    # 将所有内容合并成一个列表
    data_json = "[{0}]".format(','.join(data))
# 用JSON形式读取数据
recipes = pd.read_json(data_json)

In [44]:
recipes.shape

(173278, 17)

In [45]:
recipes.head()

Unnamed: 0,_id,cookTime,creator,dateModified,datePublished,description,image,ingredients,name,prepTime,recipeCategory,recipeInstructions,recipeYield,source,totalTime,ts,url
0,{'$oid': '5160756b96cc62079cc2db15'},PT30M,,,2013-03-11,"Late Saturday afternoon, after Marlboro Man ha...",http://static.thepioneerwoman.com/cooking/file...,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,Drop Biscuits and Sausage Gravy,PT10M,,,12,thepioneerwoman,,{'$date': 1365276011104},http://thepioneerwoman.com/cooking/2013/03/dro...
1,{'$oid': '5160756d96cc62079cc2db16'},PT20M,,,2013-03-13,"When I was growing up, I participated in my Ep...",http://static.thepioneerwoman.com/cooking/file...,12 whole Dinner Rolls Or Small Sandwich Buns (...,Hot Roast Beef Sandwiches,PT20M,,,12,thepioneerwoman,,{'$date': 1365276013902},http://thepioneerwoman.com/cooking/2013/03/hot...
2,{'$oid': '5160756f96cc6207a37ff777'},,,,2013-01-07,A beauty of a carrot salad - tricked out with ...,http://www.101cookbooks.com/mt-static/images/f...,Dressing:\n1 tablespoon cumin seeds\n1/3 cup /...,Morrocan Carrot and Chickpea Salad,PT15M,,,,101cookbooks,,{'$date': 1365276015332},http://www.101cookbooks.com/archives/moroccan-...
3,{'$oid': '5160757096cc62079cc2db17'},PT15M,,,2013-03-18,It's Monday! It's a brand new week! The birds ...,http://static.thepioneerwoman.com/cooking/file...,Biscuits\n3 cups All-purpose Flour\n2 Tablespo...,Mixed Berry Shortcake,PT15M,,,8,thepioneerwoman,,{'$date': 1365276016700},http://thepioneerwoman.com/cooking/2013/03/mix...
4,{'$oid': '5160757496cc6207a37ff778'},,,,2013-01-20,A simple breakfast bowl made with Greek yogurt...,http://www.101cookbooks.com/mt-static/images/f...,For each bowl: \na big dollop of Greek yogurt\...,Pomegranate Yogurt Bowl,PT5M,,,Serves 1.,101cookbooks,,{'$date': 1365276020318},http://www.101cookbooks.com/archives/pomegrana...


In [46]:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
cookTime                                                          PT30M
creator                                                             NaN
dateModified                                                        NaN
datePublished                                                2013-03-11
description           Late Saturday afternoon, after Marlboro Man ha...
image                 http://static.thepioneerwoman.com/cooking/file...
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
name                                    Drop Biscuits and Sausage Gravy
prepTime                                                          PT10M
recipeCategory                                                      NaN
recipeInstructions                                                  NaN
recipeYield                                                          12
source                                                  thepione

In [47]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [48]:
recipes.name[np.argmax(recipes.ingredients.str.len())]

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

哪些食谱是早餐

In [49]:
recipes.description.str.contains('[Bb]reakfast').sum()

3524

有多少食谱用肉桂（cinnamon）作为食材

In [50]:
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

哪些食谱把肉桂写错了，写成了“cinamon”

In [51]:
recipes.ingredients.str.contains('[Cc]inamon').sum()

11

1. 制作简易的美食推荐系统

In [52]:
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley', 'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

In [53]:
import re

In [54]:
spice_df = pd.DataFrame(dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE)) for spice in spice_list))

In [55]:
spice_df.head()

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
0,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [56]:
selection = spice_df.query('parsley & paprika & tarragon')

In [57]:
len(selection)

10

In [58]:
selection

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
2069,False,False,True,True,True,False,False,False,True,False
74964,False,False,True,True,False,False,False,False,True,False
93768,False,False,True,True,True,False,True,True,True,False
113926,False,False,True,True,True,False,False,True,True,False
137686,False,False,True,True,True,False,False,True,True,False
140530,False,False,True,True,True,False,False,True,True,True
158475,True,False,True,True,True,False,False,True,True,False
158486,False,False,True,True,True,False,False,True,True,False
163175,False,True,True,True,True,False,False,True,True,False
165243,False,False,True,True,True,False,False,True,True,False


In [59]:
recipes.name[selection.index]

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object

In [60]:
recipes.name

0                           Drop Biscuits and Sausage Gravy
1                                 Hot Roast Beef Sandwiches
2                        Morrocan Carrot and Chickpea Salad
3                                     Mixed Berry Shortcake
4                                   Pomegranate Yogurt Bowl
5                                        Krispy Easter Eggs
6         Spinach and Kamut Salad with Chili-Orange Dres...
7                                  Eggs in Hash Brown Nests
8                                                   Za'atar
9                          Pistachio Chocolate Chip Muffins
10                                        Glazed Easter Ham
11                                Pretty, Yummy Fruit Salad
12                    No-Bake Chocolate Bottom Pumpkin Tart
13                                          Seeded Popovers
14                        Spinach and Black Bean Quesadilla
15                                 Easter Leftover Sandwich
16                                      

3.11完