In [34]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [35]:
"""
Handling missing data
For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.
We call this a sentinel value that can be easily detected:
NA: not available, NA data may either be data that does not exist or that exists but was not observed
"""
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None   # None is also treated as NA
string_data

'\nHandling missing data\nFor numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.\nWe call this a sentinel value that can be easily detected:\nNA: not available, NA data may either be data that does not exist or that exists but was not observed\n'

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [36]:
"""
dropna
fillna
"""
string_data.dropna()
string_data
string_data.fillna(0)   # fill 0
string_data[string_data.notnull()]

'\ndropna\nfillna\n'

1    artichoke
3      avocado
dtype: object

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

0            0
1    artichoke
2            0
3      avocado
dtype: object

1    artichoke
3      avocado
dtype: object

In [37]:
"""
With DataFrame objects, things are a bit more complex. 
You may want to drop rows or columns that are all NA or only those containing any NAs. 
dropna by default drops any row containing a missing value:
"""
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()
cleaned
data.dropna(how='all')  # only drop rows that are all NA
data[4] = NA
data
data.dropna(axis=1, how='all')
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
df.dropna()
df.dropna(thresh=2)  # keep rows that |non-NA| >= thresh

'\nWith DataFrame objects, things are a bit more complex. \nYou may want to drop rows or columns that are all NA or only those containing any NAs. \ndropna by default drops any row containing a missing value:\n'

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,-1.338612,,
1,-1.307247,,
2,0.277792,,-0.668068
3,-1.949001,,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


Unnamed: 0,0,1,2
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


Unnamed: 0,0,1,2
2,0.277792,,-0.668068
3,-1.949001,,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


In [38]:
# filling in missing data
df
df.fillna(0)
df.fillna({1: 0.5, 2: 0})   # calling fillna with a dict, you can use a different fill value for each column
_ = df.fillna(0, inplace=True)  # set in-place to modify the existing object in-place
df

Unnamed: 0,0,1,2
0,-1.338612,,
1,-1.307247,,
2,0.277792,,-0.668068
3,-1.949001,,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


Unnamed: 0,0,1,2
0,-1.338612,0.0,0.0
1,-1.307247,0.0,0.0
2,0.277792,0.0,-0.668068
3,-1.949001,0.0,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


Unnamed: 0,0,1,2
0,-1.338612,0.5,0.0
1,-1.307247,0.5,0.0
2,0.277792,0.5,-0.668068
3,-1.949001,0.5,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


Unnamed: 0,0,1,2
0,-1.338612,0.0,0.0
1,-1.307247,0.0,0.0
2,0.277792,0.0,-0.668068
3,-1.949001,0.0,-0.056118
4,-0.994766,0.061283,0.028327
5,-1.74009,-0.513514,-0.579992
6,-1.120695,-0.497315,1.920483


In [39]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,1.481407,1.471054,-1.615446
1,-0.762316,1.667152,-0.273457
2,-0.346891,,-0.278784
3,-1.248762,,1.747193
4,-0.050225,,
5,-0.149063,,


Unnamed: 0,0,1,2
0,1.481407,1.471054,-1.615446
1,-0.762316,1.667152,-0.273457
2,-0.346891,1.667152,-0.278784
3,-1.248762,1.667152,1.747193
4,-0.050225,1.667152,1.747193
5,-0.149063,1.667152,1.747193


Unnamed: 0,0,1,2
0,1.481407,1.471054,-1.615446
1,-0.762316,1.667152,-0.273457
2,-0.346891,1.667152,-0.278784
3,-1.248762,1.667152,1.747193
4,-0.050225,,1.747193
5,-0.149063,,1.747193


In [40]:
# data transformation
# removing duplicates
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
# The DataFrame method duplicated returns a boolean Series indicating whether each row is a duplicate (has been observed in a previous row) or not:
data.duplicated()
data.drop_duplicates()
# filter based only on some column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# by default keep the first observed value, passing keep='last' will return the last one
data.drop_duplicates(['k1', 'k2'], keep='last')


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [41]:
# transforming data using a function or mapping
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
# suppose you wanted to add a column indicating the type of animal that each food came from
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# the map method on a Series accepts a function or dict-like object containing a mapping
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data
data['food'].map(lambda x: meat_to_animal[x.lower()])

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [42]:
# replace value
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
data.replace(-999, np.nan)
# pass a list to replace multiple values
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [43]:
# renaming axis indexes
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index.map(transform)
data
# without modifying the original data: rename
data.rename(index=str.title, columns=str.upper)
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [44]:
# discretization and binning, continuous data is often discretized into "bins" for analysis
ages = [18, 20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# divide ages into bins of 18-25, 26-35, 36-60, 61-100
cats = pd.cut(ages, bins)
cats
# returns a special Categorical object
cats.codes
cats.categories
pd.value_counts(cats)
# 'right' means closed(inclusive)
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
"""
If you pass an integer number of bins to cut instead of explicit bin edges, it will compute
equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:
"""
data = np.random.rand(20)
cats2 = pd.cut(data, 4, precision=2)    # precision limits the decimal precision to two digits
cats2
pd.value_counts(cats2)
"""
A closely related function, qcut, bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:
"""
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
pd.value_counts(cats)
# Similar to cut you can pass your own quantiles (numbers between 0 and 1, inclusive):
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


[NaN, (18, 25], (18, 25], (18, 25], (25, 35], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

array([-1,  0,  0,  0,  1,  0,  0,  2,  1,  3,  2,  2,  1], dtype=int8)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

[[18, 26), [18, 26), [18, 26), [18, 26), [26, 36), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 13
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

[NaN, Youth, Youth, Youth, YoungAdult, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 13
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

'\nIf you pass an integer number of bins to cut instead of explicit bin edges, it will compute\nequal-length bins based on the minimum and maximum values in the data.\nConsider the case of some uniformly distributed data chopped into fourths:\n'

[(0.065, 0.29], (0.065, 0.29], (0.74, 0.96], (0.51, 0.74], (0.51, 0.74], ..., (0.29, 0.51], (0.51, 0.74], (0.51, 0.74], (0.51, 0.74], (0.74, 0.96]]
Length: 20
Categories (4, interval[float64]): [(0.065, 0.29] < (0.29, 0.51] < (0.51, 0.74] < (0.74, 0.96]]

(0.74, 0.96]     7
(0.51, 0.74]     7
(0.29, 0.51]     3
(0.065, 0.29]    3
dtype: int64

'\nA closely related function, qcut, bins the data based on sample quantiles. Depending\non the distribution of the data, using cut will not usually result in each bin having the\nsame number of data points. Since qcut uses sample quantiles instead, by definition\nyou will obtain roughly equal-size bins:\n'

[(-0.0538, 0.67], (-0.679, -0.0538], (-0.679, -0.0538], (-0.679, -0.0538], (-2.7319999999999998, -0.679], ..., (-0.0538, 0.67], (-2.7319999999999998, -0.679], (-2.7319999999999998, -0.679], (-0.0538, 0.67], (-0.0538, 0.67]]
Length: 1000
Categories (4, interval[float64]): [(-2.7319999999999998, -0.679] < (-0.679, -0.0538] < (-0.0538, 0.67] < (0.67, 2.878]]

(0.67, 2.878]                    250
(-0.0538, 0.67]                  250
(-0.679, -0.0538]                250
(-2.7319999999999998, -0.679]    250
dtype: int64

[(-0.0538, 1.361], (-1.248, -0.0538], (-1.248, -0.0538], (-1.248, -0.0538], (-1.248, -0.0538], ..., (-0.0538, 1.361], (-1.248, -0.0538], (-1.248, -0.0538], (-0.0538, 1.361], (-0.0538, 1.361]]
Length: 1000
Categories (4, interval[float64]): [(-2.7319999999999998, -1.248] < (-1.248, -0.0538] < (-0.0538, 1.361] < (1.361, 2.878]]

In [45]:
# detecting and filtering outliers
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
# suppose you wanted to find values in one of the columns exceeding 3 in absolute value:
col = data[2]
col[np.abs(col) > 3]
# To select all rows having a value exceeding 3 or –3, you can use the any method on a boolean DataFrame:
data[(np.abs(data) > 3).any(1)]
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
# np.sign(data) produces 1 and -1 values based on whether the values in data are positive or negative
np.sign(data).head()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.03664,-0.01817,0.006584,0.110594
std,1.003683,0.988902,0.985568,1.012155
min,-3.160948,-3.214273,-3.128082,-3.060592
25%,-0.643485,-0.678188,-0.650528,-0.585655
50%,0.055015,-0.07759,0.003877,0.12084
75%,0.729034,0.618198,0.660974,0.817493
max,3.075683,3.454051,3.171843,3.418695


516    3.171843
759   -3.128082
Name: 2, dtype: float64

Unnamed: 0,0,1,2,3
35,3.075683,0.314223,-0.08644,0.107742
114,0.099328,-0.39338,0.530284,-3.060592
172,0.638273,3.201298,1.028018,0.13599
516,-1.541038,0.009116,3.171843,-1.7909
525,0.485865,0.975344,-1.614589,3.418695
640,0.754152,3.032145,1.595234,1.200116
668,-0.076308,-3.214273,-1.01519,1.116034
759,0.04554,-1.864289,-3.128082,-1.704669
821,-3.160948,0.984527,1.218199,-0.520556
936,-0.86111,3.454051,0.871704,1.0133


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.036725,-0.018643,0.00654,0.110236
std,1.002956,0.986001,0.98463,1.010682
min,-3.0,-3.0,-3.0,-3.0
25%,-0.643485,-0.678188,-0.650528,-0.585655
50%,0.055015,-0.07759,0.003877,0.12084
75%,0.729034,0.618198,0.660974,0.817493
max,3.0,3.0,3.0,3.0


Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,1.0
1,-1.0,-1.0,1.0,-1.0
2,1.0,1.0,1.0,-1.0
3,1.0,1.0,1.0,-1.0
4,1.0,-1.0,1.0,-1.0


In [46]:
# permutation and random sampling
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
df
df.take(sampler)    # new order
# to select a random subset without replacement
df.sample(n=3)
# to generate a sample allowing repeat choices, pass replace=True
choices = pd.Series([5, 6, -1, 4, 7])
draws = choices.sample(n=10, replace=True)
draws

array([1, 4, 2, 0, 3])

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
2,8,9,10,11


3    4
2   -1
0    5
4    7
4    7
1    6
0    5
1    6
4    7
4    7
dtype: int64

In [47]:
# computing indicator/dummy variables
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
pd.get_dummies(df['key'])
dummies = pd.get_dummies(df['key'], prefix='key')
dummies
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [48]:
# regular expression
import re
text = "foo bar\t baz \tqux"
regex = re.compile('\s+')
regex.split(text)
# get a list of all patterns matching the regex, use the findall method
regex.findall(text)
# search returns only the first match, match only matches at the beginning of the string
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
m = regex.search(text)
m
# search returns a special match object for the first email address in the text
text[m.start():m.end()]
# regex.match returns None, as it only will match if the pattern occurs at the start of the string
print(regex.match(text))
# sub will return a new string with occurrences of the pattern replaced by the new string 
print(regex.sub('REDACTED', text))

['foo', 'bar', 'baz', 'qux']

[' ', '\t ', ' \t']

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

'dave@google.com'

None
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [49]:
"""
Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: username, domain name, and domain suffix. To
do this, put parentheses around the parts of the pattern to segment:
"""
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('robin@bright.com')
m.groups()
regex.findall(text)
# sub also has access to groups in each match using special symbols like \1 and \2, \1 corresponds to the first matched group
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

'\nSuppose you wanted to find email addresses and simultaneously segment each\naddress into its three components: username, domain name, and domain suffix. To\ndo this, put parentheses around the parts of the pattern to segment:\n'

('robin', 'bright', 'com')

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [56]:
# vectorized string functions in pandas
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
"""
You can apply string and regular expression methods can be applied (passing a
lambda or other function) to each value using data.map, but it will fail on the NA
(null) values. To cope with this, Series has array-oriented methods for string operations
that skip NA values. These are accessed through Series’s str attribute; for example,
we could check whether each email address has 'gmail' in it with str.contains:
"""
data.map(lambda x: str(x)+'1')
data.str.contains('gmail')
data.str.findall(pattern, flags=re.IGNORECASE)
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
matches.str.get(1)
data.str[:5]

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

"\nYou can apply string and regular expression methods can be applied (passing a\nlambda or other function) to each value using data.map, but it will fail on the NA\n(null) values. To cope with this, Series has array-oriented methods for string operations\nthat skip NA values. These are accessed through Series’s str attribute; for example,\nwe could check whether each email address has 'gmail' in it with str.contains:\n"

Dave     dave@google.com1
Steve    steve@gmail.com1
Rob        rob@gmail.com1
Wes                  nan1
dtype: object

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object