In [1]:
import numpy as np 
import pandas as pd

In [2]:
data = pd.Series([0.25, 0.5, 0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
data[2]

0.75

In [4]:
data[1:4]

1    0.50
2    0.75
3    1.00
dtype: float64

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
population_dict = {'California': 38333531,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38333531
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [7]:
population['Texas']

26448193

In [8]:
population['California':'Illinois']

California    38333531
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [9]:
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

Pandas DataFrame objects:

In [10]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [11]:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38333531,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [12]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [13]:
states.columns

Index(['population', 'area'], dtype='object')

Thus DataFrame can be thought of as a generalization of a two-dimensional NumPy array, where both rows and columns 
have a generalized index for accesing the data.

In [14]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [15]:
#Constructing DataFrame objects from list of dict:
data = [{'a': i, 'b': 2*i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [16]:
#even some keys in the dictionary are missing, pandas will fill them with NaN 
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


Indexers: loc, iloc,ix - 

In [17]:
data = pd.Series(['a', 'b', 'c'], index = [1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [18]:
data[1] #this is explicit index while indexing

'a'

In [19]:
data[1:3] #this is implicit index when slicing

3    b
5    c
dtype: object

loc attribute allows indexing and slicing that always references the explicit index

In [20]:
data.loc[1]

'a'

In [21]:
data.loc[1:3]

1    a
3    b
dtype: object

iloc attribute allows indexing and slicing that always references the implicit indexing

In [22]:
data.iloc[1]

'b'

In [23]:
data.iloc[1:3]

3    b
5    c
dtype: object

The guided principle of python code is "explicit is always better than implicit"

In [24]:
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38333531,423967,90.416308
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [25]:
states.values

array([[3.83335310e+07, 4.23967000e+05, 9.04163083e+01],
       [2.64481930e+07, 6.95662000e+05, 3.80187404e+01],
       [1.96511270e+07, 1.41297000e+05, 1.39076746e+02],
       [1.95528600e+07, 1.70312000e+05, 1.14806121e+02],
       [1.28821350e+07, 1.49995000e+05, 8.58837628e+01]])

In [26]:
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38333530.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41631,38.01874,139.0767,114.8061,85.88376


In [27]:
states.values[0] # passing a single index to an array accesses a row

array([3.83335310e+07, 4.23967000e+05, 9.04163083e+01])

In [28]:
states['area'] # passing a single "index" to a DataFrame accesses a column

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [29]:
states.iloc[:3, :2]

Unnamed: 0,population,area
California,38333531,423967
Texas,26448193,695662
New York,19651127,141297


In [30]:
states.iloc[:3, :1]

Unnamed: 0,population
California,38333531
Texas,26448193
New York,19651127


In [31]:
states.loc[:'Illinois', :'population']

Unnamed: 0,population
California,38333531
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [32]:
states.loc[:'Illinois', :'area']

Unnamed: 0,population,area
California,38333531,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [33]:
states.ix[:3, :'area']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,population,area
California,38333531,423967
Texas,26448193,695662
New York,19651127,141297


In [34]:
# In the loc indexer we can combine masking and fancy indexing as following:
states.loc[states.density > 100, ['population', 'density']]

Unnamed: 0,population,density
New York,19651127,139.076746
Florida,19552860,114.806121


Index alignment in DataFrame:

In [35]:
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, (2,2)), columns= list('AB'))
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [36]:
B = pd.DataFrame(rng.randint(0, 10, (3,3)), columns= list('BAC'))
B

Unnamed: 0,B,A,C
0,7,4,6
1,9,2,6
2,7,4,3


In [37]:
A + B

Unnamed: 0,A,B,C
0,10.0,26.0,
1,16.0,19.0,
2,,,


In [38]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


# Handling missing data:

# None: pythonic missing data-

In [39]:
import numpy as np
import pandas as pd

In [40]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [41]:
for dtype in ['object', 'int']:
    print("dtype=", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype= object
66 ms ± 440 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype= int
2.08 ms ± 39.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



also if you perform aggregation functions like sum() or min()across an array containing a missing value it will produce an error.

# NaN: missing numerical data-

In [42]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [43]:
1 + np.nan

nan

In [44]:
0 *np.nan

nan

In [45]:
vals2.sum(), vals2.max(), vals2.min()

(nan, nan, nan)

In [46]:
#but it can be solved with the help of following:
np.nansum(vals2), np.nanmax(vals2), np.nanmin(vals2)
# keep in mind that NaN is specifically a floating-point value; there is no equivalent value for integers, strings,
# or any other datatype.

(8.0, 4.0, 1.0)

In [47]:
pd.Series([1, np.nan, 3, None])
# Pandas automatically type-casts when NA vlaues are present

0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64

# Operating on null values:
isnull(), notnull(), dropna(), fillna() are the functions used for operating on missing data

# Hierarchical indexing:
The better way: Pandas multiIndex-

In [48]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]

populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [49]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [50]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [51]:
# to access all data for which the second index is 2010, use pandas slicing notation
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [52]:
# the unstack()method will quickly convert multiply-indexed Series into a conventionally indexed DataFrame
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [53]:
# stack method provides the opposite operation
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [54]:
# if we need to add another column 'under 18' it can be done in a following manner:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [55]:
# now we compute fraction of people under 18 by year:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


# MultiIndex for columns: example-

In [56]:
#hierarchical indices and columns:
index = pd.MultiIndex.from_product([[2013, 2014], [1,2]],
                                    names=['year', 'visit'])

columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['Subject', 'Type'])

#mock some data
data = np.round(np.random.randn(4,6), 1)
data[:, ::2] *= 10
data += 37

#create the dataframe
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data


Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,36.9,43.0,37.6,33.0,36.5
2013,2,24.0,37.4,48.0,39.4,25.0,36.0
2014,1,37.0,36.7,40.0,37.5,45.0,37.9
2014,2,38.0,36.1,14.0,37.4,39.0,35.7


In [57]:
health_data['Guido']

Unnamed: 0_level_0,Type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,43.0,37.6
2013,2,48.0,39.4
2014,1,40.0,37.5
2014,2,14.0,37.4


# Data aggregations on the multi-indices:

In [58]:
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,36.9,43.0,37.6,33.0,36.5
2013,2,24.0,37.4,48.0,39.4,25.0,36.0
2014,1,37.0,36.7,40.0,37.5,45.0,37.9
2014,2,38.0,36.1,14.0,37.4,39.0,35.7


In [59]:
data_mean = health_data.mean(level='year')
data_mean

Subject,Bob,Bob,Guido,Guido,Sue,Sue
Type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,32.5,37.15,45.5,38.5,29.0,36.25
2014,37.5,36.4,27.0,37.45,42.0,36.8


In [60]:
data_mean.mean(axis=1, level='Type')

Type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,35.666667,37.3
2014,35.5,36.883333


In [61]:
#signature in pandas v0.18
#pd.concat(objs, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, 
#          verify_integrity=False, copy=True)

# Combining Datasets: Merge and Join

In [62]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
print(df1); print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


In [63]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [64]:
# many to one joins
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
print(df3); print(df4); print(pd.merge(df3, df4))

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve


# GroupBy: split, Apply,combine

In [65]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'], 
                   'data': range(6)}, columns=['key', 'data'])
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [66]:
df.groupby('key')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000084F35E4828>

In [67]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [70]:
# Aggregate, transform, filter, apply:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A','B','C','A','B','C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns= ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [71]:
#Aggregation:
df.groupby('key').aggregate(['min', np.median, 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [73]:
#another useful pattern is to pass a dictionary mapping column names to operations to be applied on that column:
df.groupby('key').aggregate({'data1': 'min',
                             'data2': 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [74]:
#Filtering:
def filter_func(x):
    return x['data2'].std() > 4

print(df); print(df.groupby('key').std());
print(df.groupby('key').filter(filter_func))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


In [75]:
df.groupby('key').transform(lambda x: x - x.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [77]:
# the apply() method:
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /=x['data2'].sum()
    return x

print(df); print(df.groupby('key').apply(norm_by_data2))

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9


# Pivot tables:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')

In [2]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# let's look for survival rate by gender

titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


This immediately gives us some insight that survival percent of men is quite less as compared to women.
But we need to go one step deeper like this:
We will group by gender and class, select the survived, apply a mean aggregate, combine the resulting group, and then unstack the hierarchical index to reveal the multidimensionality.

In [4]:
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


# Pivot table syntax:

In [5]:
# whatever we did in the previous step can be done easily with the help of pivot_table(pivot table syntax) as:

titanic.pivot_table('survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


This is more readable than groupby approach, and produces the same result.

In [6]:
# Multilevel pivot tables:

age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(18, 80]",0.909091,1.0,0.511628
female,,0.972973,0.9,0.423729
male,"(18, 80]",0.8,0.6,0.215686
male,,0.375,0.071429,0.133663


In [8]:
# we can apply the same strategy when working with columns as well; let's add info on fare paid using pd.qcut to 
#automatically compute quantiles:

fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare,'class'])

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(18, 80]",,1.0,0.714286,0.909091,1.0,0.318182
female,,,0.88,0.444444,0.972973,0.914286,0.391304
male,"(18, 80]",,0.0,0.26087,0.8,0.818182,0.178571
male,,0.0,0.098039,0.125,0.391304,0.030303,0.192308


In [10]:
titanic.pivot_table('survived', ['sex', age], [fare,'class'], fill_value=0)

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(18, 80]",0,1.0,0.714286,0.909091,1.0,0.318182
female,,0,0.88,0.444444,0.972973,0.914286,0.391304
male,"(18, 80]",0,0.0,0.26087,0.8,0.818182,0.178571
male,,0,0.098039,0.125,0.391304,0.030303,0.192308


In [12]:
titanic.pivot_table(index='sex', columns='class',
                    aggfunc = {'survived': sum, 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [13]:
titanic.pivot_table('survived', index='sex', columns='class', margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


# Vectorized string operations:

Introducing pandas string operations:

In [1]:
import numpy as np
x = np.array([2, 3, 5, 7, 11, 13])
x *2

array([ 4,  6, 10, 14, 22, 26])

In [2]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [3]:
# but if we have a missing value it will break.
# Therefore suppose we create a Pandas series with this data:

import pandas as pd
names = pd.Series(data)
names

0    peter
1     Paul
2     MARY
3    gUIDO
dtype: object

In [4]:
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
dtype: object

In [5]:
# if we have a mising value like:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [6]:
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [8]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Eric idle', 'Terry Jones', 'Michael Palin'])

In [9]:
monte.str.lower() # it returns a series of strings

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [11]:
monte.str.len() # some return numbers

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [12]:
monte.str.startswith('T') # some will return boolean values

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [13]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [14]:
# we can extract the first name from each by asking for a contiguous group of characters at the beginning of 
# each element
monte.str.extract('([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael
