In [1]:
import pandas as pd   
import numpy as np 

#  get_dummies() 

In [2]:
df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


To convert categorical variables of a Series into a “dummy”

In [3]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [4]:
df['key'].str.get_dummies()

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


prefix adds a prefix to the the column names which is useful for merging the result with the original

In [5]:
dummies = pd.get_dummies(df['key'] , prefix="Raghav")
dummies

Unnamed: 0,Raghav_a,Raghav_b,Raghav_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


#  Working with text data

In [6]:
pd.Series(["a", "b", "c"])

0    a
1    b
2    c
dtype: object

In [7]:
pd.Series(['A','B','C'],dtype='string')

0    A
1    B
2    C
dtype: string

Convert the integer series into string 

In [8]:
s1 = pd.Series([1,2,np.nan], dtype="Int64")
s1

0       1
1       2
2    <NA>
dtype: Int64

In [9]:
s2 = s1.astype('string')
s2

0       1
1       2
2    <NA>
dtype: string

In [10]:
s = pd.Series(["a", None, "b"], dtype="string")
s

0       a
1    <NA>
2       b
dtype: string

In [11]:
s.str.count("a")

0       1
1    <NA>
2       0
dtype: Int64

In [12]:
s.dropna().str.count("a")

0    1
2    0
dtype: Int64

# String methods

In [13]:
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string"
)

s

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

To convert the string into lower case 

In [14]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

To convert upper case 

In [15]:
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5    <NA>
6    CABA
7     DOG
8     CAT
dtype: string

 length of string 

In [16]:
s.str.len()

0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

# Splitting and replacing strings

In [18]:
s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "r_a_g_h_a_v"], dtype="string")
s2

0          a_b_c
1          c_d_e
2           <NA>
3    r_a_g_h_a_v
dtype: string

In [19]:
s2.str.split("_")

0             [a, b, c]
1             [c, d, e]
2                  <NA>
3    [r, a, g, h, a, v]
dtype: object

Elements in the split lists can be accessed using get or [] notation:

In [20]:
s2.str.split("_").get(1)

['c', 'd', 'e']

In [26]:
s2.str.split("_").str[2]

0       c
1       e
2    <NA>
3       g
dtype: object

It is easy to expand this to return a DataFrame using expand

In [27]:
s2.str.split("_", expand=True)

Unnamed: 0,0,1,2,3,4,5
0,a,b,c,,,
1,c,d,e,,,
2,,,,,,
3,r,a,g,h,a,v


replace optionally uses regular expressions:

In [29]:
s3 = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6    <NA>
7    CABA
8     dog
9     cat
dtype: string

In [36]:
s3.str.replace("^.a|dog","R",case=False ,regex=True)

0       A
1       B
2       C
3     Rba
4     Rca
5        
6    <NA>
7     RBA
8       R
9      Rt
dtype: string

#  Concatenation

Concatenating a single Series into a string

In [43]:
s = pd.Series(["a", "b", "c", "d"], dtype="string")
s.str.cat(sep=" , ")


'a , b , c , d'

In [44]:
s.str.cat()

'abcd'

By default, missing values are ignored. Using na_rep, they can be given a representation

In [46]:
t = pd.Series(["a", "b", np.nan, "d"], dtype="string")
t

0       a
1       b
2    <NA>
3       d
dtype: string

In [47]:

t.str.cat(sep=",")


'a,b,d'

In [48]:
t.str.cat(sep=",", na_rep="-")

'a,b,-,d'

# Concatenating a Series and something list-like into a Series

In [49]:
s

0    a
1    b
2    c
3    d
dtype: string

In [50]:
s.str.cat(['A','B','C','D'])

0    aA
1    bB
2    cC
3    dD
dtype: string

In [51]:
t,s

(0       a
 1       b
 2    <NA>
 3       d
 dtype: string,
 0    a
 1    b
 2    c
 3    d
 dtype: string)

Concatenating a Series and something array-like into a Series

In [54]:
d = pd.concat([t,s ], axis=1)
d

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


In [56]:
s.str.cat(d , na_rep='_')

0    aaa
1    bbb
2    c_c
3    ddd
dtype: string