In [71]:
import pandas as pd
import numpy as np

# Read the pipe-delimited sample file into a DataFrame and display it.
csv_path = 'String_processing.csv'
df = pd.read_csv(csv_path, sep='|')
df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost
0,India,RAHUL,91-9737964925,12
1,India,Rohit,91-9737964925,($12)
2,India,Panda,91-9737964925,($15)
3,USA,albert,91-9737964925,($18)
4,USA,np.nan,91-9737964925,($14)
5,n,dinNO,91-9737964925,($13)
6,AUS,Markus,91-9737964925,($11)
7,AUS,Renata,91-9737964925,($9)
8,CANA,Tim,91-9737964925,($8)
9,CANA,Tom,91-9737964925,($7)


In [6]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Country        11 non-null     object
 1   Users          11 non-null     object
 2   Phone_numbers  11 non-null     object
 3   Plan Cost      11 non-null     object
dtypes: object(4)
memory usage: 484.0+ bytes


**Converting columns to string dtype**

In [7]:
# Cast the two text columns from the generic object dtype to pandas'
# dedicated 'string' dtype, then re-inspect the frame to confirm the change.
for text_column in ('Country', 'Users'):
    df[text_column] = df[text_column].astype('string')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Country        11 non-null     string
 1   Users          11 non-null     string
 2   Phone_numbers  11 non-null     object
 3   Plan Cost      11 non-null     object
dtypes: object(2), string(2)
memory usage: 484.0+ bytes


**STRING FUNCTIONS**

In [8]:
# 1. Length (number of characters) of every value in the Users column

user_names = df['Users']
user_names.str.len()

0     5
1     5
2     5
3     6
4     6
5     5
6     6
7     6
8     3
9     3
10    6
Name: Users, dtype: Int64

In [13]:
# 2. Counting how many times a pattern occurs in each value of a column
#    (case-sensitive: 'India' contains no upper-case 'A', so it counts 0)

df['Country'].str.count(pat='A')

0     0
1     0
2     0
3     1
4     1
5     0
6     1
7     1
8     2
9     2
10    2
Name: Country, dtype: Int64

In [16]:
# 3. Matching a pattern at the START of each value under a column
#    str.match only anchors at the beginning, so 'USA' matches 'US';
#    use str.fullmatch when the whole value has to equal the pattern.

starts_with_us = df['Country'].str.match('US')
starts_with_us
# df[starts_with_us]  # filtering the match

0     False
1     False
2     False
3      True
4      True
5     False
6     False
7     False
8     False
9     False
10    False
Name: Country, dtype: boolean

In [20]:
# 4. Checking whether each value under a column contains a sequence of
#    characters anywhere inside it (so 'AUS' is True as well as 'USA')

contains_us = df['Country'].str.contains('US')
contains_us
# df[contains_us]  # filtering out the match

0     False
1     False
2     False
3      True
4      True
5     False
6      True
7      True
8     False
9     False
10    False
Name: Country, dtype: boolean

In [27]:
# The r prefix makes a raw string: backslashes are kept literally instead of
# starting escape sequences, which is the usual way to write regex patterns.
# Each single-character class below matches exactly that character: '91-'.
pattern = r'[9][1][-]'
has_prefix = df['Phone_numbers'].str.contains(pattern)
has_prefix
# has_prefix.count()  # number of non-null results (use .sum() to count the True values)

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
Name: Phone_numbers, dtype: bool

In [34]:
# 5. Changing the case of each string value under a column

users = df['Users']
users.str.upper()

# Other methods to try on the same column:
# users.str.lower()            # all lower case
# users.str.swapcase()         # flip the case of every character
# users.str.startswith('L')    # boolean test on the first character(s)
# users.str.endswith('R')      # boolean test on the last character(s)

0      RAHUL
1      ROHIT
2      PANDA
3     ALBERT
4     NP.NAN
5      DINNO
6     MARKUS
7     RENATA
8        TIM
9        TOM
10    TURNER
Name: Users, dtype: string

In [41]:
# 6. Splitting each value in a column with a separator
#
# Variations to try:
# df['Phone_numbers'].str.split('-')                 # list of pieces per row
# df['Phone_numbers'].str.split('-').str.get(1)      # piece at index 1 (the second piece); '91' is at index 0
# df['Phone_numbers'].str.split('-').str[1]          # same as .str.get(1)
# df['Phone_numbers'].str.split('-', expand = True)  # one column per piece
#
# Split on '9' at most 4 times and spread the pieces over separate columns.
phone_numbers = df['Phone_numbers']
phone_numbers.str.split('9', n=4, expand=True)

Unnamed: 0,0,1,2,3,4
0,,1-,737,64,25
1,,1-,737,64,25
2,,1-,737,64,25
3,,1-,737,64,25
4,,1-,737,64,25
5,,1-,737,64,25
6,,1-,737,64,25
7,,1-,737,64,25
8,,1-,737,64,25
9,,1-,737,64,25


In [72]:
# 7. Replacing each value of a column with another value
#
# '$', '(' and ')' are regex metacharacters, and the default of the `regex`
# argument differs between pandas versions. regex=False is passed explicitly
# so each call is a plain substring replacement everywhere.

df['Plan Cost'] = (
    df['Plan Cost']
    .str.replace('$', '', regex=False)       # drop the dollar sign
    .str.replace('(', '', regex=False)       # drop the opening parenthesis
    .str.replace(')', '/ -RS', regex=False)  # swap the closing parenthesis for a '/ -RS' suffix
)
df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost
0,India,RAHUL,91-9737964925,12
1,India,Rohit,91-9737964925,12/ -RS
2,India,Panda,91-9737964925,15/ -RS
3,USA,albert,91-9737964925,18/ -RS
4,USA,np.nan,91-9737964925,14/ -RS
5,n,dinNO,91-9737964925,13/ -RS
6,AUS,Markus,91-9737964925,11/ -RS
7,AUS,Renata,91-9737964925,9/ -RS
8,CANA,Tim,91-9737964925,8/ -RS
9,CANA,Tom,91-9737964925,7/ -RS


In [73]:
# Keep only the first two characters of every cost. The width is fixed, so
# single-digit costs still carry a trailing '/' afterwards (e.g. '9/').
plan_cost = df['Plan Cost']
df['Plan Cost'] = plan_cost.str.slice(0, 2)
df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost
0,India,RAHUL,91-9737964925,12
1,India,Rohit,91-9737964925,12
2,India,Panda,91-9737964925,15
3,USA,albert,91-9737964925,18
4,USA,np.nan,91-9737964925,14
5,n,dinNO,91-9737964925,13
6,AUS,Markus,91-9737964925,11
7,AUS,Renata,91-9737964925,9/
8,CANA,Tim,91-9737964925,8/
9,CANA,Tom,91-9737964925,7/


In [74]:
# 8. Stripping values
#    Remove leading/trailing '/' from Plan Cost and leading/trailing
#    double quotes from Phone_numbers.

for column, unwanted in (('Plan Cost', '/'), ('Phone_numbers', '"')):
    df[column] = df[column].str.strip(unwanted)

df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost
0,India,RAHUL,91-9737964925,12
1,India,Rohit,91-9737964925,12
2,India,Panda,91-9737964925,15
3,USA,albert,91-9737964925,18
4,USA,np.nan,91-9737964925,14
5,n,dinNO,91-9737964925,13
6,AUS,Markus,91-9737964925,11
7,AUS,Renata,91-9737964925,9
8,CANA,Tim,91-9737964925,8
9,CANA,Tom,91-9737964925,7


In [75]:
# Creating a new string column holding the same currency label on every row.
# The label starts with a space so it stays separated from whatever text it
# is later appended to.

currency_label = ' Ksh'
df['Currency'] = currency_label

In [76]:
# Display the frame to confirm the new Currency column is present.
df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost,Currency
0,India,RAHUL,91-9737964925,12,Ksh
1,India,Rohit,91-9737964925,12,Ksh
2,India,Panda,91-9737964925,15,Ksh
3,USA,albert,91-9737964925,18,Ksh
4,USA,np.nan,91-9737964925,14,Ksh
5,n,dinNO,91-9737964925,13,Ksh
6,AUS,Markus,91-9737964925,11,Ksh
7,AUS,Renata,91-9737964925,9,Ksh
8,CANA,Tim,91-9737964925,8,Ksh
9,CANA,Tom,91-9737964925,7,Ksh


In [77]:
# 9. Concatenating 2 string columns of a DataFrame
#    Append the Currency text to each Plan Cost value, row by row.

cost_with_currency = df['Plan Cost'].str.cat(df['Currency'])
df['Plan Cost'] = cost_with_currency

# The helper column is no longer needed once its text has been merged in.
del df['Currency']

df

Unnamed: 0,Country,Users,Phone_numbers,Plan Cost
0,India,RAHUL,91-9737964925,12 Ksh
1,India,Rohit,91-9737964925,12 Ksh
2,India,Panda,91-9737964925,15 Ksh
3,USA,albert,91-9737964925,18 Ksh
4,USA,np.nan,91-9737964925,14 Ksh
5,n,dinNO,91-9737964925,13 Ksh
6,AUS,Markus,91-9737964925,11 Ksh
7,AUS,Renata,91-9737964925,9 Ksh
8,CANA,Tim,91-9737964925,8 Ksh
9,CANA,Tom,91-9737964925,7 Ksh
