In [1]:
# source: https://pbpython.com/currency-cleanup.html

import pandas as pd

df=pd.read_excel('currency_cleanup.xlsx')
df.head()

Unnamed: 0,Customer,Sales
0,Jones Brothers,500
1,Beta Corp,"$1,000.00"
2,Globex Corp,300.1
3,Acme,$750.01
4,Initech,300


In [2]:
df.dtypes

Customer    object
Sales       object
dtype: object

In [3]:
# The '$' and ',' characters in some entries cannot be parsed by float().
# Catch the error and display it so the notebook still runs top to bottom.
try:
    df['Sales'].astype(float)
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: could not convert string to float: '$1,000.00'

In [4]:
# Remove $ and , from the string entries.
# regex=False treats both patterns as literal text: read as a regular
# expression, '$' is the end-of-string anchor rather than the dollar character.
# Entries that are not strings come back as NaN from the .str accessor.

df['Sales']=df['Sales'].str.replace('$', '', regex=False)
df['Sales']=df['Sales'].str.replace(',', '', regex=False)
df

Unnamed: 0,Customer,Sales
0,Jones Brothers,
1,Beta Corp,1000.0
2,Globex Corp,
3,Acme,750.01
4,Initech,
5,Hooli,


### <span style="color:blue"> So far we assumed that an object column contains only strings. In reality, an object column can contain a mixture of multiple types.</span>

The code below shows the Python type of each value in the column:

In [5]:
df1 = df.copy()
df1['Sales'].apply(type)

0    <class 'float'>
1      <class 'str'>
2    <class 'float'>
3      <class 'str'>
4    <class 'float'>
5    <class 'float'>
Name: Sales, dtype: object

### Add a column to show each type

In [6]:
# Tag every row with the name of the Python type held in its Sales cell.
# (For a summary instead: df['Sales'].apply(type).value_counts())
df['Sales_type'] = df['Sales'].apply(
    lambda value: type(value).__name__
)
df

Unnamed: 0,Customer,Sales,Sales_type
0,Jones Brothers,,float
1,Beta Corp,1000.0,str
2,Globex Corp,,float
3,Acme,750.01,str
4,Initech,,float
5,Hooli,,float


In [7]:
# example: the same amount held once as an int and once as a formatted string

number=1235
number_string='$1,235'

print(type(number), type(number_string))

# Drop the '$' and ',' characters from the string, then parse the rest
float(
    number_string
    .replace('$', '')
    .replace(',', '')
)

<class 'int'> <class 'str'>


1235.0

In [8]:
# but when the same code is applied to the number it fails:
# an int has no .replace method. Catch the error and display it so the
# notebook still runs top to bottom.

try:
    float(number.replace(',', '').replace('$', ''))
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'int' object has no attribute 'replace'

When pandas attempts the same operation through the str accessor, it returns NaN for the non-string values instead of raising an error. That’s why the numeric values get converted to NaN.

In [10]:
def clean_currency(x):
    """Strip currency formatting from a single value.

    A string has every ',' and '$' character removed and is returned as a
    string; any other value is handed back unchanged. No numeric conversion
    happens here.
    """
    # Guard clause: only strings carry formatting characters
    if not isinstance(x, str):
        return x
    without_symbol = x.replace('$', '')
    return without_symbol.replace(',', '')

In [11]:
df=pd.read_excel('currency_cleanup.xlsx')
df.head()

Unnamed: 0,Customer,Sales
0,Jones Brothers,500
1,Beta Corp,"$1,000.00"
2,Globex Corp,300.1
3,Acme,$750.01
4,Initech,300


In [12]:
# Strip the currency formatting from each value, then cast the column to float
df['Sales'] = (
    df['Sales']
    .apply(clean_currency)
    .astype(float)
)
# Record the Python type name of each converted value
df['Sales_type'] = df['Sales'].apply(lambda value: type(value).__name__)
df

Unnamed: 0,Customer,Sales,Sales_type
0,Jones Brothers,500.0,float
1,Beta Corp,1000.0,float
2,Globex Corp,300.1,float
3,Acme,750.01,float
4,Initech,300.0,float
5,Hooli,250.0,float


In [13]:
df.dtypes

Customer       object
Sales         float64
Sales_type     object
dtype: object

### Shorten the code

In [14]:
df=pd.read_excel('currency_cleanup.xlsx')
df.head()

Unnamed: 0,Customer,Sales
0,Jones Brothers,500
1,Beta Corp,"$1,000.00"
2,Globex Corp,300.1
3,Acme,$750.01
4,Initech,300


In [15]:
# Inline version of the cleanup: non-strings pass through untouched, strings
# lose their '$' and ',' characters; the column is then cast to float.
df['Sales'] = (
    df['Sales']
    .apply(
        lambda value: value
        if not isinstance(value, str)
        else value.replace('$', '').replace(',', '')
    )
    .astype(float)
)
df

Unnamed: 0,Customer,Sales
0,Jones Brothers,500.0
1,Beta Corp,1000.0
2,Globex Corp,300.1
3,Acme,750.01
4,Initech,300.0
5,Hooli,250.0


## Alternative Solutions

In [16]:
df=pd.read_excel('currency_cleanup.xlsx')
df.head()

Unnamed: 0,Customer,Sales
0,Jones Brothers,500
1,Beta Corp,"$1,000.00"
2,Globex Corp,300.1
3,Acme,$750.01
4,Initech,300


In [17]:
# Both dictionary keys are regular expressions (regex=True).
# The raw string keeps the backslash, so the pattern \$ matches a literal
# dollar sign without Python reporting an invalid escape sequence.
df['Sales']=df['Sales'].replace({',':'', r'\$':''}, regex=True).astype(float)
df

Unnamed: 0,Customer,Sales
0,Jones Brothers,500.0
1,Beta Corp,1000.0
2,Globex Corp,300.1
3,Acme,750.01
4,Initech,300.0
5,Hooli,250.0


Convert the column to a string and safely use str.replace.
Use the dtype argument to read_excel to force the original column of data to be stored as a string:

In [18]:
df=pd.read_excel('currency_cleanup.xlsx', dtype={'Sales':str})
df['Sales'].apply(type).value_counts()

<class 'str'>    6
Name: Sales, dtype: int64

In [19]:
# regex=False makes both replacements literal. Leaving it out relies on a
# default that differs between pandas versions and can emit a FutureWarning.
df['Sales']=(
    df['Sales']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,Customer,Sales
0,Jones Brothers,500.0
1,Beta Corp,1000.0
2,Globex Corp,300.1
3,Acme,750.01
4,Initech,300.0
5,Hooli,250.0


Since all values are stored as strings, the replacement code works as expected and does not incorrectly convert some values to NaN.

In [20]:
df=pd.read_excel('currency_cleanup.xlsx', dtype={'Sales':str})

In [21]:
# As a regular expression a bare '$' is the end-of-string anchor, so it
# cannot be used on its own to remove the dollar sign. Inside a character
# class it is literal: [$,] matches either '$' or ',' and both are removed
# in a single pass.
df['Sales']=df['Sales'].str.replace(r'[$,]', '', regex=True).astype(float)
df

Unnamed: 0,Customer,Sales
0,Jones Brothers,500.0
1,Beta Corp,1000.0
2,Globex Corp,300.1
3,Acme,750.01
4,Initech,300.0
5,Hooli,250.0
