# Changing Data Types in Pandas

In [2]:
import numpy as np
import pandas as pd

Our Dummy DataFrame:

In [3]:
# Column name -> values; every column holds plain Python ints.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)

In [4]:
# Display the two-column frame built from `d`.
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


Check our default types:

In [5]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    2 non-null      int64
 1   col2    2 non-null      int64
dtypes: int64(2)
memory usage: 160.0 bytes


In [6]:
# Show only the dtype of each column.
df.dtypes

col1    int64
col2    int64
dtype: object

Cast all columns to int32:

In [7]:
# A single dtype is applied to every column; the result is not assigned, so df is unchanged.
df.astype('int32').dtypes

col1    int32
col2    int32
dtype: object

Cast just 'col1' to int32:

In [8]:
# A {column: dtype} mapping converts only the listed columns; col2 keeps int64.
df.astype({'col1': 'int32'}).dtypes

col1    int32
col2    int64
dtype: object

Convert to categorical:

In [9]:
# Convert every column to the categorical dtype (result is only displayed, not stored).
df.astype('category').dtypes

col1    category
col2    category
dtype: object

### Custom Categorical types:

In [10]:
from pandas.api.types import CategoricalDtype

# Ordered categorical dtype whose category order is 2 first, then 1.
cat_dtype = CategoricalDtype([2, 1], ordered=True)

# Apply the custom dtype to col1 only; col2 is not listed and keeps its dtype.
col1_as_cat = {'col1': cat_dtype}
df.astype(col1_as_cat).dtypes

col1    category
col2       int64
dtype: object

In [11]:
# None of the astype calls above were assigned back, so df still shows the original values.
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


The custom dtype defined above has just two categories, `2` and `1`, in that strict order (so `2` sorts before `1`).

So, if we cast `col1` to that dtype and then sort by `col1`, the rows are displayed in reverse order:

In [12]:
# Cast col1 to the ordered categorical first, then sort on it so the
# category order (2 before 1) drives the row order.
col1_categorical = df.astype({'col1': cat_dtype})
col1_categorical.sort_values('col1')

Unnamed: 0,col1,col2
1,2,4
0,1,3


### Dates as Strings to DateTime64:

In [13]:
import datetime as dt

In [14]:
# ISO-formatted dates held as plain strings.
date_strings = ['2023-01-01', '2023-01-02', '2023-01-03']
ser_dates = pd.Series(date_strings)
ser_dates

0    2023-01-01
1    2023-01-02
2    2023-01-03
dtype: object

Casting to `datetime64` enables us to do things like easily and correctly increment dates by a number of days:

In [15]:
# Give the unit explicitly: pandas >= 2.0 raises TypeError for the unit-less
# 'datetime64' spelling, while 'datetime64[ns]' works on old and new versions.
ser_dates.astype('datetime64[ns]') + dt.timedelta(days=40)

0   2023-02-10
1   2023-02-11
2   2023-02-12
dtype: datetime64[ns]

### Strings to numbers:

In [16]:
# Two numeric strings plus one real integer.
s1_values = ['1.0000', '2', -3]
s1 = pd.Series(s1_values)
pd.to_numeric(s1)

0    1.0
1    2.0
2   -3.0
dtype: float64

Downcast the result to the smallest signed integer type that can hold the values:

In [17]:
# 'integer' is the documented alias of 'signed': pick the smallest signed int dtype.
pd.to_numeric(s1, downcast='integer')

0    1
1    2
2   -3
dtype: int8

How to ignore errors (if any value cannot be parsed, the input is returned unchanged):

In [18]:
# 'one' cannot be parsed as a number; the other three values can.
s2_values = ['one', '2.000', '3', 4]
s2 = pd.Series(s2_values)
s2

0      one
1    2.000
2        3
3        4
dtype: object

In [31]:
# errors='ignore' is deprecated (pandas 2.2+); the recommended replacement is
# to catch the parsing failure explicitly and fall back to the untouched input.
try:
    s2_numeric = pd.to_numeric(s2)
except (ValueError, TypeError):
    s2_numeric = s2
s2_numeric

0      one
1    2.000
2        3
3        4
dtype: object

How to force the conversion (values that cannot be parsed become `NaN`):

In [20]:
# errors='coerce' replaces values that cannot be parsed (here 'one') with NaN.
pd.to_numeric(s2, errors='coerce')

0    NaN
1    2.0
2    3.0
3    4.0
dtype: float64

## Automatic Type Conversion of An Entire DataFrame

Create a dummy DataFrame with explicitly specified data types:

In [33]:
# Each column is built as a Series with an explicit dtype, so pandas does not
# infer one; columns c-f each contain one missing value.
df = pd.DataFrame(
    dict(
        a=pd.Series([1, 2, 3], dtype="int32"),
        b=pd.Series(["x", "y", "z"], dtype=object),
        c=pd.Series([True, False, np.nan], dtype=object),
        d=pd.Series(["h", "i", np.nan], dtype=object),
        e=pd.Series([10, np.nan, 20], dtype="float64"),
        f=pd.Series([np.nan, 100.5, 200], dtype="float64"),
    )
)

In [34]:
# Display the mixed-dtype frame; columns c-f each have one missing entry.
df

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [35]:
# Confirm each column carries the dtype requested at construction.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int32  
 1   b       3 non-null      object 
 2   c       2 non-null      object 
 3   d       2 non-null      object 
 4   e       2 non-null      float64
 5   f       2 non-null      float64
dtypes: float64(2), int32(1), object(3)
memory usage: 260.0+ bytes


Convert DataFrame to use "Best Possible" dtypes:

In [37]:
# Store the converted frame under its own name; df is not reassigned.
dfn = df.convert_dtypes()
dfn

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [38]:
# Inspect the dtypes that convert_dtypes selected for each column.
dfn.dtypes

a      Int32
b     string
c    boolean
d     string
e      Int64
f    Float64
dtype: object

## Inferring Object Types

In [26]:
# Cast every column to the generic object dtype so there is something to infer back.
dfo = df.astype(object)
dfo

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [39]:
# Every column should now report the object dtype.
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      object
 2   c       2 non-null      object
 3   d       2 non-null      object
 4   e       2 non-null      object
 5   f       2 non-null      object
dtypes: object(6)
memory usage: 272.0+ bytes


In [28]:
# The inferred frame is only displayed; dfo itself is not reassigned.
dfo.infer_objects()

Unnamed: 0,a,b,c,d,e,f
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


In [40]:
# Check which object columns were given a more specific dtype by infer_objects.
dfo.infer_objects().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int64  
 1   b       3 non-null      object 
 2   c       2 non-null      object 
 3   d       2 non-null      object 
 4   e       2 non-null      float64
 5   f       2 non-null      float64
dtypes: float64(2), int64(1), object(3)
memory usage: 272.0+ bytes
