In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw sales data; no dtype arguments are passed, so pandas infers each column's type.
df = pd.read_csv("data/sales_data_types.csv")
# Preview the first rows.
df.head()

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
2,23477.0,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
3,24900.0,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
4,651029.0,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N


In [3]:
# Show the dtype pandas assigned to each column.
df.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

In [4]:
# 1. Convert a column's data type with astype
# The result is only displayed, not assigned back, so df itself is left unchanged.
df['Customer Number'].astype("int")

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int32

In [5]:
# 2. Convert data types with a custom function
# df['2016'].astype('int')  # not usable here: the values are strings such as "$125,000.00"
def convert_float(value):
    """Convert a currency-formatted value to a float.

    Parameters
    ----------
    value : str or number
        Text such as "$125,000.00" or "$162500.00". A value that is not a
        string (for example an int, or a float NaN standing in for a missing
        cell) is passed straight to ``float``.

    Returns
    -------
    float
        The numeric amount with "$" and "," removed.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid float literal.
    """
    # Only strings have the "$" / "," decoration; calling .replace on a
    # number would raise AttributeError, so hand those to float() directly.
    if not isinstance(value, str):
        return float(value)
    cleaned = value.replace("$", "").replace(",", "")
    return float(cleaned)

# Apply the named helper to every value in the column:
# df['2016'].apply(convert_float)

# The same conversion written inline as a lambda expression
df['2016'].apply(lambda value: float(value.replace("$", "").replace(",", "")))

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

In [10]:
# 3. np.where
# help(np.where)
# Map the 'Active' flag to integers: 1 where the value equals 'Y', otherwise 0.
np.where(df['Active'] == 'Y',1, 0)

array([1, 1, 1, 1, 0])

In [15]:
# 3. The to_numeric function
# pd.to_numeric(df['Customer Number'], downcast="float")

# errors="coerce": a value that fails to convert is replaced with NaN.
# Caveat for downcast: the result does not necessarily get the dtype that downcast
# asks for, e.g. when NaN appears, because NaN is a float.
# fillna(0) runs after the conversion and only replaces the NaN values with 0.
pd.to_numeric(df['Jan Units'], errors="coerce", downcast="integer").fillna(0)

0    500.0
1    700.0
2    125.0
3     75.0
4      0.0
Name: Jan Units, dtype: float64

In [18]:
# 4. The to_datetime function
# 4.1 Convert an int/float timestamp to a datetime
# import time
# time.time()

# unit="s" tells pandas that the number is expressed in seconds.
pd.to_datetime(1643186131.3147855, unit="s")

Timestamp('2022-01-26 08:35:31')

In [20]:
# 4.2. A string that matches the given datetime format
pd.to_datetime("2022/12/12", format="%Y/%m/%d")

Timestamp('2022-12-12 00:00:00')

In [21]:
# 4.3. Lists, arrays, tuples
# With unit='D' each number is an offset in days counted from the given origin.
pd.to_datetime([2,3,4], unit='D', origin=pd.Timestamp("2022-01-01"))

DatetimeIndex(['2022-01-03', '2022-01-04', '2022-01-05'], dtype='datetime64[ns]', freq=None)

In [23]:
# 4.4. Series or DataFrame input
# Build a Series of date strings, then convert it in one call.
series = pd.Series(['2022-01-01', '2022-01-02', '2022-01-03'])
pd.to_datetime(series)

0   2022-01-01
1   2022-01-02
2   2022-01-03
dtype: datetime64[ns]

In [29]:
# Convert the data types while reading the file, instead of fixing them afterwards.
df1 = pd.read_csv(
    "data/sales_data_types.csv",
    dtype={"Customer Number": "int"},
    converters={
        # Each converter receives one cell of its column as a string.
        # Strip the currency symbol and thousands separators: "$125,000.00" -> 125000.0
        "2016": lambda value: float(value.replace("$", "").replace(",", "")),
        "2017": lambda value: float(value.replace("$", "").replace(",", "")),
        # Percentage text to a fraction: "30.00%" -> 0.3
        "Percent Growth": lambda value: float(value.replace("%", ""))/100,
        # Entries that are not numeric (e.g. "Closed") become NaN.
        "Jan Units": lambda value: pd.to_numeric(value, errors="coerce"),
        # A plain comparison already yields a Python bool for a single cell;
        # np.where on a scalar would wrap each result in a 0-d array instead.
        "Active": lambda value: value == "Y",
    }
)
# astype("bool") is kept so the displayed column is bool regardless of how the
# parser infers the dtype of converter results.
df1['Active'].astype("bool")

0     True
1     True
2     True
3     True
4    False
Name: Active, dtype: bool