In [1]:
import pandas as pd

In [14]:
# Load the AQI dataset and preview its first rows (head() defaults to 5 rows).
df = pd.read_csv('../../datasets/weather/aqi.csv')
df.head()

Unnamed: 0,time,cityname,aqi,pm2_5,pm10,so2,no2,co,o3,primary_pollutant
0,2014-12-31,阿坝州,53,33,55,3,23,1.0,35.0,PM10
1,2015-01-31,阿坝州,31,18,29,7,10,0.5,45.0,
2,2015-01-30,阿坝州,34,19,30,7,13,0.6,48.0,
3,2015-01-29,阿坝州,31,18,31,7,15,0.5,32.0,
4,2015-01-28,阿坝州,29,18,29,7,14,0.6,27.0,


In [15]:
# Check the dimensions as (rows, columns) to gauge how large the dataset is.
df.shape

(557424, 10)

处理大型数据时，要先看看内存占用情况。

In [16]:
# Summarise column dtypes and non-null counts. memory_usage='deep' asks pandas
# to measure the real footprint of object columns instead of estimating it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557424 entries, 0 to 557423
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   time               557424 non-null  object 
 1   cityname           557424 non-null  object 
 2   aqi                557424 non-null  int64  
 3   pm2_5              557424 non-null  int64  
 4   pm10               557424 non-null  int64  
 5   so2                557424 non-null  int64  
 6   no2                557424 non-null  int64  
 7   co                 557424 non-null  float64
 8   o3                 211516 non-null  float64
 9   primary_pollutant  528587 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 155.2 MB


可以看出，一共占用内存 155.2 MB。每个样本包含 10 个属性：2 个 float 型、5 个 int 型、3 个 object 型（即字符串）。

###### 来计算每种类型的数据一共占用的内存

In [26]:
# Report the total deep memory footprint, in MB, of the columns of each dtype.
# memory_usage() also counts the index, so it is included in every total.
for dtype in ('float64', 'int64', 'object'):
    sum_usage_b = df.select_dtypes(include=[dtype]).memory_usage(deep=True).sum()
    print(f'{dtype}\t型数据一共占用内存\t{sum_usage_b / 1024**2}\tMB')

float64	型数据一共占用内存	8.5057373046875	MB
int64	型数据一共占用内存	21.26416015625	MB
object	型数据一共占用内存	125.47677516937256	MB


###### 查询一下数值类型的数据能表示的数据范围

In [27]:
import numpy as np

In [35]:
# Print the representable range of each candidate numeric dtype so we can
# judge how far the int64 / float64 columns can safely be downcast.
types = ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']
for it in types:
    # np.iinfo only accepts integer dtypes and np.finfo only floating-point
    # ones, so dispatch on the dtype's kind instead of a hard-coded list of
    # names; any other float dtype added to `types` is then handled correctly.
    if np.issubdtype(np.dtype(it), np.integer):
        print(np.iinfo(it))
    else:
        print(np.finfo(it))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

Machine parameters for float32
----------------------------

可以发现 int64 类型数据可表示范围足够大，对于当前数据集有点浪费，于是就要降型

##### int型数据降型

定义一个计算内存的方法

In [36]:
def mem_usage(obj):
    """Return the deep memory footprint of a pandas object as an 'X.XX MB' string.

    For a DataFrame, memory_usage yields one value per column (plus the
    index), so the values are summed. Any other object is treated as a
    Series, whose memory_usage already returns a single total in bytes.
    """
    footprint = obj.memory_usage(deep=True)
    if isinstance(obj, pd.DataFrame):
        footprint = footprint.sum()
    megabytes = footprint / 1024**2
    return f'{megabytes:03.2f} MB'
        

数据类型转换

In [43]:
# Take the int64 columns and let pandas downcast each one to the smallest
# unsigned integer dtype that can hold its values, then print the footprint
# before and after the conversion.
df_int = df.select_dtypes(include=['int64'])
convert_int = df_int.apply(lambda col: pd.to_numeric(col, downcast='unsigned'))
print(mem_usage(df_int), mem_usage(convert_int), sep='\n')

21.26 MB
5.32 MB


##### float 型数据的转换也一样

In [45]:
# Take the float64 columns and let pandas downcast each one to the smallest
# float dtype it supports for the values, then print the footprint before
# and after the conversion.
df_float = df.select_dtypes(include=['float64'])
convert_float = df_float.apply(lambda col: pd.to_numeric(col, downcast='float'))
print(mem_usage(df_float), mem_usage(convert_float), sep='\n')

8.51 MB
4.25 MB


##### 接着再转换最占内存的 string 型数据