# 大数据处理技巧

In [None]:
import pandas as pd

In [None]:
# Load the game-log CSV from the local data directory into a DataFrame.
gl = pd.read_csv('./data/game_logs.csv')
# Preview the first rows of the loaded frame.
gl.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
gl.shape

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to
# inspect object columns for their real footprint instead of estimating it.
gl.info(memory_usage='deep')

In [None]:
# Report the average per-column memory footprint (in MB) for each dtype.
for dtype in ['float64','int64','object']:
    selected_dtype = gl.select_dtypes(include=[dtype])
    # index=False keeps the frame's index out of the result, so the mean is
    # taken over the selected columns only and is not skewed by an extra
    # "Index" entry that does not belong to this dtype.
    mean_usage_b = selected_dtype.memory_usage(index=False, deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print('平均内存占用', dtype, mean_usage_mb)

In [None]:
import numpy as np

# Show the machine limits numpy reports for each integer type, so the
# candidate downcast targets can be compared.
int_types = ['uint8', 'int8', 'int16', 'int32', 'int64']
for it in int_types:
    type_limits = np.iinfo(it)
    print(type_limits)

In [None]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an 'X.XX MB' string.

    For a DataFrame, the per-column values reported by ``memory_usage`` are
    summed; for any other object (e.g. a Series) the single value returned by
    ``memory_usage`` is used directly.
    """
    usage_b = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame.memory_usage yields one entry per column; collapse to a total.
        usage_b = usage_b.sum()
    # Bytes -> mebibytes.
    usage_mb = usage_b / 1024 ** 2
    return '{:03.2f} MB'.format(usage_mb)

# Downcast the int64 columns with pd.to_numeric(downcast='unsigned') and
# report the memory footprint before and after.
gl_int = gl.select_dtypes(include=['int64'])
converted_int = gl_int.apply(
    lambda column: pd.to_numeric(column, downcast='unsigned')
)
for frame in (gl_int, converted_int):
    print(mem_usage(frame))

```
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html
```

In [None]:
# Downcast the float64 columns with pd.to_numeric(downcast='float') and
# report the memory footprint before and after.
gl_float = gl.select_dtypes(include=['float64'])
converted_float = gl_float.apply(
    lambda column: pd.to_numeric(column, downcast='float')
)

for frame in (gl_float, converted_float):
    print(mem_usage(frame))

In [None]:
# Start from a copy of the original frame and overwrite the numeric columns
# with their downcast versions, leaving `gl` itself untouched.
optimized_gl = gl.copy()

for converted in (converted_int, converted_float):
    optimized_gl[converted.columns] = converted

# Footprint of the original frame, then of the optimized copy.
for frame in (gl, optimized_gl):
    print(mem_usage(frame))

In [None]:
# Work on an independent copy of just the object-dtype columns.
gl_obj = gl.select_dtypes(include = ['object']).copy()
# Summary statistics for those columns.
gl_obj.describe()

In [None]:
# Use the day_of_week column as a worked example.
dow = gl_obj.day_of_week
dow.head()

In [None]:
# Convert the example column to the categorical dtype and preview it.
dow_cat = dow.astype('category')
dow_cat.head()

In [None]:
# Integer category codes behind the first 10 categorical values.
dow_cat.head(10).cat.codes

In [None]:
# Memory footprint of the example column before and after the categorical
# conversion.
print (mem_usage(dow))
print (mem_usage(dow_cat))

In [None]:
# Convert object columns to 'category' when fewer than half of their values
# are unique; columns with mostly distinct values are kept as they are.
#
# The columns are collected in a dict and the frame is built once, so every
# column keeps exactly the dtype it was given (including 'category') rather
# than depending on how `.loc` enlargement of an empty frame coerces it.
converted_columns = {}
num_total_values = len(gl_obj)  # same for every column; computed once

for col in gl_obj.columns:
    num_unique_values = len(gl_obj[col].unique())
    # Same test as unique/total < 0.5, written without a division so a
    # zero-row frame does not raise ZeroDivisionError.
    if num_unique_values < 0.5 * num_total_values:
        converted_columns[col] = gl_obj[col].astype('category')
    else:
        converted_columns[col] = gl_obj[col]

converted_obj = pd.DataFrame(converted_columns)

In [None]:
# Memory footprint of the object columns before and after the conversion.
print(mem_usage(gl_obj))
print(mem_usage(converted_obj))

In [None]:
# Take the date column from the optimized frame and show its first five entries.
date = optimized_gl.date
date[:5]