In [1]:
import pandas as pd
import numpy as np
import gc

In [9]:
# Read the entire training CSV into a single in-memory DataFrame, then show its first rows.
df = pd.read_csv('./data/amex-default-prediction/train_data.csv')
df.head()

In [3]:
# Print the frame summary (row range, column count, dtype counts, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 7.8+ GB


In [10]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is inspected; its min and max decide the smallest integer (int8/int16/int32)
    or float (float16/float32) type it can be stored in. A column is only ever
    converted to a strictly smaller dtype, never widened. Other columns
    (objects, etc.) are left untouched.

    Only the value *range* is checked, so narrowing a float column can round
    its values to the precision of the smaller type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place, column by column.
    verbose : bool, default True
        When True, print memory usage before/after and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage before optimization is: {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype's limit is representable.
            # An all-NaN (or empty) column yields NaN here, fails every comparison,
            # and is therefore left as it is.
            if bounds.min <= c_min and c_max <= bounds.max:
                # Never widen: only cast when the candidate is actually smaller.
                if np.dtype(candidate).itemsize < np.dtype(col_type).itemsize:
                    df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio in case the frame reports zero bytes.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [5]:
%%time
# Downcast the training frame's numeric columns. reduce_mem_usage converts the
# columns of `df` in place and returns that same object, so `df_optimized` and
# `df` refer to one frame.
df_optimized = reduce_mem_usage(df)

Memory usage before optimization is: 8018.31 MB
Memory usage after optimization is: 2125.91 MB
Decreased by 73.5%
CPU times: user 5min 54s, sys: 10min 33s, total: 16min 27s
Wall time: 26min 11s


In [6]:
%%time
# Write the downcast training frame to a Feather file.
df_optimized.to_feather('./data/train_x_resized.feather')

CPU times: user 10.9 s, sys: 2.37 s, total: 13.2 s
Wall time: 3.8 s


In [7]:
# Release the training frame before loading the test set. `df_optimized` is the
# same object as `df` (reduce_mem_usage returns its input), so both names must
# be dropped for the memory to become collectable.
del df, df_optimized
gc.collect()

15

In [2]:
%%time
# Read the entire test CSV into memory (rebinding `df`), then show its first rows.
df = pd.read_csv('./data/amex-default-prediction/test_data.csv')
df.head()

CPU times: user 4min 43s, sys: 2min 36s, total: 7min 19s
Wall time: 10min 29s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631315,0.001912,0.010728,0.814497,0.007547,0.168651,0.009971,0.002347,...,,,,,0.004669,,,,0.008281,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.587042,0.005275,0.011026,0.810848,0.001817,0.241389,0.000166,0.009132,...,,,,0.000142,0.00494,0.009021,,0.003695,0.003753,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.609056,0.003326,0.01639,1.00462,0.000114,0.266976,0.004196,0.004192,...,,,,7.4e-05,0.002114,0.004656,,0.003155,0.002156,0.006482
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614911,0.009065,0.021672,0.816549,0.009722,0.188947,0.004123,0.015325,...,,,,0.004743,0.006392,0.00289,,0.006044,0.005206,0.007855
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591673,0.238794,0.015923,0.810456,0.002026,0.180035,0.000731,0.011281,...,,,,0.008133,0.004329,0.008384,,0.001008,0.007421,0.009471


In [3]:
# Print the test frame summary (row range, column count, dtype counts, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 16.1+ GB


In [None]:
%%time
# Downcast the test frame's numeric columns. The conversion happens in place,
# so `df_optimized` refers to the same frame as `df`.
df_optimized = reduce_mem_usage(df)

Memory usage before optimization is: 16472.74 MB


In [None]:
%%time
# Write the downcast test frame to a Feather file.
df_optimized.to_feather('./data/test_x_resized.feather')