# Load data, process it, reduce memory usage, and save to pickle

In [1]:
import os
import pickle

import pandas as pd

from process_data import reduce_mem_usage, process_data

# Base data directory (relative path). Later cells read the CSV inputs from
# its 'raw/' subfolder and write the pickled outputs to its 'processed/' subfolder.
data_path = '../data/'

In [2]:
%%time
# Load data

raw_dir = data_path + 'raw/'

# The training rows are reordered by these keys immediately after loading.
train_sort_keys = ['fact_temperature', 'climate', 'fact_cwsm_class',
                   'fact_latitude', 'fact_longitude', 'fact_time']
train = pd.read_csv(raw_dir + 'train.csv').sort_values(train_sort_keys)

# The remaining splits are loaded as-is; the test split lives in 'eval.csv'.
dev_in, dev_out, test = (
    pd.read_csv(raw_dir + csv_name)
    for csv_name in ('dev_in.csv', 'dev_out.csv', 'eval.csv')
)

# Quick shape check of everything that was read.
print('Train:', train.shape)
print('Test:', test.shape, '\n')
print('Dev in:', dev_in.shape)
print('Dev out:', dev_out.shape)

Train: (3129592, 129)
Test: (1137731, 123) 

Dev in: (50000, 129)
Dev out: (50000, 129)
Wall time: 1min 16s


In [3]:
# Concat data

# Tag each frame (in place) with the split it came from, so the combined
# frame can be separated again after processing.
for set_name, frame in zip(('train', 'dev_in', 'dev_out', 'test'),
                           (train, dev_in, dev_out, test)):
    frame['Set'] = set_name

# Original row labels are kept (no ignore_index), exactly as loaded/sorted.
merged_df = pd.concat([train, dev_in, dev_out, test])

# Drop every reference to the per-file frames, loop variables included, so
# their memory can be reclaimed.
del set_name, frame, train, dev_in, dev_out, test

In [4]:
%%time
# Process data

# Pass the combined frame through the project's process_data helper (imported
# from the process_data module) and rebind merged_df to whatever it returns.
# NOTE(review): the helper's body is not visible here. The recorded shapes
# (129 columns at load, 131 after the split with 'Set' dropped) suggest two
# columns are added along the way, presumably by this call — confirm.
merged_df = process_data(merged_df)

Wall time: 25.8 s


In [5]:
# Reduce memory

# Only the columns after the first six are handed to the downcasting helper,
# and the 'Set' split label is left out of that selection as well.
trailing_cols = merged_df.columns[6:]
features = [col for col in trailing_cols if col != 'Set']

# NOTE(review): the recorded output shows SettingWithCopyWarning raised from
# `df[col] = df[col].astype(...)` lines, presumably inside reduce_mem_usage,
# because it receives a column subset of merged_df. The returned frame is
# assigned back here, so the result is still applied; silencing the warning
# with an explicit .copy() would cost an extra transient copy of the subset.
merged_df[features] = reduce_mem_usage(merged_df[features])

Memory usage of dataframe is 4198.32 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int8)


wrf_hail converted to int
Memory usage after optimization is: 2103.33 MB
Decreased by 49.9%


In [6]:
# Split data

def _rows_of(set_name):
    """Return the rows of merged_df whose 'Set' label equals set_name."""
    return merged_df[merged_df['Set'] == set_name]


# Labelled splits keep every column except the bookkeeping 'Set' label.
train = _rows_of('train').drop(columns='Set')
dev_in = _rows_of('dev_in').drop(columns='Set')
dev_out = _rows_of('dev_out').drop(columns='Set')

# The test split additionally loses the first six columns of the train frame.
test_drop_cols = [*train.columns[:6], 'Set']
test = _rows_of('test').drop(columns=test_drop_cols)

print('Train:', train.shape)
print('Test:', test.shape, '\n')
print('Dev in:', dev_in.shape)
print('Dev out:', dev_out.shape)

Train: (3129592, 131)
Test: (1137731, 125) 

Dev in: (50000, 131)
Dev out: (50000, 131)


In [7]:
# Save data

# Make sure the output folder exists before writing: open(..., 'wb') does not
# create missing parent directories, and failing at this point would throw
# away all the loading and processing done by the cells above.
processed_dir = data_path + 'processed/'
os.makedirs(processed_dir, exist_ok=True)

# Write one pickle per split, named '<split>.pcl', in the same order as before.
for file_stem, frame in (('train', train), ('dev_in', dev_in),
                         ('dev_out', dev_out), ('test', test)):
    with open(processed_dir + file_stem + '.pcl', 'wb') as f:
        pickle.dump(frame, f)