In [11]:
import pandas as pd
import numpy as np
import featuretools as ft

N_PARTITIONS = 1000

In [13]:
PATH = '/home/ubuntu/data/astro'
test = pd.read_csv(f'{PATH}/test_set.csv', nrows = 100)
test.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,13,59798.3205,2,-1.299735,1.357315,0
1,13,59798.3281,1,-2.095392,1.148654,0
2,13,59798.3357,3,-0.923794,1.763655,0
3,13,59798.3466,4,-4.009815,2.602911,0
4,13,59798.3576,5,-3.403503,5.367328,0


In [14]:
test_meta = pd.read_csv(f'{PATH}/test_set_metadata.csv')
test_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3492890 entries, 0 to 3492889
Data columns (total 11 columns):
object_id             int64
ra                    float64
decl                  float64
gal_l                 float64
gal_b                 float64
ddf                   int64
hostgal_specz         float64
hostgal_photoz        float64
hostgal_photoz_err    float64
distmod               float64
mwebv                 float64
dtypes: float64(9), int64(2)
memory usage: 293.1 MB


In [15]:
test_meta['object_id'].nunique()

3492890

In [16]:
import os

# Partitioning Data

The first step is to partition the large data files into a number of smaller files. We'll make 1000 partitions and divide the data evenly into each partition.

```{python}
def make_partitions():
    for i in range(N_PARTITIONS):
        os.makedirs(f'{PATH}/test_partitions/p{i}/', exists_ok = False)

make_partitions()
```

In [None]:
print(len(os.listdir(f'{PATH}/test_partitions')))

In [21]:
test_meta['partition'] = test_meta['object_id'] % 1000

for partition, grouped in test_meta.groupby('partition'):
    grouped.drop(columns = ['partition'], inplace = True)
    grouped.to_csv(f'{PATH}/test_partitions/p{partition}/test_set_metadata.csv', index = False)

In [22]:
!wc -l ~/data/astro/test_set.csv

  453653105   453653105 19793878383 /home/ubuntu/data/astro/test_set.csv


In [24]:
chunksize = 10_000_000

# Read in file iteratively
for i, chunk in enumerate(pd.read_csv(f'{PATH}/test_set.csv', chunksize = chunksize)):
    
    # Create the partition numbers in the chunk
    chunk['partition'] = chunk['object_id'] % 1000
    
    # Iterate through partitions
    for partition, grouped in chunk.groupby('partition'):
        # Drop the partition 
        grouped.drop(columns = ['partition'], inplace = True)
        partition_directory = f'{PATH}/test_partitions/p{partition}/test_set.csv'
        
        # First time must write the headers
        if i == 0:
            with open(partition_directory, 'w') as fout:
                # Save the group to the correct directory
                grouped.to_csv(fout, header = True, index = False)
        
        # Otherwise just append
        elif i > 0:
            with open(partition_directory, 'a') as fout:
                # Save the group to the correct directory
                grouped.to_csv(fout, header = False, index = False)
        
    print(f'{round(100 * (((i + 1) * chunksize) / 453_653_105), 2)}% complete.', end = '\r')

101.39906349808847% complete.

In [25]:
os.listdir('/home/ubuntu/data/astro/test_partitions/p999/')

['test_set_metadata.csv', 'test_set.csv']

In [None]:
test.info()

In [28]:
train = pd.read_csv('%s/training_set.csv' % PATH)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421705 entries, 0 to 1421704
Data columns (total 6 columns):
object_id    1421705 non-null int64
mjd          1421705 non-null float64
passband     1421705 non-null int64
flux         1421705 non-null float64
flux_err     1421705 non-null float64
detected     1421705 non-null int64
dtypes: float64(3), int64(3)
memory usage: 65.1 MB


In [29]:
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [30]:
es = ft.EntitySet('astro')