# Preprocess data and extract features.

In [1]:
# Export PYTHONHASHSEED into the kernel's environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this assignment cannot
# change string hashing in the already-running kernel; it is only visible to processes that
# inherit the environment afterwards.
%env PYTHONHASHSEED=13
# Reload imported modules automatically before each execution so edits to local packages are picked up.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# For fully deterministic results between runs, PYTHONHASHSEED must be set in the environment before Jupyter is launched.
# Instructions can be found in HOW-TO-ENVIRONMENT.md.
# See the comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# The magic below only displays the value currently set, as a sanity check.
%env PYTHONHASHSEED

'13'

In [3]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [4]:
# Log the wall-clock time at which this notebook run began.
print(f'Started at {datetime.now()}.')

Started at 2022-12-07 16:18:14.760987.


## Download data
This will download the raw values and preprocessed files, the corresponding labels, and a few other supporting files:
- `download_data()` will download 3.6 GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `features/` directory.

In [5]:
# Fetch the raw data, labels and the embedding files needed for feature extraction.
helpers.download_data()
prepare_feature_extraction()

# download_data() can print "Data was downloaded." even when the remote refuses the
# request, so confirm that the raw files consumed by the extraction cells really exist
# instead of failing later, after the embeddings have been initialised.
required_raw_files = [
    "../data/data/raw/test_values.parquet",
    "../data/data/raw/train_values.parquet",
    "../data/data/raw/val_values.parquet",
]
missing_raw_files = [path for path in required_raw_files if not os.path.exists(path)]
if missing_raw_files:
    raise FileNotFoundError(
        "Raw data files are missing after download_data(): "
        + ", ".join(missing_raw_files)
        + ". Download them manually (see the output above) before continuing."
    )

Downloading the raw data into ../data/data/.
Downloading data directory.
Access denied with the following error:



 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/u/1/uc?id=1rNZC5sAkyAx3w2qODzspgLLOm6Zbr8V3&export=download 



Data was downloaded.
Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.


In [6]:
# Stop early when the trained paragraph-vector weights are not on disk.
paragraph_vectors_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
missing_vectors_message = """
        Trained paragraph vectors do not exist,
        please run the '01-train-paragraph-vector-features' notebook before continuing
        """

if not os.path.exists(paragraph_vectors_path):
    raise SystemExit(missing_vectors_message)

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [7]:
# Toggle for the (slow) memory-usage report.
# NOTE(review): no cell visible in this notebook reads this flag — confirm whether it is still needed.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [8]:
# One timestamp shared by all three output files of this run.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Features will be output to the following files
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = (
    f'../data/data/processed/{split_name}_{timestr}.csv'
    for split_name in ('test', 'train', 'validation')
)

### PREPARATION

In [9]:
# ensure embedding initialisation is outside of timing for extract_features
# NOTE(review): prepare_feature_extraction() is also invoked in the download cell; this second
# call looks redundant when the cells run in order — confirm before removing.
prepare_feature_extraction()
# Load the word embeddings (presumably the GloVe file reported by prepare_feature_extraction — TODO confirm).
initialise_word_embeddings()
# Load the pretrained paragraph-vector (Doc2Vec) model; 400 is the embedding dimension.
initialise_pretrained_model(400)
# Initialise NLTK; its log shows the punkt and stopwords packages being checked/downloaded.
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.092324 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:09.027146 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.246814 seconds.


[nltk_data] Downloading package punkt to /home/ritvikp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ritvikp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Enable the line_profiler extension, which provides the %lprun magic.
# NOTE(review): no %lprun call is visible in this notebook — confirm whether the extension is still needed.
%load_ext line_profiler

In [11]:
# Since Python 3.8 the default multiprocessing start method on macOS is 'spawn', which can cause
# "name not defined" type errors in worker processes. Forcing the 'fork' start method works around
# this (it only needs to be called once).
# https://bugs.python.org/issue39931
# The workaround is currently disabled.
# NOTE(review): `mp` is not imported in this notebook's import cell; enabling the line below
# would also require `import multiprocessing as mp`.
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [13]:
# Load the raw test values, write their features to the timestamped CSV, then drop the
# reference so the raw values can be released before the next split is loaded.
test_values_path = "../data/data/raw/test_values.parquet"
values = load_parquet_values(test_values_path)
extract_features_to_csv(X_test_filename_csv, values)
values = None

Starting ../data/data/processed/test_20221207-161817.csv at 2022-12-07 16:19:29.964290. Rows=67959, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 67959 rows in 0:00:31.665194, key_count=128


In [14]:
# Log when feature extraction for the test set completed.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-07 16:20:01.682396


### TRAIN SET

In [15]:
# Load the raw training values, write their features to the timestamped CSV, then drop the
# reference so the raw values can be released before the next split is loaded.
train_values_path = "../data/data/raw/train_values.parquet"
values = load_parquet_values(train_values_path)
extract_features_to_csv(X_train_filename_csv, values)
values = None

Starting ../data/data/processed/train_20221207-161817.csv at 2022-12-07 16:20:03.082942. Rows=211008, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 211008 rows in 0:01:20.542489, key_count=128


In [16]:
# Log when feature extraction for the training set completed.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-07 16:21:24.674610


### VALIDATION SET

In [17]:
# Load the raw validation values, write their features to the timestamped CSV, then drop
# the reference so the raw values can be released.
validation_values_path = "../data/data/raw/val_values.parquet"
values = load_parquet_values(validation_values_path)
extract_features_to_csv(X_validation_filename_csv, values)
values = None

Starting ../data/data/processed/validation_20221207-161817.csv at 2022-12-07 16:21:25.295464. Rows=66135, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 66135 rows in 0:00:31.213417, key_count=128


In [18]:
# Log when feature extraction for the validation set completed.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-07 16:21:56.536273


### Read Locally Processed Features

In [19]:
# Read the test features written above back in as float32 and report how long the load took.
start = datetime.now()
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)
elapsed = datetime.now() - start
print(f'Load Features (test) process took {elapsed} seconds.')

Load Features (test) process took 0:00:08.664135 seconds.


In [20]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.072711,-0.057171,-0.005628,-0.010939,0.003236,0.037846,0.029798,-0.188146,-0.113799,-0.150237
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.182226,0.073399,-0.208214,0.089828,-0.166301,-0.170056,-0.063574,-0.032031,-0.036662,-0.101705
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.080397,-0.07677,-0.131979,0.264618,-0.035719,-0.098069,0.033647,-0.141033,0.15635,-0.333573
3,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.008227,-0.012152,-0.006132,0.059574,-0.066753,-0.00564,0.050405,-0.048401,-0.009492,-0.054889
4,1.0,0.0,0.05,0.0475,0.0,1.0,0.0,1.0,15.052631,4.129483,...,-0.102535,-0.053089,0.047979,0.102334,-0.19075,-0.029778,-0.042135,-0.103742,0.010977,-0.23722


In [21]:
# Read the training features written above back in as float32 and report how long the load took.
start = datetime.now()
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)
elapsed = datetime.now() - start
print(f'Load Features (train) process took {elapsed} seconds.')

Load Features (train) process took 0:00:26.325385 seconds.


In [22]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.047116,-0.049354,-0.131504,0.069075,-0.074675,0.017592,0.05663,-0.065194,-0.014413,-0.076834
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.343503,-0.251298,-0.264449,0.307831,-0.414797,-0.286773,-0.468284,-0.01454,0.158269,-0.158744
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.034603,0.07785,-0.361814,0.054728,-0.340731,-0.022665,-0.114884,0.190471,-0.009275,-0.257596
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.117727,0.027886,-0.053385,0.03882,-0.114509,-0.094668,0.036082,-0.15294,-0.023013,-0.10146
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.046155,0.013668,-0.002721,0.129323,-0.130178,-0.048886,-0.139746,-0.158365,-0.145093,-0.102929


In [23]:
# Read the validation features written above back in as float32 and report how long the load took.
start = datetime.now()
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)
elapsed = datetime.now() - start
print(f'Load Features (validation) process took {elapsed} seconds.')

Load Features (validation) process took 0:00:08.028494 seconds.


In [24]:
# Preview the first rows of the validation features.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.104543,0.183075,-0.294479,0.165276,-0.4974,0.068864,-0.041753,-0.057303,0.334317,-0.391376
1,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,-0.266734,-0.388268,-0.133944,-0.209854,-0.04382,-0.084358,-0.112332,0.148436,0.22521,-0.373678
2,1.0,0.0,0.857143,0.122449,0.0,1.0,1.0,6.0,2.166667,-2.041241,...,-0.25917,-0.087984,0.019003,0.034542,-0.084229,-0.224373,-0.111631,-0.054898,-0.178127,-0.471892
3,1.0,0.0,0.333333,0.222222,0.0,1.0,0.0,1.0,-1.5,0.707107,...,-0.112254,-0.09892,-0.061606,0.087361,-0.085039,-0.062957,-0.07411,-0.029276,0.048899,-0.144342
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.223919,-0.112982,0.042955,0.106152,-0.139727,-0.088458,-0.051371,0.005091,-0.152462,-0.196212


## Impute NaN values with training-set feature means

In [25]:
start = datetime.now()

# Per-column means of the training features, kept as a one-row DataFrame
# (the imputation cell reads that row back with .iloc[0]).
train_columns_means = X_train.mean().to_frame().T

elapsed = datetime.now() - start
print(f'Transpose process took {elapsed} seconds.')

Transpose process took 0:00:01.076151 seconds.


In [26]:
start = datetime.now()

# Fill missing values in every split, in place, with the training-set column means so that
# validation and test are imputed with statistics derived from the training data only.
train_means = train_columns_means.iloc[0]
for split_features in (X_train, X_validation, X_test):
    split_features.fillna(train_means, inplace=True)

# The one-row means frame is no longer needed.
train_columns_means = None

elapsed = datetime.now() - start
print(f'FillNA process took {elapsed} seconds.')

FillNA process took 0:00:01.037685 seconds.


In [27]:
start = datetime.now()

# Persist the imputed features of each split as snappy-compressed parquet. Unlike the
# timestamped CSVs, these file names are fixed, so every run writes to the same paths.
for split_name, split_features in (
    ('train', X_train),
    ('validation', X_validation),
    ('test', X_test),
):
    split_features.to_parquet(
        f'../data/data/processed/{split_name}.parquet',
        engine='pyarrow',
        compression='snappy',
    )

elapsed = datetime.now() - start
print(f'Save parquet process took {elapsed} seconds.')

Save parquet process took 0:00:16.574153 seconds.


In [28]:
# Log the wall-clock time at which the whole notebook run finished.
print(f'Completed at {datetime.now()}.')

Completed at 2022-12-07 16:22:59.473831.
