### In this notebook, we analyze whether the use of our method helps speed up feature selection as a preprocessing, pruning step. Naturally, it only makes sense to do so if the "pre-pruned" results are compatible in quality with what we would get with the pruners.

### Let's assume for now that the dataset in question involves using the number of taxi trips to predict the number  of accidents in traffic.

In [114]:
import pandas as pd
import json
import os
import warnings; warnings.simplefilter('ignore')
from sklearn.impute import SimpleImputer
import numpy as np

SEPARATOR = '|'

In [45]:
taxi_trips = pd.read_csv('nyc_indicators/taxi_count.csv', sep=SEPARATOR)
taxi_trips = taxi_trips.rename(columns={'count': 'taxi_count'})
traffic_accidents = pd.read_csv('nyc_indicators/crash_count.csv', sep=SEPARATOR)
traffic_accidents = traffic_accidents.rename(columns={'count': 'crash_count'})
taxi_crash = pd.merge(taxi_trips, traffic_accidents, on='time', how='inner')

In [92]:
def join_datasets(base_dataset, dataset_directory, key, mean_data_imputation=True):
    '''
    Given (1) a base dataset, (2) a directory with datasets that only have two 
    columns (one key and one numerical attribute), and (3) a key that is present 
    in all of them and helps for joining purposes, this function generates a big
    table composed of all joined datasets.
    '''
    
    augmented_dataset = base_dataset
    dataset_names = [f for f in os.listdir(dataset_directory) if '.csv' in f]
    for name in dataset_names:
        try:
            ### Step 1: read the dataset in the directory
            dataset = pd.read_csv(os.path.join(dataset_directory, name), 
                                  sep=SEPARATOR)
            
            ### Step 2:  rename the numerical column in the dataset
            numerical_column = [i for i in dataset.columns if i != key][0]
            dataset = dataset.rename(columns={numerical_column: name.split('.')[0]})
    
            ### Step 3: augment the table
            augmented_dataset = pd.merge(augmented_dataset, 
                                         dataset,
                                         how='left',
                                         on=key)
        except pd.errors.EmptyDataError:
            continue
    
    augmented_dataset = augmented_dataset.set_index(key)

    if mean_data_imputation:
        fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
        new_data = pd.DataFrame(fill_NaN.fit_transform(augmented_dataset))
        new_data.columns = augmented_dataset.columns
        new_data.index = augmented_dataset.index
        return new_data
    
    return augmented_dataset

In [95]:
augmented_dataset = join_datasets(taxi_crash, 'nyc_indicators/', 'time')

In [96]:
augmented_dataset.head()

Unnamed: 0_level_0,taxi_count,crash_count,311_category_Agency_Issues_added_zeros,taxispeed_speed_avg,311_category_SCRIE_added_zeros,cyclist_killed_sum,311_category_electric_added_zeros,311_category_Illegal_parking_added_zeros,311_category_Vacant_Lot_added_zeros,311_category_consumer_complaint_added_zeros,...,311_category_Noise_added_zeros,311_category_Literature_request_added_zeros,311_category_taxi_added_zeros,311_category_collection_added_zeros,311_category_homeless_person_assistance_added_zeros,311_category_Traffic_added_zeros,311_category_Damaged_Tree_added_zeros,motorist_killed_sum,311_category_Enforcement_added_zeros,311_category_graffiti_added_zeros
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-07-01,354746.0,443.0,0.0,17.902601,1.0,0.0,89.0,72.0,5.0,25.0,...,934.0,9.0,25.0,70.0,26.341051,61.0,32.0,0.0,13.0,6.0
2012-07-02,412337.0,475.0,0.0,16.507096,74.0,0.0,177.0,93.0,15.0,72.0,...,453.0,112.0,40.0,24.0,26.341051,88.0,80.0,1.0,123.0,169.0
2012-07-03,495375.0,577.0,0.0,15.954152,63.0,0.0,140.0,102.0,14.0,57.0,...,487.0,85.0,28.0,48.0,26.341051,121.0,73.0,1.0,58.0,38.0
2012-07-04,345717.0,353.0,0.0,17.80881,4.0,0.0,61.0,51.0,6.0,18.0,...,882.0,10.0,31.0,19.0,26.341051,70.0,25.0,0.0,8.0,5.0
2012-07-05,417036.0,517.0,0.0,16.603352,52.0,0.0,201.0,108.0,10.0,66.0,...,617.0,84.0,46.0,2.0,26.341051,100.0,60.0,0.0,71.0,96.0


### Now let's see which of these features would be classified by our approach as bad for augmentation. To this end, let's build our model over openml-training instances with $\theta = 1$, svm-rbf, and one candidate per query in the training examples. NOTE THAT THESE TRAINING INSTANCES REFER TO REGRESSION PROBLEMS.

### For now we are going to use the following class policy: if the gain in R2-score is predicted as "above zero", we consider that the feature should not be pruned. Otherwise it should.

In [169]:
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

FEATURES = ['query_num_of_columns', 'query_num_of_rows', 'query_row_column_ratio',
            'query_max_skewness', 'query_max_kurtosis', 'query_max_unique', 
            'candidate_num_rows', 'candidate_max_skewness', 'candidate_max_kurtosis',
            'candidate_max_unique', 'query_target_max_pearson', 
            'query_target_max_spearman', 'query_target_max_covariance', 
            'query_target_max_mutual_info', 'candidate_target_max_pearson', 
            'candidate_target_max_spearman', 'candidate_target_max_covariance', 
            'candidate_target_max_mutual_info']
THETA = 1

def train_rbf_svm(features, classes):
    '''
    Builds a model using features to predict associated classes,
    '''

    feature_scaler = MinMaxScaler().fit(features)
    features_train = feature_scaler.transform(features)
    clf = SVC(max_iter=1000, gamma='auto')
    clf.fit(features_train, classes)

    return feature_scaler, clf

In [170]:
openml_training = pd.read_csv('../classification/training-simplified-data-generation.csv')
openml_training['class_pos_neg'] = ['gain' if row['gain_in_r2_score'] > 0 else 'loss' 
                                    for index, row in openml_training.iterrows()]
openml_training_high_containment = openml_training.loc[openml_training['containment_fraction'] >= THETA]
openml_training_high_containment.shape

(7566, 36)

In [171]:
feature_scaler, model = train_rbf_svm(openml_training_high_containment[FEATURES], 
                                      openml_training_high_containment['class_pos_neg'])

### Now, for each single-feature dataset in nyc_indicators/, we will join it with the base table, generate all features and classify it.

In [103]:
# The next two lines are important for importing files that are in the parent directory, 
# necessary to generate the features
import sys
sys.path.append('../')
from feature_factory import *

In [142]:
def compute_features(query_dataset, candidate_dataset, key, target_name, mean_data_imputation=True):
    '''
    This function generates all the features required to determine, through classification, 
    whether an augmentation with the candidate_dataset (which is single-feature) is likely to 
    hamper the model (or simply bring no gain)
    '''
    
    # Step 1: individual query features
    feature_factory_query = FeatureFactory(query_dataset.drop([target_name], axis=1))
    query_dataset_individual_features = feature_factory_query.get_individual_features(func=max_in_modulus)
    ## In order, the returned features are number_of_columns, number_of_rows, row_to_column_ratio,
    ## max_mean, max_outlier_percentage, max_skewness, max_kurtosis, max_number_of_unique_values.
    ## For now, we're only using number_of_columns, number_of_rows, row_to_column_ratio, 
    ## max_skewness, max_kurtosis, max_number_of_unique_values, so we remove the unnecessary elements 
    ## in the lines below
    query_dataset_individual_features = [query_dataset_individual_features[index] for index in [0, 1, 2, 5, 6, 7]]
    
    # Step 2: individual candidate features
    feature_factory_candidate = FeatureFactory(candidate_dataset)
    candidate_dataset_individual_features = feature_factory_candidate.get_individual_features(func=max_in_modulus)
    ## For now, we're only using number_of_rows, max_skewness, max_kurtosis, max_number_of_unique_values, 
    ## so we remove the unnecessary elements in the lines below 
    candidate_dataset_individual_features = [candidate_dataset_individual_features[index] for index in [1, 5, 6, 7]]
    
    # Step 3: join the datasets and compute pairwise features
    augmented_dataset = pd.merge(query_dataset, 
                                 candidate_dataset,
                                 how='left',
                                 on=key)
    #augmented_dataset = augmented_dataset.set_index(key)
    if mean_data_imputation:
        fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
        new_dataset = pd.DataFrame(fill_NaN.fit_transform(augmented_dataset))
        new_dataset.columns = augmented_dataset.columns
        new_dataset.index = augmented_dataset.index
        augmented_dataset = new_dataset
    
    # Step 3.1: get query-target features 
    ## The features are, in order: max_query_target_pearson, max_query_target_spearman, 
    ## max_query_target_covariance, max_query_target_mutual_info
    feature_factory_full_query = FeatureFactory(query_dataset)
    query_features_target = feature_factory_full_query.get_pairwise_features_with_target(target_name,
                                                                                         func=max_in_modulus)
    # Step 3.2: get candidate-target features
    ## The features are, in order: max_query_candidate_pearson, max_query_candidate_spearman, 
    ## max_query_candidate_covariance, max_query_candidate_mutual_info
    column_names = candidate_dataset.columns.tolist() + [target_name]
    feature_factory_candidate_target = FeatureFactory(augmented_dataset[column_names])
    candidate_features_target = feature_factory_candidate_target.get_pairwise_features_with_target(target_name,
                                                                                                   func=max_in_modulus)
    return np.array(query_dataset_individual_features + 
                    candidate_dataset_individual_features + 
                    query_features_target + 
                    candidate_features_target)

candidate_dataset = pd.read_csv('nyc_indicators/citibike_count.csv', sep=SEPARATOR)
candidate_dataset = candidate_dataset.rename(columns={'count':'citibike_count'})
features = compute_features(taxi_crash.set_index('time'), 
                            candidate_dataset.set_index('time'), 
                            'time',
                            'crash_count')

In [153]:
print(features)

[1.00000000e+00 1.46200000e+03 1.46200000e+03 3.86883428e-01
 1.47702153e+00 1.45600000e+03 1.63300000e+03 3.02000052e-01
 5.28414244e-01 1.60300000e+03 1.60072219e-01 1.24021640e-01
 9.50707731e+05 8.68223838e-01 2.62378575e-01 2.87505489e-01
 2.27171493e+05 7.52732960e-01]


### Let's quickly sanity-check these features.

In [152]:
print('query number of rows', taxi_crash.shape[0], 
      'query skewness', taxi_crash['taxi_count'].skew(), 
      'query kurtosis', taxi_crash['taxi_count'].kurtosis(),
      'candidate number of unique values', len(set(taxi_crash['taxi_count'])))

print('candidate number of rows', candidate_dataset.shape[0],
      'candidate skewness', candidate_dataset['citibike_count'].skew(), 
      'candidate kurtosis', candidate_dataset['citibike_count'].kurtosis(),
      'candidate number of unique values', len(set(candidate_dataset['citibike_count'])))

augmented_dataset = pd.merge(taxi_crash,
                             candidate_dataset,
                             how='left',
                             on='time').set_index('time')
fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
new_dataset = pd.DataFrame(fill_NaN.fit_transform(augmented_dataset))
new_dataset.columns = augmented_dataset.columns
new_dataset.index = augmented_dataset.index
augmented_dataset = new_dataset

from scipy.stats import pearsonr, spearmanr
from scipy import cov
from sklearn.metrics.cluster import normalized_mutual_info_score

print('query target pearson', pearsonr(augmented_dataset['taxi_count'], augmented_dataset['crash_count']),
      'query target spearman', spearmanr(augmented_dataset['taxi_count'], augmented_dataset['crash_count']),
      'query target covariance', cov(augmented_dataset['taxi_count'], augmented_dataset['crash_count'])[0, 1],
      'query target mutual info', normalized_mutual_info_score(augmented_dataset['taxi_count'], augmented_dataset['crash_count']))

print('candidate target pearson', pearsonr(augmented_dataset['citibike_count'], augmented_dataset['crash_count']),
      'candidate target spearman', spearmanr(augmented_dataset['citibike_count'], augmented_dataset['crash_count']),
      'candidate target covariance', cov(augmented_dataset['citibike_count'], augmented_dataset['crash_count'])[0, 1],
      'candidate target mutual info', normalized_mutual_info_score(augmented_dataset['citibike_count'], augmented_dataset['crash_count']))


query number of rows 1462 query skewness -0.38688342830440225 query kurtosis 1.4770215280107495 candidate number of unique values 1456
candidate number of rows 1633 candidate skewness 0.30200005209988534 candidate kurtosis -0.5284142437311292 candidate number of unique values 1603
query target pearson (0.160072218826492, 7.506211089340906e-10) query target spearman SpearmanrResult(correlation=0.12402163995232865, pvalue=1.970536577824118e-06) query target covariance 950707.7308413646 query target mutual info 0.8682238377526429
candidate target pearson (0.2623785747241163, 1.9218632466061835e-24) candidate target spearman SpearmanrResult(correlation=0.2875054890817096, pvalue=3.2000264303199555e-29) candidate target covariance 227171.49311560777 candidate target mutual info 0.7527329601416918


### Ok! So let's see what label is predicted once we present these features to our model

In [174]:
def normalize_features(features, scaler=None):
    '''
    This function normalizes features using sklearn's StandardScaler
    '''
    if not scaler:
        scaler = MinMaxScaler().fit(features)
    return scaler.transform(features)

print(model.predict(normalize_features([features], feature_scaler)))

['loss']


### Now let's create predictions for all candidates

In [177]:
candidate_label = {}
candidate_names = [f for f in os.listdir('nyc_indicators/') if '.csv' in f]
feature_vectors = []
for name in candidate_names:
        candidate_dataset = pd.read_csv(os.path.join('nyc_indicators/', name),
                                        sep=SEPARATOR)
        numerical_column = [i for i in candidate_dataset.columns if i != 'time'][0]
        candidate_dataset = candidate_dataset.rename(columns={numerical_column: name.split('.')[0]})
        features = compute_features(taxi_crash.set_index('time'), 
                                    candidate_dataset.set_index('time'), 
                                    'time',
                                    'crash_count')
        feature_vectors.append(features)

In [178]:
predictions = model.predict(normalize_features(np.array(feature_vectors)))

In [180]:
for name, pred in zip(candidate_names, predictions):
    candidate_label[name] = pred
candidate_label

{'311_category_Agency_Issues_added_zeros.csv': 'gain',
 'taxispeed_speed_avg.csv': 'gain',
 '311_category_SCRIE_added_zeros.csv': 'gain',
 'cyclist_killed_sum.csv': 'gain',
 '311_category_electric_added_zeros.csv': 'gain',
 '311_category_Illegal_parking_added_zeros.csv': 'gain',
 '311_category_Vacant_Lot_added_zeros.csv': 'gain',
 '311_category_consumer_complaint_added_zeros.csv': 'gain',
 'weather_temperature_mean.csv': 'gain',
 '311_category_Litter_basket_added_zeros.csv': 'gain',
 '311_category_dof_added_zeros.csv': 'gain',
 'pedestrians_killed_sum.csv': 'loss',
 '311_category_Construction_added_zeros.csv': 'loss',
 '311_category_DOH_New_License_Application_Request_added_zeros.csv': 'gain',
 '311_category_Sidewalk_Condition_added_zeros.csv': 'gain',
 'persons_killed_sum.csv': 'loss',
 '311_category_Food_Establishment_added_zeros.csv': 'gain',
 '311_category_Drinking_added_zeros.csv': 'loss',
 '311_category_unsanitary_added_zeros.csv': 'gain',
 '311_category_rodent_added_zeros.csv': 