In [25]:
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import os

In [26]:
# Column names fed to the classifier, grouped by their name prefix.
# The concatenation order below is the order of the feature matrix columns.
# NOTE(review): the row-count columns are named asymmetrically
# ('query_num_of_rows' vs 'candidate_num_rows') — presumably this mirrors the
# CSV headers; confirm before "fixing" either name.
_QUERY_FEATURES = [
    'query_num_of_columns',
    'query_num_of_rows',
    'query_row_column_ratio',
    'query_max_mean',
    'query_max_outlier_percentage',
    'query_max_skewness',
    'query_max_kurtosis',
    'query_max_unique',
]
_CANDIDATE_FEATURES = [
    'candidate_num_of_columns',
    'candidate_num_rows',
    'candidate_row_column_ratio',
    'candidate_max_mean',
    'candidate_max_outlier_percentage',
    'candidate_max_skewness',
    'candidate_max_kurtosis',
    'candidate_max_unique',
]
_QUERY_TARGET_FEATURES = [
    'query_target_max_pearson',
    'query_target_max_spearman',
    'query_target_max_covariance',
    'query_target_max_mutual_info',
]
_CANDIDATE_TARGET_FEATURES = [
    'candidate_target_max_pearson',
    'candidate_target_max_spearman',
    'candidate_target_max_covariance',
    'candidate_target_max_mutual_info',
]
_REMAINING_FEATURES = [
    'max_pearson_difference',
    'containment_fraction',
]

FEATURE_VECTOR = [
    *_QUERY_FEATURES,
    *_CANDIDATE_FEATURES,
    *_QUERY_TARGET_FEATURES,
    *_CANDIDATE_TARGET_FEATURES,
    *_REMAINING_FEATURES,
]

# Name of the column compared against the threshold when labelling training rows.
GAIN_COLUMN_NAME = 'gain_in_r2_score'

In [27]:
def print_df(df):
    """Print ``df`` with pandas' row and column display limits lifted.

    The two display options are overridden only for the duration of the
    call; ``pd.option_context`` restores the previous values on exit.
    """
    show_everything = pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
    )
    with show_everything:
        print(df)

In [47]:
def create_model(alpha, training_filename, feature_vector, gain_column):
    """Build a random-forest classifier from labelled training data.

    Every training row is labelled ``'gain'`` when its ``gain_column`` value
    is strictly greater than ``alpha`` and ``'loss'`` otherwise (a missing
    gain value compares as not-greater and is therefore labelled ``'loss'``).

    Parameters
    ----------
    alpha : number
        Threshold on ``gain_column`` separating ``'gain'`` from ``'loss'``.
    training_filename : str or path-like
        CSV file readable by ``pd.read_csv``; it must contain every column in
        ``feature_vector`` as well as ``gain_column``.
    feature_vector : list of str
        Names of the columns used as model inputs.
    gain_column : str
        Name of the column compared against ``alpha``.

    Returns
    -------
    RandomForestClassifier
        Classifier fitted on the training file, seeded with
        ``random_state=42`` so repeated runs build the same model.
    """
    training_data = pd.read_csv(training_filename)

    # Label with a single vectorised comparison instead of a Python-level
    # iterrows() loop, which materialises a Series for every row.
    is_gain = training_data[gain_column] > alpha
    y_train = is_gain.map({True: 'gain', False: 'loss'})
    X_train = training_data[feature_vector]

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    return clf

In [56]:
def predict_class_for_features(classifier, test_filename, feature_vector):
    """Predict a class for every row of a feature file and summarise the result.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict(X)``; it is called once with the
        ``feature_vector`` columns of the file.
    test_filename : str or path-like
        CSV file readable by ``pd.read_csv``. Besides the columns in
        ``feature_vector`` it must contain ``candidate``,
        ``containment_fraction`` and ``gain_in_r2_score``.
    feature_vector : list of str
        Names of the columns passed to the classifier.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``candidate`` (reduced to its
        final path component), ``cf`` (the file's ``containment_fraction``),
        ``g_r2`` (the file's ``gain_in_r2_score``) and ``class`` (the
        classifier's prediction).
    """
    # Read the file once; the same frame supplies both the model inputs and
    # the columns of the summary.
    test_data = pd.read_csv(test_filename)
    predicted_classes = classifier.predict(test_data[feature_vector])

    # rename() returns a new frame, so the frame read from disk is not mutated.
    summary = test_data.rename(
        columns={'containment_fraction': 'cf', 'gain_in_r2_score': 'g_r2'}
    )
    summary['class'] = predicted_classes
    summary['candidate'] = summary['candidate'].map(os.path.basename)

    return summary[['candidate', 'cf', 'g_r2', 'class']]

In [48]:
# Train the classifier shared by every problem section of this notebook.
# alpha=0 is the labelling threshold: training rows whose gain column is
# strictly greater than 0 are labelled 'gain', all others 'loss'.
rf_classifier = create_model(
    alpha=0,
    training_filename='../data/training-for-model.csv',
    feature_vector=FEATURE_VECTOR,
    gain_column=GAIN_COLUMN_NAME
)



## NY Taxi and Vehicle Collision Problem

In [21]:
# Predict gain/loss for every candidate row of the taxi / vehicle-collision
# feature file.
# NOTE(review): the path has no directory or extension and is resolved against
# the current working directory; it is read with pd.read_csv — confirm the
# file is where a fresh kernel expects it.
taxi_vehicle_collision_classes = predict_class_for_features(
    classifier=rf_classifier,
    test_filename='taxi-vehicle-collision-records-features',
    feature_vector=FEATURE_VECTOR,
)

In [22]:
# Print the complete prediction table (no row/column truncation).
# NOTE(review): the output saved under this cell shows a
# `dataset_ranking_class` column with `good_gain` labels, which the current
# predict_class_for_features (columns candidate / cf / g_r2 / class) does not
# produce. This cell and the one before it carry execution counts 21/22,
# earlier than the function definition (56), so the saved output looks stale —
# re-run with Restart Kernel -> Run All to refresh it.
print_df(taxi_vehicle_collision_classes)

                                          candidate dataset_ranking_class
0            datamart.socrata.data-wa-gov.wajg-ig9g             good_gain
1  datamart.upload.a817349748524c618bec5505f46feaef             good_gain
2            datamart.socrata.data-wa-gov.wajg-ig9g             good_gain


## College Debt

In [57]:
# Predict gain/loss for every candidate row of the college-debt feature file.
# NOTE(review): the path has no directory or extension and is resolved against
# the current working directory; it is read with pd.read_csv — confirm the
# file is where a fresh kernel expects it.
college_debt_classes = predict_class_for_features(
    classifier=rf_classifier,
    test_filename='college-debt-records-features',
    feature_vector=FEATURE_VECTOR,
)

In [58]:
# Print the complete prediction table (no row/column truncation):
# cf and g_r2 come from the feature file, class is the classifier's prediction.
print_df(college_debt_classes)

                                          candidate        cf      g_r2 class
0            datamart.socrata.data-wa-gov.wajg-ig9g  1.000000  0.808117  gain
1  datamart.upload.a817349748524c618bec5505f46feaef  1.000000  0.727736  gain
2            datamart.socrata.data-wa-gov.wajg-ig9g  0.386005  1.207520  gain


## Poverty Estimation

In [59]:
# Predict gain/loss for every candidate row of the poverty-estimation feature
# file.
# NOTE(review): the path has no directory or extension and is resolved against
# the current working directory; it is read with pd.read_csv — confirm the
# file is where a fresh kernel expects it.
poverty_estimation_classes = predict_class_for_features(
    classifier=rf_classifier,
    test_filename='poverty-estimation-records-features',
    feature_vector=FEATURE_VECTOR,
)

In [60]:
# Print the complete prediction table (no row/column truncation):
# cf and g_r2 come from the feature file, class is the classifier's prediction.
# NOTE(review): in the saved output, row 2 has a negative g_r2 yet is predicted
# 'gain'; with the model trained at alpha=0 this looks like a false positive —
# worth calling out in the narrative.
print_df(poverty_estimation_classes)

                                           candidate        cf      g_r2 class
0          datamart.socrata.data-sfgov-org.ua32-eewd  0.001276 -0.299140  loss
1          datamart.socrata.data-sfgov-org.49cy-x5m5  0.001276 -0.290706  loss
2          datamart.socrata.data-sfgov-org.33nh-56zb  0.001276 -0.317801  gain
3   datamart.upload.177bdaeafccf45ffb2d28dd1d057197c  1.000000  0.028248  gain
4          datamart.socrata.data-sfgov-org.k7mk-w2pq  0.001276 -0.310184  loss
5          datamart.socrata.data-sfgov-org.d7xx-7z6v  0.001594 -0.299034  loss
6          datamart.socrata.data-sfgov-org.g5sr-9nhs  0.000638 -0.304692  loss
7          datamart.socrata.data-sfgov-org.858q-nwrm  0.000319 -0.299818  loss
8          datamart.socrata.data-sfgov-org.7qzr-p6xn  0.001276 -0.301699  loss
9             datamart.socrata.data-ny-gov.43kr-jb2c  0.015625  1.389624  gain
10            datamart.socrata.data-wa-gov.t94r-s3m2  0.012117  0.052730  gain
