In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import pearsonr
import os

In [2]:
# Names of the CSV columns that are selected, in this order, as classifier
# inputs from both the training file and the test files.
FEATURE_VECTOR = [
    # query_* columns
    'query_num_of_columns',
    'query_num_of_rows',
    'query_row_column_ratio',
    'query_max_mean',
    'query_max_outlier_percentage',
    'query_max_skewness',
    'query_max_kurtosis',
    'query_max_unique',
    # candidate_* columns
    'candidate_num_of_columns',
    'candidate_num_rows',
    'candidate_row_column_ratio',
    'candidate_max_mean',
    'candidate_max_outlier_percentage',
    'candidate_max_skewness',
    'candidate_max_kurtosis',
    'candidate_max_unique',
    # query_target_* columns
    'query_target_max_pearson',
    'query_target_max_spearman',
    'query_target_max_covariance',
    'query_target_max_mutual_info',
    # candidate_target_* columns
    'candidate_target_max_pearson',
    'candidate_target_max_spearman',
    'candidate_target_max_covariance',
    'candidate_target_max_mutual_info',
    # columns without a query/candidate prefix
    'max_pearson_difference',
    'containment_fraction',
]

# Name of the CSV column passed as the gain column when training the model.
GAIN_COLUMN_NAME = 'gain_in_r2_score'

In [3]:
def print_df(df):
    """Print ``df`` with pandas' row and column display limits lifted.

    The limits are only lifted for the duration of the call; the global
    pandas display options are restored afterwards.
    """
    unlimited_display = ('display.max_rows', None,
                         'display.max_columns', None)
    with pd.option_context(*unlimited_display):
        print(df)

In [4]:
def create_model(alpha, training_filename, feature_vector, gain_column):
    """Build a random forest classifier from the training data.

    Each training row is labelled 'gain' when its ``gain_column`` value is
    strictly greater than ``alpha`` and 'loss' otherwise (a missing gain
    value compares as not greater and therefore also yields 'loss').

    Args:
        alpha: threshold applied to ``gain_column`` to derive the labels.
        training_filename: path of the CSV file holding the training rows.
        feature_vector: names of the CSV columns used as classifier inputs.
        gain_column: name of the CSV column compared against ``alpha``.

    Returns:
        The fitted ``RandomForestClassifier`` (``random_state=42``).
    """

    training_data = pd.read_csv(training_filename)
    # One vectorized comparison instead of a Python-level pass over
    # iterrows(); the resulting labels are the same.
    training_data['class'] = np.where(
        training_data[gain_column] > alpha, 'gain', 'loss'
    )
    X_train = training_data[feature_vector]
    y_train = training_data['class']
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    return clf

In [5]:
def predict_class_for_features(classifier, test_filename, feature_vector):
    """Classify the rows of a features file as 'gain' or 'loss'.

    Args:
        classifier: fitted classifier exposing ``classes_``, ``predict`` and
            ``predict_proba``.
        test_filename: path of a CSV file that contains the columns listed in
            ``feature_vector`` plus 'candidate', 'containment_fraction' and
            'gain_in_r2_score'.
        feature_vector: names of the CSV columns used as classifier inputs.

    Returns:
        A DataFrame with the columns 'candidate' (base name of the original
        value), 'cf', 'g_r2', 'class', 'p(gain)' and 'p(loss)', holding one
        row per input row that survived the range filter below.
    """

    test_data = pd.read_csv(test_filename)

    # avoiding overflow: keep only the rows whose numeric values all have a
    # magnitude below the float32 maximum. The bound is applied on both sides
    # so that -inf / hugely negative values are rejected as well. NaN compares
    # as False, so rows with a missing numeric value are dropped here too.
    float32_max = np.finfo(np.float32).max
    numeric_data = test_data.select_dtypes([np.number])
    within_range = (numeric_data.abs() < float32_max).all(axis=1)
    # drop=True: the old index must not be re-inserted as an 'index' column
    # (which also fails when the file already has a column with that name).
    test_data = test_data[within_range].reset_index(drop=True)

    # replacing NaN values on a fresh copy rather than in place on a slice
    X_test = test_data[feature_vector].fillna(value=0)

    if X_test.empty:
        # Nothing left to classify: skip the classifier and return an empty
        # result with the usual columns.
        predicted_classes = np.array([], dtype=object)
        predicted_proba = np.empty((0, len(classifier.classes_)))
    else:
        predicted_classes = classifier.predict(X_test)
        predicted_proba = classifier.predict_proba(X_test)

    test_data['class'] = predicted_classes
    test_data['candidate'] = test_data['candidate'].map(os.path.basename)
    test_data = test_data.rename(columns={'containment_fraction': 'cf',
                                          'gain_in_r2_score': 'g_r2'})

    # A classifier that saw a single class exposes a single probability
    # column; the missing class gets probability 0.
    probabilities = pd.DataFrame(
        predicted_proba, columns=list(classifier.classes_)
    ).reindex(columns=['gain', 'loss'], fill_value=0.0)
    test_data['p(gain)'] = probabilities['gain']
    test_data['p(loss)'] = probabilities['loss']

    return test_data[['candidate', 'cf', 'g_r2', 'class', 'p(gain)', 'p(loss)']]

In [6]:
def compute_precision_recall(df):
    """Print the precision and recall of the predicted 'gain' class.

    A row is an actual gain when its 'g_r2' value is strictly positive; every
    other row (zero, negative or missing gain) is an actual loss. A zero-gain
    row predicted as 'gain' therefore counts as a false positive.

    Args:
        df: DataFrame with a numeric 'g_r2' column and a 'class' column
            holding the predicted labels ('gain' / 'loss').

    Prints:
        Precision and recall with four decimals; a metric is reported as
        nan when its denominator is zero.
    """

    actual_gain = df['g_r2'] > 0
    predicted_gain = df['class'] == 'gain'
    predicted_loss = df['class'] == 'loss'

    tp = int((actual_gain & predicted_gain).sum())
    fn = int((actual_gain & predicted_loss).sum())
    # not-a-gain (<= 0 or NaN) predicted as gain
    fp = int((~actual_gain & predicted_gain).sum())

    precision = tp / (tp + fp) if tp + fp > 0 else np.nan
    recall = tp / (tp + fn) if tp + fn > 0 else np.nan

    print('Precision: %.4f\nRecall: %.4f'%(precision, recall))

In [7]:
def compute_pearson(df, col_1, col_2):
    """Return the Pearson correlation coefficient of two columns of ``df``.

    The coefficient is undefined when there are fewer than two rows or when
    either column holds a single distinct value; ``nan`` is returned in those
    cases without calling ``pearsonr``.

    Args:
        df: DataFrame holding both columns.
        col_1: name of the first column.
        col_2: name of the second column.

    Returns:
        The correlation coefficient, or ``nan`` when it is undefined.
    """
    x, y = df[col_1], df[col_2]
    too_short = len(df) < 2
    if too_short or x.nunique(dropna=False) < 2 or y.nunique(dropna=False) < 2:
        return np.nan
    return pearsonr(x, y)[0]

In [8]:
# Train the classifier shared by all problems below. With alpha=0,
# create_model labels a training row 'gain' only when its gain_in_r2_score
# value is strictly positive, and 'loss' otherwise.
rf_classifier = create_model(
    alpha=0,
    training_filename='../data/training-for-model.csv',
    feature_vector=FEATURE_VECTOR,
    gain_column=GAIN_COLUMN_NAME
)



## NY Taxi and Vehicle Collision Problem

In [9]:
# Predict 'gain' / 'loss' (with class probabilities) for the rows of the
# taxi / vehicle-collision features file; the path is relative to the
# notebook's working directory.
taxi_vehicle_collision_classes = predict_class_for_features(
    rf_classifier,
    'taxi-vehicle-collision-records-features',
    FEATURE_VECTOR
)

In [10]:
print_df(taxi_vehicle_collision_classes)

                                             candidate        cf      g_r2  \
0    datamart.socrata.data-cityofchicago-org.ijdy-a...  0.143646  4.029437   
1    datamart.socrata.data-cityofchicago-org.ijdy-a...  0.143646  4.029437   
2    datamart.socrata.data-ny-gov.869e-5zik_Closing...  0.033149  0.187176   
3    datamart.socrata.data-cityofnewyork-us.ucdy-by...  0.011050  0.032369   
4          datamart.socrata.data-ny-gov.dtfv-pchi_Date  0.143646  0.658823   
5    datamart.socrata.data-cityofnewyork-us.5c4s-jw...  0.033149  1.319678   
6    datamart.socrata.data-cityofnewyork-us.wg9x-4k...  0.000000  0.000000   
7    datamart.socrata.data-sfgov-org.4phr-3hrm_Elec...  0.000000  0.000000   
8    datamart.socrata.data-sfgov-org.62ex-d3qk_Rpt_...  0.005525 -6.566033   
9    datamart.socrata.data-sfgov-org.62ex-d3qk_From...  0.016575 -6.326834   
10   datamart.socrata.data-sfgov-org.62ex-d3qk_Thru...  0.016575 -5.345981   
11   datamart.socrata.data-sfgov-org.62ex-d3qk_Loan...  0.000000

In [11]:
compute_precision_recall(taxi_vehicle_collision_classes)

Precision: 0.3333
Recall: 0.2258


In [12]:
# Pearson between containment fraction and gain in R2
compute_pearson(taxi_vehicle_collision_classes, 'cf', 'g_r2')

0.30926206685429014

In [13]:
# Pearson between containment fraction and probability of being classified as gain
compute_pearson(taxi_vehicle_collision_classes, 'cf', 'p(gain)')

-0.07859762230452515

In [14]:
# Pearson between gain in R2 and probability of being classified as gain
compute_pearson(taxi_vehicle_collision_classes, 'g_r2', 'p(gain)')

0.010981608459809515

## College Debt

In [15]:
# Predict 'gain' / 'loss' (with class probabilities) for the rows of the
# college-debt features file; the path is relative to the notebook's
# working directory.
college_debt_classes = predict_class_for_features(
    rf_classifier,
    'college-debt-records-features',
    FEATURE_VECTOR
)

In [16]:
print_df(college_debt_classes)

                                           candidate   cf      g_r2 class  \
0      datamart.socrata.data-wa-gov.wajg-ig9g_UNITID  1.0  0.808117  gain   
1  datamart.upload.a817349748524c618bec5505f46fea...  1.0  0.727736  gain   

   p(gain)  p(loss)  
0      0.8      0.2  
1      0.7      0.3  


In [17]:
compute_precision_recall(college_debt_classes)

Precision: 1.0000
Recall: 1.0000


In [18]:
# Pearson between containment fraction and gain in R2
compute_pearson(college_debt_classes, 'cf', 'g_r2')



nan

In [19]:
# Pearson between containment fraction and probability of being classified as gain
compute_pearson(college_debt_classes, 'cf', 'p(gain)')

nan

In [20]:
# Pearson between gain in R2 and probability of being classified as gain
compute_pearson(college_debt_classes, 'g_r2', 'p(gain)')

1.0

## Poverty Estimation

In [21]:
# Predict 'gain' / 'loss' (with class probabilities) for the rows of the
# poverty-estimation features file; the path is relative to the notebook's
# working directory.
poverty_estimation_classes = predict_class_for_features(
    rf_classifier,
    'poverty-estimation-records-features',
    FEATURE_VECTOR
)

In [22]:
print_df(poverty_estimation_classes)

                                           candidate        cf      g_r2  \
0     datamart.socrata.data-sfgov-org.ua32-eewd_MIPS  0.001276 -0.299140   
1     datamart.socrata.data-sfgov-org.49cy-x5m5_MIPS  0.001276 -0.290706   
2     datamart.socrata.data-sfgov-org.33nh-56zb_MIPS  0.001276 -0.317801   
3  datamart.upload.177bdaeafccf45ffb2d28dd1d05719...  1.000000  0.028248   
4     datamart.socrata.data-sfgov-org.k7mk-w2pq_MIPS  0.001276 -0.310184   
5     datamart.socrata.data-sfgov-org.d7xx-7z6v_MIPS  0.001594 -0.299034   
6     datamart.socrata.data-sfgov-org.g5sr-9nhs_MIPS  0.000638 -0.304692   
7     datamart.socrata.data-sfgov-org.858q-nwrm_MIPS  0.000319 -0.299818   
8     datamart.socrata.data-sfgov-org.7qzr-p6xn_MIPS  0.001276 -0.301699   

  class  p(gain)  p(loss)  
0  loss      0.3      0.7  
1  loss      0.4      0.6  
2  gain      0.6      0.4  
3  gain      0.6      0.4  
4  loss      0.3      0.7  
5  loss      0.3      0.7  
6  loss      0.3      0.7  
7  loss      0.

In [23]:
compute_precision_recall(poverty_estimation_classes)

Precision: 0.5000
Recall: 1.0000


In [24]:
# Pearson between containment fraction and gain in R2
compute_pearson(poverty_estimation_classes, 'cf', 'g_r2')

0.997610183234824

In [25]:
# Pearson between containment fraction and probability of being classified as gain
compute_pearson(poverty_estimation_classes, 'cf', 'p(gain)')

0.6190989311094166

In [26]:
# Pearson between gain in R2 and probability of being classified as gain
compute_pearson(poverty_estimation_classes, 'g_r2', 'p(gain)')

0.5887713733160523