In [1]:
import pandas as pd
import numpy as np
import shutil
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, \
    mean_squared_log_error, median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import json
import re

import warnings; warnings.simplefilter('ignore')

In [2]:
def train_and_test_model(data, target_variable_name):
    """Builds a model using data to predict the target variable.
    """

    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(target_variable_name, axis=1),
        data[target_variable_name],
        test_size=0.33,
        random_state=42
    )

    # normalizing data first
    scaler_X = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(y_train.values.reshape(-1, 1))
    X_train = scaler_X.transform(X_train)
    y_train = scaler_y.transform(y_train.values.reshape(-1, 1))
    X_test = scaler_X.transform(X_test)
    y_test = scaler_y.transform(y_test.values.reshape(-1, 1))

    forest = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=len(data.columns)-1
    )
    forest.fit(X_train, y_train.ravel())
    yfit = forest.predict(X_test)

    return dict(
        mean_absolute_error=mean_absolute_error(y_test, yfit),
        mean_squared_error=mean_squared_error(y_test, yfit),
        median_absolute_error=median_absolute_error(y_test, yfit),
        r2_score=r2_score(y_test, yfit)
    )

In [3]:
def get_performance_scores(data, target_variable_name, missing_value_imputation):
    """Builds a model using data to predict the target variable,
    returning different performance metrics.
    """

    if missing_value_imputation:
        
        # imputation on data
        fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
        new_data = pd.DataFrame(fill_NaN.fit_transform(data))
        new_data.columns = data.columns
        new_data.index = data.index

        # training and testing model
        return train_and_test_model(new_data, target_variable_name)

    else:
        return train_and_test_model(data, target_variable_name)

In [4]:
def after_better_before(scores_before, scores_after):
    result = dict(
        mean_absolute_error=False,
        mean_squared_error=False,
        median_absolute_error=False,
        r2_score=False
    )
    if scores_before['mean_absolute_error'] > scores_after['mean_absolute_error']:
        result['mean_absolute_error'] = True
    if scores_before['mean_squared_error'] > scores_after['mean_squared_error']:
        result['mean_squared_error'] = True
    if scores_before['median_absolute_error'] > scores_after['median_absolute_error']:
        result['median_absolute_error'] = True
    if scores_before['r2_score'] < scores_after['r2_score']:
        result['r2_score'] = True
        
    return result

## NY Taxi and Vehicle Collision Problem

### Loading Datasets

In [5]:
ny_taxi_vehicle_collision_data = pd.read_csv('data/taxi-vehicle-collision/taxi-vehicle-collision.csv')
ny_taxi_vehicle_collision_data['datetime'] = pd.to_datetime(ny_taxi_vehicle_collision_data['datetime'])
ny_taxi_vehicle_collision_data.index = ny_taxi_vehicle_collision_data['datetime']
ny_taxi_vehicle_collision_data.head()

Unnamed: 0_level_0,d3mIndex,datetime,n. trips,n. collisions
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-01,0,2014-01-01,420810,399
2014-01-02,1,2014-01-02,359958,603
2014-01-03,2,2014-01-03,275470,423
2014-01-04,3,2014-01-04,417499,418
2014-01-05,4,2014-01-05,388542,320


In [6]:
ny_taxi_vehicle_collision_data.drop(
    ['d3mIndex', 'datetime'],
    axis=1,
    inplace=True
)

In [7]:
ny_taxi_vehicle_collision_data.head()

Unnamed: 0_level_0,n. trips,n. collisions
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01,420810,399
2014-01-02,359958,603
2014-01-03,275470,423
2014-01-04,417499,418
2014-01-05,388542,320


In [8]:
weather_data = pd.read_csv('data/taxi-vehicle-collision/weather.csv')
weather_data['time'] = pd.to_datetime(weather_data['time'])
weather_data.index = weather_data['time']
weather_data.head()

Unnamed: 0_level_0,time,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,2010-01-01,0.536364,1.495238,8573.190476
2010-01-02,2010-01-02,0.05,-0.568571,13861.2
2010-01-03,2010-01-03,0.0,-6.626316,15141.763158
2010-01-04,2010-01-04,0.0,-4.45,16069.75
2010-01-05,2010-01-05,0.0,-3.6375,16069.75


In [9]:
weather_data.drop(
    ['time'],
    axis=1,
    inplace=True
)

In [10]:
weather_data.head()

Unnamed: 0_level_0,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,0.536364,1.495238,8573.190476
2010-01-02,0.05,-0.568571,13861.2
2010-01-03,0.0,-6.626316,15141.763158
2010-01-04,0.0,-4.45,16069.75
2010-01-05,0.0,-3.6375,16069.75


### Training and Testing on Query Dataset

In [11]:
scores_query = get_performance_scores(
    ny_taxi_vehicle_collision_data,
    'n. trips',
    False
)

In [12]:
scores_query

{'mean_absolute_error': 0.6700611969965976,
 'mean_squared_error': 0.9331557660408635,
 'median_absolute_error': 0.3961090319930343,
 'r2_score': 0.056218757193401414}

### Training and Testing on Query + Candidate Dataset

In [13]:
# join dataset
join_ = ny_taxi_vehicle_collision_data.join(
    weather_data,
    how='left',
    rsuffix='_r'
)
join_.head()

Unnamed: 0_level_0,n. trips,n. collisions,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-01,420810,399,0.0,-1.515625,16069.75
2014-01-02,359958,603,0.0,-1.134211,15365.052632
2014-01-03,275470,423,0.619512,-8.546512,4990.697674
2014-01-04,417499,418,0.226923,-9.475,16069.75
2014-01-05,388542,320,0.133333,-0.538095,12491.428571


In [14]:
scores_query_candidate = get_performance_scores(
    join_,
    'n. trips',
    True
)

In [15]:
scores_query_candidate

{'mean_absolute_error': 0.6501964940555685,
 'mean_squared_error': 0.7831904606524401,
 'median_absolute_error': 0.46647133829954623,
 'r2_score': 0.20789165838314705}

In [16]:
after_better_before(scores_query, scores_query_candidate)

{'mean_absolute_error': True,
 'mean_squared_error': True,
 'median_absolute_error': False,
 'r2_score': True}

### Saving New Datasets

In [17]:
# ny_taxi_vehicle_collision_data.index.rename('key-for-ranking', inplace=True)
# weather_data.index.rename('key-for-ranking', inplace=True)

ny_taxi_vehicle_collision_data.to_csv('data/taxi-vehicle-collision/taxi-vehicle-collision-v2.csv', index=True)
weather_data.to_csv('data/taxi-vehicle-collision/weather-v2.csv', index=True)
join_.to_csv('data/taxi-vehicle-collision/join.csv', index=True)

### Generating Training Record

In [18]:
training_record_ny_taxi = dict(
    query_dataset=os.path.abspath('data/taxi-vehicle-collision/taxi-vehicle-collision-v2.csv'),
    query_key='datetime',
    target='n. trips',
    candidate_dataset=os.path.abspath('data/taxi-vehicle-collision/weather-v2.csv'),
    candidate_key='time',
    joined_dataset=os.path.abspath('data/taxi-vehicle-collision/join.csv'),
    imputation_strategy='mean',
    mean_absolute_error=[scores_query['mean_absolute_error'], scores_query_candidate['mean_absolute_error']],
    mean_squared_error=[scores_query['mean_squared_error'], scores_query_candidate['mean_squared_error']],
    median_absolute_error=[scores_query['median_absolute_error'], scores_query_candidate['median_absolute_error']],
    r2_score=[scores_query['r2_score'], scores_query_candidate['r2_score']]
)

## NY Taxi Demand Problem

### Loading Datasets

In [19]:
taxi = pd.read_csv('data/ny-taxi-demand/yellow-taxi-2017.csv')
taxi['tpep_pickup_datetime'] = pd.to_datetime(taxi['tpep_pickup_datetime'])
taxi.index = taxi['tpep_pickup_datetime']
taxi.head()

Unnamed: 0_level_0,tpep_pickup_datetime,PULocationID,n. trips
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,4,136
2017-01-01,2017-01-01,7,78
2017-01-01,2017-01-01,12,3
2017-01-01,2017-01-01,13,104
2017-01-01,2017-01-01,14,4


In [20]:
taxi.drop(['tpep_pickup_datetime'], axis=1, inplace=True)

# removing categorical attribute
taxi.drop(['PULocationID'], axis=1, inplace=True)

In [21]:
taxi.head()

Unnamed: 0_level_0,n. trips
tpep_pickup_datetime,Unnamed: 1_level_1
2017-01-01,136
2017-01-01,78
2017-01-01,3
2017-01-01,104
2017-01-01,4


In [22]:
taxi['month_sin'] = np.sin((taxi.index.month-1)*(2.*np.pi/24))
taxi['month_cos'] = np.cos((taxi.index.month-1)*(2.*np.pi/24))
taxi['day_sin'] = np.sin((taxi.index.day-1)*(2.*np.pi/12))
taxi['day_cos'] = np.cos((taxi.index.day-1)*(2.*np.pi/12))
taxi['hour_sin'] = np.sin((taxi.index.hour-1)*(2.*np.pi/12))
taxi['hour_cos'] = np.cos((taxi.index.hour-1)*(2.*np.pi/12))
taxi['weekday_sin'] = np.sin(taxi.index.weekday*(2.*np.pi/7))
taxi['weekday_cos'] = np.cos(taxi.index.weekday*(2.*np.pi/7))

In [23]:
taxi.head()

Unnamed: 0_level_0,n. trips,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-01,136,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349
2017-01-01,78,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349
2017-01-01,3,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349
2017-01-01,104,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349
2017-01-01,4,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349


In [24]:
weather_data = pd.read_csv('data/ny-taxi-demand/weather.csv')
weather_data['time'] = pd.to_datetime(weather_data['time'])
weather_data.index = weather_data['time']
weather_data.head()

Unnamed: 0_level_0,time,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 01:00:00,2010-01-01 01:00:00,0.3,1.066667,11176.666667
2010-01-01 02:00:00,2010-01-01 02:00:00,0.4,0.8,11265.0
2010-01-01 03:00:00,2010-01-01 03:00:00,1.3,0.6,9656.0
2010-01-01 04:00:00,2010-01-01 04:00:00,1.4,0.6,8523.5
2010-01-01 05:00:00,2010-01-01 05:00:00,1.925,0.866667,8047.0


In [25]:
weather_data.drop(
    ['time'],
    axis=1,
    inplace=True
)

In [26]:
weather_data.head()

Unnamed: 0_level_0,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01 01:00:00,0.3,1.066667,11176.666667
2010-01-01 02:00:00,0.4,0.8,11265.0
2010-01-01 03:00:00,1.3,0.6,9656.0
2010-01-01 04:00:00,1.4,0.6,8523.5
2010-01-01 05:00:00,1.925,0.866667,8047.0


### Training and Testing on Query Dataset

In [27]:
scores_query = get_performance_scores(
    taxi,
    'n. trips',
    False
)

In [28]:
scores_query

{'mean_absolute_error': 0.7084014043190949,
 'mean_squared_error': 0.9736173054910823,
 'median_absolute_error': 0.5097699113289245,
 'r2_score': 0.021099349696040326}

### Training and Testing on Query + Candidate Dataset

In [86]:
# join dataset
join_ = taxi.join(
    weather_data,
    how='left',
    rsuffix='_r'
)
join_.index.name = 'tpep_pickup_datetime'
join_.head()

Unnamed: 0_level_0,n. trips,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,weekday_sin,weekday_cos,Amt[PrecipHourly1],Temp[Temp],Visby[Visibility]
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-01,136,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349,0.0,7.2,16093.0
2017-01-01,78,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349,0.0,7.2,16093.0
2017-01-01,3,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349,0.0,7.2,16093.0
2017-01-01,104,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349,0.0,7.2,16093.0
2017-01-01,4,0.0,1.0,0.0,1.0,-0.5,0.866025,-0.781831,0.62349,0.0,7.2,16093.0


In [30]:
scores_query_candidate = get_performance_scores(
    join_,
    'n. trips',
    True
)

In [31]:
scores_query_candidate

{'mean_absolute_error': 0.7002631507115964,
 'mean_squared_error': 0.9580213573867866,
 'median_absolute_error': 0.5331032401335237,
 'r2_score': 0.03677992938099295}

In [32]:
after_better_before(scores_query, scores_query_candidate)

{'mean_absolute_error': True,
 'mean_squared_error': True,
 'median_absolute_error': False,
 'r2_score': True}

### Saving New Datasets

In [87]:
# taxi.index.rename('key-for-ranking', inplace=True)
# weather_data.index.rename('key-for-ranking', inplace=True)

taxi.to_csv('data/ny-taxi-demand/yellow-taxi-2017-v2.csv', index=True)
weather_data.to_csv('data/ny-taxi-demand/weather-v2.csv', index=True)
join_.to_csv('data/ny-taxi-demand/join.csv', index=True)

### Generating Training Record

In [34]:
training_record_ny_taxi_demand = dict(
    query_dataset=os.path.abspath('data/ny-taxi-demand/yellow-taxi-2017-v2.csv'),
    query_key='tpep_pickup_datetime',
    target='n. trips',
    candidate_dataset=os.path.abspath('data/ny-taxi-demand/weather-v2.csv'),
    candidate_key='time',
    joined_dataset=os.path.abspath('data/ny-taxi-demand/join.csv'),
    imputation_strategy='mean',
    mean_absolute_error=[scores_query['mean_absolute_error'], scores_query_candidate['mean_absolute_error']],
    mean_squared_error=[scores_query['mean_squared_error'], scores_query_candidate['mean_squared_error']],
    median_absolute_error=[scores_query['median_absolute_error'], scores_query_candidate['median_absolute_error']],
    r2_score=[scores_query['r2_score'], scores_query_candidate['r2_score']]
)

## FIFA 2018 Problem

### Loading Datasets

In [35]:
man_of_match = pd.read_csv('data/fifa-2018/fifa-2018-man-of-match.csv')
man_of_match.index = man_of_match['GameID']
man_of_match.drop(['d3mIndex', 'GameID'], axis=1, inplace=True)
man_of_match.head()

Unnamed: 0_level_0,Date,Team,Opponent,Ball Possession %,Off-Target,Blocked,Offsides,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Yellow & Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals
GameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
55,23-06-2018,Mexico,Korea Republic,59,6,2,0,5,89,485,97,0,1,26.0,Group Stage,No,0,
40,21-06-2018,Denmark,Australia,49,5,0,1,4,88,458,112,0,1,7.0,Group Stage,No,0,
19,17-06-2018,Mexico,Germany,40,6,2,2,9,82,281,106,0,0,35.0,Group Stage,No,0,
31,19-06-2018,Senegal,Poland,43,4,2,3,3,81,328,107,0,1,60.0,Group Stage,No,0,
98,30-06-2018,Uruguay,Portugal,39,2,1,0,4,69,269,106,0,1,7.0,Round of 16,No,0,


In [36]:
man_of_match.dtypes

Date                       object
Team                       object
Opponent                   object
Ball Possession %           int64
Off-Target                  int64
Blocked                     int64
Offsides                    int64
Saves                       int64
Pass Accuracy %             int64
Passes                      int64
Distance Covered (Kms)      int64
Yellow & Red                int64
Man of the Match            int64
1st Goal                  float64
Round                      object
PSO                        object
Goals in PSO                int64
Own goals                 float64
dtype: object

In [37]:
man_of_match.drop(['Date', 'Team', 'Opponent', 'Round', 'PSO'], axis=1, inplace=True)

In [38]:
man_of_match.dtypes

Ball Possession %           int64
Off-Target                  int64
Blocked                     int64
Offsides                    int64
Saves                       int64
Pass Accuracy %             int64
Passes                      int64
Distance Covered (Kms)      int64
Yellow & Red                int64
Man of the Match            int64
1st Goal                  float64
Goals in PSO                int64
Own goals                 float64
dtype: object

In [39]:
game_stats = pd.read_csv('data/fifa-2018/fifa-2018-game-stats-data.csv')
game_stats.index = game_stats['GameID']
game_stats.drop('GameID', axis=1, inplace=True)
game_stats.head()

Unnamed: 0_level_0,Goal Scored,Attempts,On-Target,Corners,Free Kicks,Fouls Committed,Yellow Card,Red,Own goal Time
GameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,5,13,7,6,11,22,0,0,
1,0,6,0,2,25,10,0,0,
2,0,8,3,0,7,12,2,0,
3,1,14,4,5,13,6,0,0,
4,0,13,3,5,14,22,1,0,90.0


In [40]:
game_stats.dtypes

Goal Scored          int64
Attempts             int64
On-Target            int64
Corners              int64
Free Kicks           int64
Fouls Committed      int64
Yellow Card          int64
Red                  int64
Own goal Time      float64
dtype: object

Unfortunately, this is a classification, and not regression, problem ...

## College Debt Problem

### Loading Datasets

In [41]:
college_debt = pd.read_csv('data/college-debt/college-debt.csv')
college_debt.index = college_debt['UNITID']
college_debt.drop(['d3mIndex', 'UNITID'], axis=1, inplace=True)
college_debt.head()

Unnamed: 0_level_0,INSTNM,PCTFLOAN,CONTROL,STABBR,PCIP16,MD_EARN_WNE_P10,PPTUG_EF,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,SATMTMID,SATVRMID,SATWRMID,UGDS,PREDDEG,DEBT_EARNINGS_RATIO
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
12268508,San Joaquin Valley College-Rancho Cordova,,3,CA,,28300,,,,,,,,,,0,49
207564,Oklahoma State University Institute of Technology,0.475,1,OK,0.0,35300,0.2297,0.2953,0.0291,0.0647,0.0051,,,,2164.0,2,36
420024,Centura College-Chesapeake,0.8125,3,VA,0.0,21900,0.2315,0.2808,0.5665,0.0493,0.0,,,,203.0,2,127
164492,Anna Maria College,0.7465,2,MA,0.0,44800,0.2621,0.6518,0.1258,0.1022,0.0123,,,,1057.0,3,76
234085,Virginia Military Institute,0.4589,1,VA,0.0321,65700,0.0,0.7992,0.0607,0.0584,0.042,575.0,575.0,,1713.0,3,53


In [42]:
college_debt.dtypes

INSTNM                  object
PCTFLOAN               float64
CONTROL                  int64
STABBR                  object
PCIP16                 float64
MD_EARN_WNE_P10         object
PPTUG_EF               float64
UGDS_WHITE             float64
UGDS_BLACK             float64
UGDS_HISP              float64
UGDS_ASIAN             float64
SATMTMID               float64
SATVRMID               float64
SATWRMID               float64
UGDS                   float64
PREDDEG                  int64
DEBT_EARNINGS_RATIO      int64
dtype: object

In [43]:
# dropping strings
college_debt.drop(['INSTNM', 'STABBR', 'MD_EARN_WNE_P10'], axis=1, inplace=True)

# dropping categorical data
college_debt.drop(['CONTROL', 'PREDDEG'], axis=1, inplace=True)

In [44]:
college_debt.dtypes

PCTFLOAN               float64
PCIP16                 float64
PPTUG_EF               float64
UGDS_WHITE             float64
UGDS_BLACK             float64
UGDS_HISP              float64
UGDS_ASIAN             float64
SATMTMID               float64
SATVRMID               float64
SATWRMID               float64
UGDS                   float64
DEBT_EARNINGS_RATIO      int64
dtype: object

In [45]:
scorecard = pd.read_csv('data/college-debt/college-scorecard.csv')
scorecard.index = scorecard['UNITID']
scorecard.drop(['UNITID'], axis=1, inplace=True)
scorecard.head()

Unnamed: 0_level_0,OPEID,OPEID6,INSTNM,STABBR,INSTURL,NPCURL,HCM2,PREDDEG,LOCALE,CONTROL,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GT_25K_P6,GRAD_DEBT_MDN_SUPP,GRAD_DEBT_MDN10YR_SUPP,RPY_3YR_RT_SUPP,C150_L4_POOLED_SUPP,C150_4_POOLED_SUPP,MapLocation
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141936,3072500,30725,World Medicine Institute,HI,http://www.wmi.edu,www.wmi.edu,0,4,21.0,2,...,,,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"Honolulu, HI\n(21.30992, -157.858158)"
108269,3288300,32883,Academy of Chinese Culture and Health Sciences,CA,http://www.acchs.edu,www.acchs.edu/npcalc.htm,0,4,11.0,2,...,,,,,PrivacySuppressed,PrivacySuppressed,,,,"Oakland, CA\n(37.804828, -122.27248)"
130581,2546000,25460,Tri-State College of Acupuncture,NY,http://www.TSCA.edu,www.tsca.edu,0,4,11.0,3,...,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"New York, NY\n(40.713054, -74.007228)"
156222,195300,1953,Asbury Theological Seminary,KY,http://www.asburyseminary.edu,,0,4,31.0,2,...,,,42500,0.746,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"Wilmore, KY\n(37.861905, -84.661723)"
157298,197400,1974,Louisville Presbyterian Theological Seminary,KY,http://www.lpts.edu,www.lpts.edu,0,4,21.0,2,...,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,,"Louisville, KY\n(38.256078, -85.751569)"


In [46]:
# Looking at metadata and removing categorical data
categorical = [
    'MAIN',
    'PREDDEG',
    'HIGHDEG',
    'CONTROL',
    'ST_FIPS',
    'REGION',
    'LOCALE',
    'LOCALE2',
    'CCBASIC',
    'CCUGPROF',
    'CCSIZSET',
    'HBCU',
    'PBI',
    'ANNHI',
    'TRIBAL',
    'AANAPII',
    'HSI',
    'NANTI',
    'MENONLY',
    'WOMENONLY',
    'RELAFFIL',
    'DISTANCEONLY',
    'CURROPER',
    'ICLEVEL',
    'OPENADMP',
    'SCHTYPE',
    'OPEFLAG'
]

for categorical_var in categorical:
    if categorical_var in scorecard.columns:
        scorecard.drop(categorical_var, axis=1, inplace=True)

In [47]:
scorecard.head()

Unnamed: 0_level_0,OPEID,OPEID6,INSTNM,STABBR,INSTURL,NPCURL,HCM2,SATVR25,SATVR75,SATMT25,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GT_25K_P6,GRAD_DEBT_MDN_SUPP,GRAD_DEBT_MDN10YR_SUPP,RPY_3YR_RT_SUPP,C150_L4_POOLED_SUPP,C150_4_POOLED_SUPP,MapLocation
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141936,3072500,30725,World Medicine Institute,HI,http://www.wmi.edu,www.wmi.edu,0,,,,...,,,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"Honolulu, HI\n(21.30992, -157.858158)"
108269,3288300,32883,Academy of Chinese Culture and Health Sciences,CA,http://www.acchs.edu,www.acchs.edu/npcalc.htm,0,,,,...,,,,,PrivacySuppressed,PrivacySuppressed,,,,"Oakland, CA\n(37.804828, -122.27248)"
130581,2546000,25460,Tri-State College of Acupuncture,NY,http://www.TSCA.edu,www.tsca.edu,0,,,,...,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"New York, NY\n(40.713054, -74.007228)"
156222,195300,1953,Asbury Theological Seminary,KY,http://www.asburyseminary.edu,,0,,,,...,,,42500,0.746,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,"Wilmore, KY\n(37.861905, -84.661723)"
157298,197400,1974,Louisville Presbyterian Theological Seminary,KY,http://www.lpts.edu,www.lpts.edu,0,,,,...,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,,,,"Louisville, KY\n(38.256078, -85.751569)"


In [48]:
# dropping repeated columns
for col in set(college_debt.columns).intersection(set(scorecard.columns)):
    scorecard.drop(col, axis=1, inplace=True)

In [49]:
scorecard.dtypes

OPEID                      int64
OPEID6                     int64
INSTNM                    object
STABBR                    object
INSTURL                   object
                           ...  
GRAD_DEBT_MDN10YR_SUPP    object
RPY_3YR_RT_SUPP           object
C150_L4_POOLED_SUPP       object
C150_4_POOLED_SUPP        object
MapLocation               object
Length: 95, dtype: object

In [50]:
scorecard = scorecard.select_dtypes(exclude=['object'])

In [51]:
scorecard.dtypes

OPEID         int64
OPEID6        int64
HCM2          int64
SATVR25     float64
SATVR75     float64
             ...   
RET_FT4     float64
RET_FTL4    float64
RET_PT4     float64
RET_PTL4    float64
UG25ABV     float64
Length: 83, dtype: object

### Training and Testing on Query Dataset

In [52]:
scores_query = get_performance_scores(
    college_debt,
    'DEBT_EARNINGS_RATIO',
    True
)

In [53]:
scores_query

{'mean_absolute_error': 0.5627884527255671,
 'mean_squared_error': 0.6494054642491789,
 'median_absolute_error': 0.3917623992184802,
 'r2_score': 0.39565581099809455}

### Training and Testing on Query + Candidate Dataset

In [54]:
# join dataset
join_ = college_debt.join(
    scorecard,
    how='left',
    rsuffix='_r'
)
join_.head()

Unnamed: 0_level_0,PCTFLOAN,PCIP16,PPTUG_EF,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,SATMTMID,SATVRMID,SATWRMID,...,NPT42_PRIV,NPT43_PRIV,NPT44_PRIV,NPT45_PRIV,PCTPELL,RET_FT4,RET_FTL4,RET_PT4,RET_PTL4,UG25ABV
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12268508,,,,,,,,,,,...,,,,,,,,,,
207564,0.475,0.0,0.2297,0.2953,0.0291,0.0647,0.0051,,,,...,,,,,0.4884,0.2105,,0.6,,0.3101
420024,0.8125,0.0,0.2315,0.2808,0.5665,0.0493,0.0,,,,...,23218.0,,,,0.6566,,0.3333,,0.125,0.6506
164492,0.7465,0.0,0.2621,0.6518,0.1258,0.1022,0.0123,,,,...,24589.0,27220.0,28708.0,29912.0,0.35,0.6612,,0.0,,0.2948
234085,0.4589,0.0321,0.0,0.7992,0.0607,0.0584,0.042,575.0,575.0,,...,,,,,0.1463,0.8902,,,,0.0024


In [55]:
scores_query_candidate = get_performance_scores(
    join_,
    'DEBT_EARNINGS_RATIO',
    True
)

In [56]:
scores_query_candidate

{'mean_absolute_error': 0.39258750178581153,
 'mean_squared_error': 0.3400036491742917,
 'median_absolute_error': 0.27084835909322236,
 'r2_score': 0.6835886962307993}

In [57]:
after_better_before(scores_query, scores_query_candidate)

{'mean_absolute_error': True,
 'mean_squared_error': True,
 'median_absolute_error': True,
 'r2_score': True}

### Saving Datasets

In [58]:
# college_debt.index.rename('key-for-ranking', inplace=True)
# scorecard.index.rename('key-for-ranking', inplace=True)

college_debt.to_csv('data/college-debt/college-debt-v2.csv')
scorecard.to_csv('data/college-debt/college-scorecard-v2.csv')
join_.to_csv('data/college-debt/join.csv')

### Generating Training Record

In [59]:
training_record_college_debt = dict(
    query_dataset=os.path.abspath('data/college-debt/college-debt-v2.csv'),
    query_key='UNITID',
    target='DEBT_EARNINGS_RATIO',
    candidate_dataset=os.path.abspath('data/college-debt/college-scorecard-v2.csv'),
    candidate_key='UNITID',
    joined_dataset=os.path.abspath('data/college-debt/join.csv'),
    imputation_strategy='mean',
    mean_absolute_error=[scores_query['mean_absolute_error'], scores_query_candidate['mean_absolute_error']],
    mean_squared_error=[scores_query['mean_squared_error'], scores_query_candidate['mean_squared_error']],
    median_absolute_error=[scores_query['median_absolute_error'], scores_query_candidate['median_absolute_error']],
    r2_score=[scores_query['r2_score'], scores_query_candidate['r2_score']]
)

## Poverty Estimation Problem

### Loading Datasets

In [60]:
poverty_estimation = pd.read_csv('data/poverty-estimation/poverty-estimation.csv')
poverty_estimation.index = poverty_estimation['FIPS']
poverty_estimation.drop(['d3mIndex', 'FIPS'], axis=1, inplace=True)
poverty_estimation = poverty_estimation.select_dtypes(exclude=['object'])
poverty_estimation.head()

Unnamed: 0_level_0,RUCCode,POVALL_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1
35005,5,13974
13297,1,11385
13137,6,6500
54017,9,1460
55055,4,7618


In [61]:
# removing categorical data
poverty_estimation.drop(['RUCCode'], axis=1, inplace=True)

In [62]:
population_estimate = pd.read_csv('data/poverty-estimation/PopulationEstimates.csv', encoding = 'ISO-8859-1')
population_estimate = population_estimate[pd.notnull(population_estimate['FIPS'])]
population_estimate['FIPS'] = pd.to_numeric(population_estimate['FIPS'], downcast='integer')
population_estimate.index = population_estimate['FIPS']
population_estimate.drop(['FIPS'], axis=1, inplace=True)
population_estimate = population_estimate.select_dtypes(exclude=['object'])
population_estimate.head()

Unnamed: 0_level_0,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,RESIDUAL_2016,RESIDUAL_2017,RESIDUAL_2018,R_birth_2011,R_birth_2012,...,R_DOMESTIC_MIG_2017,R_DOMESTIC_MIG_2018,R_NET_MIG_2011,R_NET_MIG_2012,R_NET_MIG_2013,R_NET_MIG_2014,R_NET_MIG_2015,R_NET_MIG_2016,R_NET_MIG_2017,R_NET_MIG_2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,0.0,0.0,0.0,,,...,,,,,,,,,,
1000,,,,,,-114.0,-106.0,-102.0,12.5,12.3,...,0.4,1.2,0.5,1.2,1.6,0.6,0.6,0.8,1.1,1.9
1001,2.0,2.0,2.0,2.0,0.0,-2.0,-4.0,0.0,11.6,11.1,...,1.1,0.7,6.0,-6.1,-3.9,2.0,-1.9,5.3,1.0,0.6
1003,4.0,3.0,5.0,2.0,5.0,17.0,13.0,27.0,11.8,11.1,...,22.0,24.3,16.3,17.6,22.9,20.2,17.9,21.5,22.5,24.8
1005,6.0,6.0,6.0,6.0,3.0,3.0,2.0,1.0,12.2,11.0,...,-25.5,-9.1,0.3,-6.8,-8.1,-5.1,-15.5,-18.2,-25.0,-8.6


In [63]:
cols_remove = list()
for col in list(population_estimate.columns):
    if '2017' in col or '2018' in col:
        cols_remove.append(col)
population_estimate.drop(cols_remove, axis=1, inplace=True)

In [64]:
population_estimate.head()

Unnamed: 0_level_0,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,RESIDUAL_2016,R_birth_2011,R_birth_2012,R_birth_2013,R_birth_2014,...,R_DOMESTIC_MIG_2013,R_DOMESTIC_MIG_2014,R_DOMESTIC_MIG_2015,R_DOMESTIC_MIG_2016,R_NET_MIG_2011,R_NET_MIG_2012,R_NET_MIG_2013,R_NET_MIG_2014,R_NET_MIG_2015,R_NET_MIG_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,0.0,,,,,...,,,,,,,,,,
1000,,,,,,-114.0,12.5,12.3,12.0,12.2,...,0.5,-0.2,-0.3,-0.4,0.5,1.2,1.6,0.6,0.6,0.8
1001,2.0,2.0,2.0,2.0,0.0,-2.0,11.6,11.1,10.5,11.7,...,-4.1,1.8,-2.0,4.8,6.0,-6.1,-3.9,2.0,-1.9,5.3
1003,4.0,3.0,5.0,2.0,5.0,17.0,11.8,11.1,11.2,11.2,...,21.8,19.7,17.1,20.4,16.3,17.6,22.9,20.2,17.9,21.5
1005,6.0,6.0,6.0,6.0,3.0,3.0,12.2,11.0,10.5,9.8,...,-7.7,-5.3,-16.1,-18.9,0.3,-6.8,-8.1,-5.1,-15.5,-18.2


In [65]:
poverty_estimation = poverty_estimation.join(
    population_estimate,
    how='inner',
    rsuffix='_r'
)

In [66]:
poverty_estimation.head()

Unnamed: 0_level_0,POVALL_2016,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,RESIDUAL_2016,R_birth_2011,R_birth_2012,R_birth_2013,...,R_DOMESTIC_MIG_2013,R_DOMESTIC_MIG_2014,R_DOMESTIC_MIG_2015,R_DOMESTIC_MIG_2016,R_NET_MIG_2011,R_NET_MIG_2012,R_NET_MIG_2013,R_NET_MIG_2014,R_NET_MIG_2015,R_NET_MIG_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35005,13974,5.0,5.0,8.0,8.0,0.0,-1.0,14.5,14.0,14.3,...,-2.1,-7.6,-5.9,-7.6,-4.3,-3.0,-0.6,-7.2,-4.2,-6.8
13297,11385,1.0,1.0,1.0,1.0,0.0,1.0,12.9,12.8,12.2,...,8.2,12.7,6.6,13.5,4.0,-0.1,8.9,13.2,7.2,14.4
13137,6500,6.0,6.0,5.0,5.0,3.0,-2.0,12.0,11.5,12.0,...,-7.1,6.9,3.0,6.0,-3.1,5.7,-6.8,7.2,4.1,7.0
54017,1460,9.0,9.0,8.0,8.0,2.0,2.0,8.0,7.7,8.0,...,31.0,-7.8,29.0,-13.9,7.7,8.5,30.8,-8.0,28.9,-14.0
55055,7618,4.0,4.0,3.0,3.0,3.0,-2.0,11.2,10.6,10.4,...,-0.4,-6.4,-1.2,-2.4,-2.3,2.8,0.2,-5.8,-0.6,-1.8


In [67]:
unemployment = pd.read_csv('data/poverty-estimation/Unemployment.csv', encoding = 'ISO-8859-1')
unemployment = unemployment[pd.notnull(unemployment['FIPS'])]
unemployment['FIPS'] = pd.to_numeric(unemployment['FIPS'], downcast='integer')
unemployment.index = unemployment['FIPS']
unemployment.drop(['FIPS'], axis=1, inplace=True)
unemployment = unemployment.select_dtypes(exclude=['object'])
unemployment.head()

Unnamed: 0_level_0,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Unemployment_rate_2007,Unemployment_rate_2008,Unemployment_rate_2009,Unemployment_rate_2010,Unemployment_rate_2011,Unemployment_rate_2012,Unemployment_rate_2013,Unemployment_rate_2014,Unemployment_rate_2015,Unemployment_rate_2016,Unemployment_rate_2017,Unemployment_rate_2018,Med_HH_Income_Percent_of_State_Total_2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,,,,4.6,5.8,9.3,9.6,9.0,8.1,7.4,6.2,5.3,4.9,4.4,3.9,
1000,,,,4.0,5.7,11.0,10.5,9.6,8.0,7.2,6.8,6.1,5.8,4.4,3.9,100.0
1001,2.0,2.0,1.0,3.3,5.1,9.7,8.9,8.4,6.9,6.2,5.8,5.2,5.1,3.9,3.6,119.0
1003,3.0,2.0,1.0,3.1,4.6,9.8,10.0,9.0,7.5,6.6,6.1,5.5,5.3,4.1,3.6,115.5
1005,6.0,6.0,0.0,6.3,8.8,14.3,12.3,11.5,11.5,10.2,10.5,8.9,8.3,5.8,5.2,68.9


In [68]:
unemployment.shape

(3275, 16)

In [69]:
cols_remove = list()
for col in list(unemployment.columns):
    if '2017' in col or '2018' in col:
        cols_remove.append(col)
unemployment.drop(cols_remove, axis=1, inplace=True)

In [70]:
unemployment.shape

(3275, 13)

In [71]:
unemployment.head()

Unnamed: 0_level_0,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Unemployment_rate_2007,Unemployment_rate_2008,Unemployment_rate_2009,Unemployment_rate_2010,Unemployment_rate_2011,Unemployment_rate_2012,Unemployment_rate_2013,Unemployment_rate_2014,Unemployment_rate_2015,Unemployment_rate_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,,,,4.6,5.8,9.3,9.6,9.0,8.1,7.4,6.2,5.3,4.9
1000,,,,4.0,5.7,11.0,10.5,9.6,8.0,7.2,6.8,6.1,5.8
1001,2.0,2.0,1.0,3.3,5.1,9.7,8.9,8.4,6.9,6.2,5.8,5.2,5.1
1003,3.0,2.0,1.0,3.1,4.6,9.8,10.0,9.0,7.5,6.6,6.1,5.5,5.3
1005,6.0,6.0,0.0,6.3,8.8,14.3,12.3,11.5,11.5,10.2,10.5,8.9,8.3


In [72]:
cols_remove = list()
for col_1 in list(poverty_estimation.columns):
    for col_2 in list(unemployment.columns):
        new_col_1 = '-'.join(re.split(r'[-_\s]', col_1.strip().lower()))
        new_col_2 = '-'.join(re.split(r'[-_\s]', col_2.strip().lower()))
        if new_col_1 == new_col_2:
            cols_remove.append(col_2)
unemployment.drop(cols_remove, axis=1, inplace=True)

In [73]:
unemployment.shape

(3275, 11)

In [74]:
unemployment.head()

Unnamed: 0_level_0,Metro_2013,Unemployment_rate_2007,Unemployment_rate_2008,Unemployment_rate_2009,Unemployment_rate_2010,Unemployment_rate_2011,Unemployment_rate_2012,Unemployment_rate_2013,Unemployment_rate_2014,Unemployment_rate_2015,Unemployment_rate_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,4.6,5.8,9.3,9.6,9.0,8.1,7.4,6.2,5.3,4.9
1000,,4.0,5.7,11.0,10.5,9.6,8.0,7.2,6.8,6.1,5.8
1001,1.0,3.3,5.1,9.7,8.9,8.4,6.9,6.2,5.8,5.2,5.1
1003,1.0,3.1,4.6,9.8,10.0,9.0,7.5,6.6,6.1,5.5,5.3
1005,0.0,6.3,8.8,14.3,12.3,11.5,11.5,10.2,10.5,8.9,8.3


### Training and Testing on Query Dataset

In [75]:
scores_query = get_performance_scores(
    poverty_estimation,
    'POVALL_2016',
    True
)

In [76]:
scores_query

{'mean_absolute_error': 0.22694683606871766,
 'mean_squared_error': 1.5849719521793815,
 'median_absolute_error': 0.046719856122556414,
 'r2_score': 0.40191462025967317}

### Training and Testing on Query + Candidate Datasets

In [77]:
# join dataset
join_ = poverty_estimation.join(
    unemployment,
    how='left',
    rsuffix='_r'
)
join_.head()

Unnamed: 0_level_0,POVALL_2016,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,RESIDUAL_2016,R_birth_2011,R_birth_2012,R_birth_2013,...,Unemployment_rate_2007,Unemployment_rate_2008,Unemployment_rate_2009,Unemployment_rate_2010,Unemployment_rate_2011,Unemployment_rate_2012,Unemployment_rate_2013,Unemployment_rate_2014,Unemployment_rate_2015,Unemployment_rate_2016
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35005,13974,5.0,5.0,8.0,8.0,0.0,-1.0,14.5,14.0,14.3,...,3.9,4.3,7.0,7.5,6.8,6.5,6.3,6.1,6.3,6.7
13297,11385,1.0,1.0,1.0,1.0,0.0,1.0,12.9,12.8,12.2,...,4.6,6.4,10.4,10.6,10.4,9.1,7.8,6.4,5.4,4.9
13137,6500,6.0,6.0,5.0,5.0,3.0,-2.0,12.0,11.5,12.0,...,4.0,5.7,10.0,10.7,10.5,9.5,8.0,6.9,5.7,5.3
54017,1460,9.0,9.0,8.0,8.0,2.0,2.0,8.0,7.7,8.0,...,4.9,4.9,7.8,7.1,7.1,6.3,5.3,5.0,5.5,5.2
55055,7618,4.0,4.0,3.0,3.0,3.0,-2.0,11.2,10.6,10.4,...,4.9,4.9,9.2,8.7,7.6,6.7,6.3,5.0,4.2,3.9


In [78]:
join_.shape

(3136, 54)

In [79]:
scores_query_candidate = get_performance_scores(
    join_,
    'POVALL_2016',
    True
)

In [80]:
scores_query_candidate

{'mean_absolute_error': 0.21736854888387458,
 'mean_squared_error': 1.5548846233468254,
 'median_absolute_error': 0.041761776951117435,
 'r2_score': 0.4132680019175934}

In [81]:
after_better_before(scores_query, scores_query_candidate)

{'mean_absolute_error': True,
 'mean_squared_error': True,
 'median_absolute_error': True,
 'r2_score': True}

### Saving Datasets

In [82]:
# poverty_estimation.index.rename('key-for-ranking', inplace=True)
# unemployment.index.rename('key-for-ranking', inplace=True)

poverty_estimation.to_csv('data/poverty-estimation/poverty-estimation-v2.csv')
unemployment.to_csv('data/poverty-estimation/unemployment-v2.csv')
join_.to_csv('data/poverty-estimation/join.csv')

### Generating Training Record

In [83]:
training_record_poverty_estimation = dict(
    query_dataset=os.path.abspath('data/poverty-estimation/poverty-estimation-v2.csv'),
    query_key='FIPS',
    target='POVALL_2016',
    candidate_dataset=os.path.abspath('data/poverty-estimation/unemployment-v2.csv'),
    candidate_key='FIPS',
    joined_dataset=os.path.abspath('data/poverty-estimation/join.csv'),
    imputation_strategy='mean',
    mean_absolute_error=[scores_query['mean_absolute_error'], scores_query_candidate['mean_absolute_error']],
    mean_squared_error=[scores_query['mean_squared_error'], scores_query_candidate['mean_squared_error']],
    median_absolute_error=[scores_query['median_absolute_error'], scores_query_candidate['median_absolute_error']],
    r2_score=[scores_query['r2_score'], scores_query_candidate['r2_score']]
)

## Generating file with training records

In [84]:
if os.path.exists('data/training-records/'):
    shutil.rmtree('data/training-records/')
os.mkdir('data/training-records/')

In [85]:
training_records = open('data/training-records/training-records', 'w')
training_records.write(json.dumps(training_record_ny_taxi) + "\n")
training_records.write(json.dumps(training_record_ny_taxi_demand) + "\n")
training_records.write(json.dumps(training_record_college_debt) + "\n")
training_records.write(json.dumps(training_record_poverty_estimation) + "\n")
training_records.close()