# Table of Contents
* [Imports, file creation, list creations](#Imports,-file-creation,-list-creations)
* [Features for ML Model Creation](#Features-for-ML-Model-Creation)
* [Dataframe for ML Model Creation](#Dataframe-for-ML-Model-Creation)
* [Intersection Proximity](#Intersection-Proximity)
* [CV Analysis](#CV-Analysis)
* [Population Density](#Population-Density)
* [Zone type](#Zone-type)
* [Classification](#Classification)
* [Analysis of min labels vs. precision](#Analysis-of-min-labels-vs.-precision)

 # Imports, file creation, list creations


In [4]:
import csv
import json
import multiprocessing as mp
import sys
import time
import warnings
from datetime import datetime, timezone

import intersection_proximity
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parse as str_parse
import requests
import scipy
import seaborn as sns
import sklearn.feature_selection
from dateutil import parser as parser
from imblearn.ensemble import (BalancedBaggingClassifier,
                               BalancedRandomForestClassifier,
                               EasyEnsembleClassifier)
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from IPython import get_ipython
from sklearn import linear_model, svm, tree
from sklearn.base import BaseEstimator
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              BaggingRegressor, ExtraTreesClassifier,
                              ExtraTreesRegressor, RandomForestClassifier,
                              RandomForestRegressor, VotingClassifier)
from sklearn.feature_selection import RFECV, SelectFromModel, VarianceThreshold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             r2_score, recall_score)
# from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from region_stats import RegionStats


Using TensorFlow backend.


In [0]:
# Loads the per-user summary tables and the per-label validation results
# that the rest of the notebook works from.
users_old = pd.read_csv('ml-users.csv')
users = pd.read_csv('users_one_mission.csv')
# users = pd.read_csv('cv-accuracy.csv')
labels = pd.read_csv('ml-label-correctness-one-mission.csv')
# One group of label rows per user.
grouped_labels = labels.groupby('user_id')



In [0]:
# Keep only the users that meet the minimum-validation criterion:
# rows with fewer than 16 validated labels are removed.
below_threshold = users['labels_validated'] < 16
# Positions of the rejected rows, kept for inspection.
to_remove = [position for position, rejected in enumerate(below_threshold) if rejected]
users = users.loc[~below_threshold].reset_index(drop=True)



In [0]:
# Tags that can be attached to a label, grouped by label type.
curb_ramp_tags = [
    'narrow',
    'points into traffic',
    'missing friction strip',
    'steep',
    'not enough landing space',
]
obstacle_tags = [
    'fire hydrant',
    'pole',
    'tree',
    'vegetation',
    'trash/recycling can',
    'parked car',
    'parked bike',
]
missing_curb_ramp_tags = [
    'alternate route present',
    'no alternate route',
    'unclear if needed',
]
surface_problem_tags = [
    'bumpy',
    'uneven',
    'cracks',
    'grass',
    'narrow sidewalk',
]
no_sidewalk_tags = [
    'ends abruptly',
    'street has a sidewalk',
    'street has no sidewalks',
]
other_tags = [
    'missing crosswalk',
    'no bus stop access',
]


 Run the next two cells just once (warning: will take a super long time)

In [0]:
# Appends `row` to the CSV named after `user_info`, creating that CSV (with
# the module-level `header` as its first line) the first time the key is seen.
def file_appender(row, created, user_info):
    """Append one interaction row to ``<user_info>_new.csv``.

    row: the CSV row (list of fields) to write.
    created: list of keys whose file has already been started; a new key is
        appended to it in place.
    user_info: key used to build the file name (a user id or an event name).

    Returns the same ``created`` list.
    """
    path = '{0}_new.csv'.format(user_info)
    if user_info not in created:
        # First row for this key: start a fresh file and write the header.
        with open(path, 'w', newline = '') as new_file:
            header_writer = csv.writer(new_file)
            created.append(user_info)
            header_writer.writerow(header)
    with open(path, 'a', newline = '') as existing_file:
        csv.writer(existing_file).writerow(row)
    return created



In [0]:
# Splits the interaction log into one CSV per user and one CSV per event type.
# Doing this once avoids holding all of the interaction data in memory;
# the per-user files are read back by the later analysis cells.
# Please unzip ml-interactions.tar.gz first.
with open('ml-interactions.csv', newline = '') as data:
    header = []
    users_created = []
    events_created = []
    reader = csv.reader(data)
    for row in reader:
        if row[1] == 'user_id':
            # Header line: remember it so every new file can start with it.
            header = row
            continue
        user_id = row[1]
        current_event = row[4]
        users_created = file_appender(row, users_created, user_id)
        events_created = file_appender(row, events_created, current_event)



In [0]:
# Counts, for every user, how many distinct panoramas and missions appear in
# that user's interaction CSV. The counts are used to normalise features
# per mission or per pano.
user_panos, user_missions = {}, {}
for current_user in users['user_id']:
    df_current = pd.read_csv(f'{current_user}_new.csv')
    user_panos[current_user] = df_current['gsv_panorama_id'].nunique()
    user_missions[current_user] = df_current['mission_id'].nunique()



In [0]:
# Save the per-user pano counts, indexed by user id.
panos_seen = ['panos seen']
df_user_panos = pd.DataFrame(
    list(user_panos.values()),
    index = user_panos.keys(),
    columns = panos_seen,
)
df_user_panos.to_csv('panos_seen.csv', index=True, encoding='utf-8')



In [0]:
# Summarises the label data per user and writes it to users_one_mission.csv:
# total labels, validated labels, correct / false labels, accuracy (percent of
# validated labels that are correct) and missions completed.
user_labels = {}
labels_grouped = labels.groupby('user_id')
for current_id, current_group in labels_grouped:
    verdicts = current_group['correct']
    correct = sum(verdicts == 't')
    false = sum(verdicts == 'f')
    validated = correct + false
    total = len(current_group)
    # Accuracy stays 0 for users with no validated labels.
    accuracy = float(correct) / float(validated) * 100 if validated != 0 else 0
    # NOTE(review): raises KeyError for a user missing from user_missions.
    missions_completed = user_missions[current_id]
    user_labels[current_id] = (current_id, total, validated, correct, false, accuracy, missions_completed)
header = ['user_id', 'total_labels', 'labels_validated', 'correct_labels', 'false_labels', 'accuracy', 'missions_completed']
with open('users_one_mission.csv', 'w', newline = '') as new_user:
    writer = csv.writer(new_user)
    writer.writerow(header)
    writer.writerows(user_labels.values())


 # Features for ML Model Creation

In [0]:
# Creates the list of each user's outputs (Quality '0' = bad, '1' = good, plus
# the accuracy) and starts an empty feature row for every user.
# For the DC data there is no 'accuracy', because that is based on Seattle
# validations, so precision, recall and worker type are recorded instead.
def create_groups():
    """Fill `all_output`, `all_data` and `all_id` from the `users` table.

    A user is labelled quality '0' when accuracy is at most 65, else '1'.
    Each feature row starts as ``[0]``, the placeholder column that pairs
    with the 'test' entry in `all_feature_header`.
    """
    for entry, current_accuracy in zip(users['user_id'], users['accuracy'].values):
        quality = '0' if current_accuracy <= 65 else '1'
        all_output.append([quality, current_accuracy])
        all_data[entry] = [0]
        all_id.append(entry)



In [0]:
# Adds a feature to the feature row of every user.
# feature is the name of the user action that is counted (used by 'count')
# info_type is the metric used to calculate the feature
# feature_title is the name of the feature
# multi is whether the feature should be normalised per mission and per pano
def add_feature(feature, info_type, feature_title, multi):
    """Compute one feature for every user and append it to `all_data`.

    feature: value of the 'action' column to count (only used by 'count').
    info_type: 'count', 'pitch', 'heading' (mean per-pano heading range) or
        'heading 350' (share of panos whose heading range is >= 350);
        any other value yields 0.
    feature_title: column title; nothing happens if it was already added.
    multi: when true, four values are appended per user (mean per mission,
        mean per pano, std per mission, std per pano) instead of one. Only
        'count' computes these; other types fall back to zeros.

    Reads ``<user_id>_new.csv`` for every user in `users` and extends
    `all_data` and `all_feature_header` in place.
    """
    if feature_title in all_feature_header:
        return

    for entry in users['user_id']:
        df_user = pd.read_csv('{0}_new.csv'.format(entry))

        # Defaults so that every value appended below is always defined.
        info = 0
        mission_mean = mission_std = 0
        pano_mean = pano_std = 0

        if info_type == 'count':
            # 1 for every interaction row that is the requested action.
            is_action = (df_user['action'] == feature).astype(int)
            if multi:
                # The grouping key must be a list: a tuple is treated as a
                # single column label by current pandas.
                by_pano = is_action.groupby(
                    [df_user['mission_id'], df_user['gsv_panorama_id']]).sum()
                pano_mean = by_pano.mean()
                pano_std = by_pano.std()
                by_mission = is_action.groupby(df_user['mission_id']).sum()
                mission_mean = by_mission.mean()
                mission_std = by_mission.std()
            else:
                # Total occurrences of the action for this user.
                info = int(is_action.sum())
        elif info_type == 'pitch':
            info = df_user['pitch'].mean()
        elif info_type in ('heading', 'heading 350'):
            # Heading sweep (max - min) within each panorama.
            heading = df_user.groupby('gsv_panorama_id')['heading']
            heading_range = heading.max() - heading.min()
            if info_type == 'heading':
                if len(heading_range):
                    info = sum(heading_range) / float(len(heading_range))
                else:
                    # No panoramas recorded: the average is undefined.
                    info = np.nan
            else:
                full_heading_count = int((heading_range >= 350).sum())
                info = full_heading_count / float(user_panos[entry])

        if multi:
            all_data[entry].extend([mission_mean, pano_mean, mission_std, pano_std])
        else:
            all_data[entry].append(info)

    if multi:
        all_feature_header.extend([
            feature_title + ' per Mission',
            feature_title + ' per Pano',
            feature_title + ' Standard Deviation per Mission',
            feature_title + ' Standard Deviation per Pano',
        ])
    else:
        all_feature_header.append(feature_title)



In [0]:
# Adds the raw (un-aggregated) interaction data of every user to `all_data`,
# producing a list of lists.
# Only needed for the analysis of min panos vs. precision.
def feature_data(feature, info_type, feature_title, multi):
    """Append the per-pano values behind a feature to every user's row.

    feature: value of the 'action' column to count (only used by 'count').
    info_type: 'count' (occurrences per mission/pano pair), 'pitch' (the raw
        pitch column), 'heading' (heading range per pano) or 'heading 350'
        (1/0 per pano for a heading range >= 350); anything else yields 0.
    feature_title: column title; nothing happens if it was already added.
    multi: accepted for signature parity with `add_feature`; unused here.

    Reads ``<user_id>_new.csv`` for every user in `users` and extends
    `all_data` and `all_feature_header` in place.
    """
    if feature_title in all_feature_header:
        return

    for entry in users['user_id']:
        df_user = pd.read_csv('{0}_new.csv'.format(entry))
        if info_type == 'count':
            is_action = (df_user['action'] == feature).astype(int)
            # The grouping key must be a list: a tuple is treated as a
            # single column label by current pandas.
            info = is_action.groupby(
                [df_user['mission_id'], df_user['gsv_panorama_id']]).sum().tolist()
        elif info_type == 'pitch':
            # Left as a Series, unlike the other branches which give lists.
            info = df_user['pitch']
        elif info_type in ('heading', 'heading 350'):
            # Heading sweep (max - min) within each panorama.
            heading = df_user.groupby('gsv_panorama_id')['heading']
            heading_range = heading.max() - heading.min()
            if info_type == 'heading':
                info = heading_range.tolist()
            else:
                info = [1 if sweep >= 350 else 0 for sweep in heading_range]
        else:
            info = 0
        all_data[entry].append(info)
    all_feature_header.append(feature_title)



In [0]:
# Creates the empty containers that hold all of the user data
all_data = {}  # user_id -> feature row, filled by create_groups / add_feature
all_id = []  # user ids, in the order create_groups visits them
all_output = []  # [quality, accuracy] per user
# 'test' is a placeholder column matching the initial [0] of every feature row
all_feature_header = ['test']
output_header = ['Quality', 'accuracy']
create_groups()



In [0]:
# Adds these features to every user's row. The 'count' features are stored as
# mean and standard deviation, per pano and per mission.
for _feature_args in (
    (None, 'heading', 'Average Heading Range', False),
    (None, 'heading 350', 'Panos w/ over 350 Degrees seen', False),
    (None, 'pitch', 'Average Pitch', False),
    ('ContextMenu_TagAdded', 'count', 'Tags Added', True),
    ('LabelingCanvas_FinishLabeling', 'count', 'Labels Confirmed', True),
    ('RemoveLabel', 'count', 'Labels Removed', True),
    ('LowLevelEvent_keydown', 'count', 'Key Presses', True),
    ('LowLevelEvent_mousedown', 'count', 'Mouse Clicks', True),
    ('Click_ZoomIn', 'count', 'Zoom', True),
    ('ContextMenu_TextBoxChange', 'count', 'Comments Written', True),
    ('LowLevelEvent_mousemove', 'count', 'Mouse Movements', True),
):
    add_feature(*_feature_args)



In [0]:
# Adds the raw interaction data to every user's row (creates a list of lists).
# Only needed for the analysis of min panos vs. precision.
feature_data( None, 'heading', 'Average Heading Range', False)
feature_data( None, 'heading 350', 'Panos w/ over 350 Degrees seen', False)
feature_data( None, 'pitch', 'Average Pitch', False)
feature_data( 'ContextMenu_TagAdded', 'count', 'Tags Added', True)
feature_data( 'LabelingCanvas_FinishLabeling', 'count', 'Labels Confirmed', True)
feature_data( 'RemoveLabel', 'count', 'Labels Removed', True)
feature_data( 'LowLevelEvent_keydown', 'count', 'Key Presses', True)
feature_data( 'LowLevelEvent_mousedown', 'count', 'Mouse Clicks', True)
feature_data( 'Click_ZoomIn', 'count', 'Zoom', True)
feature_data( 'ContextMenu_TextBoxChange', 'count', 'Comments Written', True)
# The title must be unique: feature_data skips a title that is already present.
feature_data( 'LowLevelEvent_mousemove', 'count', 'Mouse Movements', True)


 # Dataframe for ML Model Creation

In [0]:
# Creates dataframes from the users' feature rows and outputs, with the
# feature names / output names as column headers and the user ids as index.
# Row order relies on all_data keeping the insertion order used for all_id.
df_all = pd.DataFrame(list(all_data.values()), columns = all_feature_header, index = all_id)
df_all_output = pd.DataFrame(all_output, columns = output_header, index = all_id)



In [0]:
# Drops the 'test' placeholder column that seeded every feature row
df_all = df_all.drop(columns=['test'])



In [0]:
# Creates CSVs out of the dataframes. The index (user id) is written as the
# first column; it has no name, so that column's header is empty in the file.
df_all.to_csv('all_users.csv', encoding='utf-8', index=True)
df_all_output.to_csv('all_users_new.csv', encoding='utf-8', index=True)



# Classifying features

 Load all the user information and label correctness

In [0]:
# Builds `label_correctness`: one row per label (indexed by label_id) holding
# the owning user, the validation verdict and the label's geometry columns.
# users = pd.read_csv('ml-users.csv')
users = pd.read_csv('users_one_mission.csv')
users = users.set_index('user_id')

point_labels = pd.read_csv('sidewalk-seattle-label_point.csv')
point_labels.set_index('label_id', inplace=True)

# label_correctness = pd.read_csv('ml-label-correctness.csv')
label_correctness = pd.read_csv('ml-label-correctness-one-mission.csv')

label_correctness.set_index('label_id', inplace=True)

# Both frames are indexed by label_id, so this attaches the point columns
# to each label.
label_correctness = label_correctness.join(point_labels)

label_correctness = label_correctness[['user_id', 'label_type', 
    'correct', 'sv_image_x', 'sv_image_y', 'canvas_x', 'canvas_y', 
    'heading', 'pitch', 'zoom', 'lat', 'lng']]


# Keep only users with at least one validated label, and only their labels.
users_for_analysis = users.index[users['labels_validated'] > 0]
label_correctness = label_correctness[label_correctness['user_id'].isin(users_for_analysis)]
users = users.loc[users_for_analysis]

# Replace the non-missing 'correct' values with booleans ('t' -> True,
# anything else -> False); missing values are left as they are.
label_correctness.update(label_correctness['correct'][~pd.isna(label_correctness['correct'])] == 't')

# Encoder kept at module level: later cells reuse it to encode label names.
label_type_encoder = OrdinalEncoder()

label_correctness['label_type'] = label_type_encoder.fit_transform(label_correctness[['label_type']])


  # Intersection Proximity

 Load intersection proximity for each label

 proximity_distance is the absolute distance to the nearest intersection

 proximity_middleness is the "middleness" as measured in the intersection proximity library

In [0]:
# Proximity calculator built from the library's default 'seattle' settings.
ip = intersection_proximity.IntersectionProximity(intersection_proximity.default_settings['seattle'])



In [0]:
def get_proximity_info(label):
    """Return a label's intersection-proximity features as a Series.

    label: object exposing ``lat`` and ``lng`` (a DataFrame row when used
        with ``DataFrame.apply(axis=1)``).

    Returns a Series with 'proximity_distance' and 'proximity_middleness'.
    Both are -1 when the proximity lookup fails for any reason.
    """
    try:
        result = ip.compute_proximity(label.lat, label.lng, cache=True)
        distance, middleness = result
    except Exception:
        # Best effort: flag the label with sentinel values instead of failing.
        distance, middleness = -1, -1

    return pd.Series(
        [distance, middleness],
        index=['proximity_distance', 'proximity_middleness'],
    )

# Compute the proximity features row by row and attach them as new columns.
label_correctness = label_correctness.join(label_correctness.apply(get_proximity_info, axis=1))


  # CV Analysis


 This is the full analysis of using CV predictions for classifying labels.

 We found that it's not very useful.

In [0]:
# Load the CV predictions and rename their columns to the notebook's
# snake_case convention.
_cv_column_names = {'CVLabel': 'cv_label_type', 'Confidence': 'cv_confidence'}
cv_predictions = pd.read_csv('summary_user.csv')
cv_predictions = cv_predictions.rename(columns=_cv_column_names)



In [0]:
# Label types analysed in the plots below (one subplot per type).
label_types = ['CurbRamp', 'NoCurbRamp', 'Obstacle', 'SurfaceProblem']



In [0]:
# Attach the CV prediction columns; join aligns the two frames on their index.
# NOTE(review): cv_predictions is read without an explicit index column -
# confirm its row index really corresponds to label_id.
label_correctness = label_correctness.join(cv_predictions)



In [0]:
# One histogram pair per label type: CV confidence of correct vs. incorrect
# labels, each normalised to a density.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))
has_verdict = ~pd.isna(label_correctness['correct'])
has_confidence = ~pd.isna(label_correctness['cv_confidence'])
for i, label_name in enumerate(label_types):
    ax = axes[i // 2][i % 2]
    label_encoded = label_type_encoder.transform([[label_name]])[0][0]
    of_this_type = label_correctness['label_type'] == label_encoded
    selection = label_correctness[has_verdict & has_confidence & of_this_type]
    is_correct = selection['correct'].astype(bool)
    ax.set_xlabel('CV Confidence')
    ax.set_ylabel('relative count')
    ax.set_title(label_name)
    ax.hist(selection[is_correct]['cv_confidence'], alpha=0.5, label='correct', density=True)
    ax.hist(selection[~is_correct]['cv_confidence'], alpha=0.5, label='incorrect', density=True)
    ax.legend()

fig.tight_layout()



In [0]:
# Encode the non-missing CV label types with the same encoder as 'label_type',
# so the two columns can be compared directly.
# The assignment goes through a single .loc call: assigning through chained
# indexing (df.loc[:, col][mask] = ...) may write to a temporary copy.
_has_cv_label = ~pd.isna(label_correctness['cv_label_type'])
label_correctness.loc[_has_cv_label, 'cv_label_type'] = label_type_encoder.transform(
    label_correctness.loc[_has_cv_label, ['cv_label_type']].to_numpy()).ravel()



In [0]:
# prob[i][j]: share of correct labels among validated labels whose user label
# type is label_types[i] and whose CV label type is label_types[j]
# (NaN when there are no such labels).
prob = np.zeros((4, 4))
has_verdict = ~pd.isna(label_correctness['correct'])
encoded_types = [label_type_encoder.transform([[name]])[0][0] for name in label_types]
for i, i_encoded in enumerate(encoded_types):
    for j, j_encoded in enumerate(encoded_types):
        selection = label_correctness[has_verdict
            & (label_correctness['label_type'] == i_encoded)
            & (label_correctness['cv_label_type'] == j_encoded)]

        try:
            prob[i][j] = np.sum(selection['correct']) / len(selection)
        except ZeroDivisionError:
            prob[i][j] = np.nan

prob


  # Population Density

 Analysis of population density as a feature. It has some correlation with accuracy

In [0]:
# Look up the region properties from data_seattle.geojson for every label's
# coordinates and attach them as columns.
rs = RegionStats('data_seattle.geojson')
_density_properties = label_correctness.apply(
    lambda label: pd.Series(rs.get_properties(label.lng, label.lat)), axis=1)
label_correctness = label_correctness.join(_density_properties)



In [0]:
# Labels that have a density, a verdict and a label type.
selection_all = label_correctness[['density', 'correct', 'label_type']].dropna(how='any')



In [0]:
# Figure 1: histograms of population density for correct vs. incorrect labels.
# Figure 2: per-bin probability of a label being correct, with a linear fit.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))
fig2, axes2 = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

for i, label_name in enumerate(label_types):
    ax = axes[i // 2][i % 2]
    label_encoded = label_type_encoder.transform([[label_name]])[0][0]
    selection = selection_all[selection_all['label_type'] == label_encoded]
    ax.set_xlabel('Population density (people/sq. mile)')
    ax.set_ylabel('count')
    ax.set_title(label_name)
    # Reuse the bin edges of the first histogram so the counts line up.
    nc, bins, _ = ax.hist(selection[selection['correct'] == True]['density'], density=False, bins=20, alpha=0.5, label='correct')
    ni, _, _ = ax.hist(selection[selection['correct'] == False]['density'], density=False, bins=bins, alpha=0.5, label='incorrect')
    ax.legend()

    ax2 = axes2[i // 2][i % 2]
    ax2.set_xlabel('Population density (people/sq. mile)')
    ax2.set_ylabel('probability correct')
    ax2.set_title(label_name)
    density_vals = (bins[:-1] + bins[1:]) / 2  # bin centres
    # Empty bins give 0/0 = NaN; they are masked out just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        correct_prob = nc / (ni + nc)
    mask = ~np.isnan(correct_prob)
    density_vals = density_vals[mask]
    correct_prob = correct_prob[mask]
    ax2.scatter(density_vals, correct_prob)

    # A straight-line fit needs at least two populated bins.
    if len(density_vals) >= 2:
        z = np.polyfit(density_vals, correct_prob, 1)
        p = np.poly1d(z)
        # r2_score is the coefficient of determination, so label it R^2.
        ax2.plot(density_vals, p(density_vals),
                 label=f"$R^2$={r2_score(correct_prob, p(density_vals)):.3f}")
        ax2.legend()


fig.tight_layout()
fig2.tight_layout()


  # Zone type

 Zone type is an even better feature than population density.

In [0]:
# Look up the zoning properties from Zoning_Detailed.geojson for every label's
# coordinates and attach them as columns.
rs = RegionStats('Zoning_Detailed.geojson')
_zoning_properties = label_correctness.apply(
    lambda label: pd.Series(rs.get_properties(label.lng, label.lat)), axis=1)
label_correctness = label_correctness.join(_zoning_properties)



In [0]:
# Bar chart per label type: share of correct labels in each zoning category.
# Categories with 100 or fewer labels of that type are shown as 0.
selection_all = label_correctness[['CATEGORY_DESC', 'correct', 'label_type']].dropna(how='any')
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 10))
categories = selection_all['CATEGORY_DESC'].unique()

for i, label_name in enumerate(label_types):
    ax = axes[i // 2][i % 2]
    label_encoded = label_type_encoder.transform([[label_name]])[0][0]
    selection = selection_all[selection_all['label_type'] == label_encoded]
    ax.set_title(label_name)
    prob_correct = dict()
    for category in categories:
        in_category = selection['CATEGORY_DESC'] == category
        if np.sum(in_category) > 100:
            prob_correct[category] = np.mean(selection['correct'][in_category])
        else:
            prob_correct[category] = 0

    ax.bar(prob_correct.keys(), prob_correct.values())
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()


  # Classification

 This is the basic pipeline for classification:

 The set of training users is split in two: 35% form the training set for the label classifier, and the remaining 65% form the training set for the accuracy classifier.

 All the label validations from the 35% are used to train the classifier, using the features in `features`.

 Then, the label classifier predicts p(correct) for all the labels in the 65% set. `prob_hist` is used to extract a small number of features from the list of probabilities.

 The accuracy classifier is trained on the features from `prob_hist` + the user features generated in Tyler's notebook for the 65% set.

 To test the final classifier, we do the same as the 65% test, but with just the testing users: we pass the results of the label classifier into `prob_hist`, add the features from Tyler's notebook, and then pass that in to the accuracy classifier.

In [0]:
def prob_hist(probabilities, n_bins=5):
    """Summarise a user's per-label p(correct) values as six features.

    probabilities: 1-D sequence of probabilities.
    n_bins: unused; kept so existing callers keep working.

    Returns ``[mean, std, 25th percentile, median, 75th percentile,
    mid-band mean]``. The mid-band mean averages only the probabilities
    strictly between 0.25 and 0.75, and falls back to the overall mean when
    none lie in that band so that the feature is never NaN.
    """
    probabilities = np.asarray(probabilities, dtype=float)
    # Both bounds must hold: with `|` every value passes and the feature
    # merely repeats the overall mean.
    mid_band = probabilities[(probabilities > 0.25) & (probabilities < 0.75)]
    mid_band_mean = np.mean(mid_band) if mid_band.size else np.mean(probabilities)
    return [np.mean(probabilities), np.std(probabilities),
        np.percentile(probabilities, 25), np.percentile(probabilities, 50),
        np.percentile(probabilities, 75), mid_band_mean]
def dearray(array):
    """Stack an iterable of equal-length sequences into one 2-D array."""
    rows = list(map(list, array))
    return np.array(rows)


  Load Tyler's features (see his notebook)

In [0]:
# Load the per-user interaction features, indexed by user id.
# all_users.csv is written from a DataFrame whose index has no name, so the
# id column's header may be empty ('Unnamed: 0') rather than 'user_id'.
# Reading the first column as the index works for both layouts.
user_quality_features = pd.read_csv('all_users.csv', index_col=0)
user_quality_features.index.name = 'user_id'

In [0]:
# Only users with more than 25 validated labels take part in the
# cross-validated classification below.
users_filtered = users[users['labels_validated'] > 25]


In [0]:
# 'classification' predicts whether a user's accuracy is above 65;
# 'regression' predicts the accuracy value itself.
mode = 'classification'
# mode = 'regression'



In [0]:
# Per-label features fed to the label-correctness classifier.
features = ['label_type', 'sv_image_y', 'canvas_x', 'canvas_y', 'heading', 'pitch', 'zoom', 'lat', 'lng', 'proximity_distance', 'proximity_middleness', 'CLASS_DESC', 'ZONEID']

# Fraction of each fold's training users whose labels train the label
# classifier; the remaining training users train the accuracy classifier.
proportion_labels = 0.35
comparisons = pd.DataFrame()
split_num = 0
np.random.seed(0)

for train_index, test_index in KFold(n_splits=5, shuffle=True, random_state=0).split(users_filtered.index):
    X_train, X_test = users_filtered.index[train_index], users_filtered.index[test_index]

    # KFold yields positional indices, so rows are selected with .iloc;
    # plain [] on a Series is label-based when the index holds integers.
    accuracy_train = users_filtered['accuracy'].iloc[train_index]
    accuracy_test = users_filtered['accuracy'].iloc[test_index]
    if mode == 'classification':
        y_train, y_test = accuracy_train > 65, accuracy_test > 65
    else:
        y_train, y_test = accuracy_train, accuracy_test

#     y_train, y_test = users_filtered['missions_completed'][train_index] > 6, users_filtered['missions_completed'][test_index] > 6

    # Randomly split the training users between the two classifiers.
    mask = np.random.permutation(np.arange(len(X_train)))
    n_label_users = int(proportion_labels * len(mask))
    users_labels_train = X_train[mask[:n_label_users]]
    users_labels_test = X_train[mask[n_label_users:]]

    train_labels = label_correctness[label_correctness['user_id'].isin(X_train)].copy()
    test_labels = label_correctness[label_correctness['user_id'].isin(X_test)].copy()
    # The verdicts of held-out users are never shown to the models.
    test_labels = test_labels.drop(columns='correct')
    # TODO don't eliminate all nans
    train_labels = train_labels[~pd.isna(train_labels['correct'])]
    train_labels = train_labels[~(pd.isna(train_labels[features]).any(axis=1))]
    test_labels = test_labels[~(pd.isna(test_labels[features]).any(axis=1))]
    # scaler = StandardScaler()
    # train_labels[features] = scaler.fit_transform(train_labels[features])

    # Encode the zoning class as ordinals. The encoder is fitted on train and
    # test values together so transform never meets an unseen category.
    en = OrdinalEncoder()
    en.fit(pd.concat((train_labels[['CLASS_DESC']], test_labels[['CLASS_DESC']])))
    train_labels[['CLASS_DESC']] = en.transform(train_labels[['CLASS_DESC']])
    test_labels[['CLASS_DESC']] = en.transform(test_labels[['CLASS_DESC']])

    # Recursive feature elimination picks the feature subset for each model.
    rfe_labels = RFECV(estimator=RandomForestClassifier(n_estimators=10), step=1, cv=StratifiedKFold(5),
           scoring='precision')
    clf_labels = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=30)
    rfe_accuracy = RFECV(estimator=RandomForestClassifier(n_estimators=10), step=1, cv=StratifiedKFold(5), scoring='f1')

    if mode == 'classification':
        clf_accuracy = BalancedBaggingClassifier(random_state=0, n_jobs=-1, n_estimators=30)
    else:
        clf_accuracy = BaggingRegressor(random_state=0, n_jobs=-1, n_estimators=30)

    # Stage 1: label classifier, trained on the labels of the first user group.
    print('Training label classifier...')
    label_clf_rows = train_labels[train_labels['user_id'].isin(users_labels_train)]
    label_clf_X = label_clf_rows[features].values
    label_clf_y = label_clf_rows['correct'].astype(int)
    rfe_labels.fit(label_clf_X, label_clf_y)
    clf_labels.fit(label_clf_X[:, rfe_labels.support_], label_clf_y)

    # Predict p(correct) for every label of the remaining training users.
    accuracy_clf_rows = train_labels[train_labels['user_id'].isin(users_labels_test)]
    train_labels = train_labels.join(pd.Series(
        data=clf_labels.predict_proba(accuracy_clf_rows[features].values[:, rfe_labels.support_])[:, 1],
        index=accuracy_clf_rows.index).rename('prob'), how='outer')

    # Condense each user's probabilities into the prob_hist summary and add
    # the user's interaction features.
    prob_hist_predictions = pd.DataFrame(train_labels[train_labels['user_id'].isin(users_labels_test)]
        .groupby('user_id').apply(lambda x: prob_hist(x['prob'].values)).rename('prob'))

    prob_hist_predictions = prob_hist_predictions.join(user_quality_features)

    # Stage 2: accuracy classifier, trained on one row per remaining user.
    print('Training accuracy classifier...')
    accuracy_X = np.concatenate((dearray(prob_hist_predictions['prob']),
        prob_hist_predictions.drop(columns='prob').values), axis=1)
    accuracy_y = y_train.loc[prob_hist_predictions.index]
    # Feature selection is always scored on the binary good/bad target, so
    # the accuracy is thresholded here when running in regression mode.
    rfe_accuracy.fit(accuracy_X, accuracy_y if mode == 'classification' else accuracy_y > 65)
    clf_accuracy.fit(accuracy_X[:, rfe_accuracy.support_], accuracy_y)

    # Probability correct for every label of the held-out users.
    useful_test = test_labels[~pd.isna(test_labels[features]).any(axis=1)].copy()  # TODO don't eliminate all nans
    useful_test.loc[:, 'prob'] = clf_labels.predict_proba(useful_test[features].values[:, rfe_labels.support_])[:, 1]

    # Now predict accuracy

    def predict_accuracy(probs, user_features):
        # Build the same feature row as in training (prob_hist summary followed
        # by the user's interaction features) and keep the selected columns.
        return clf_accuracy.predict([np.concatenate((prob_hist(probs), user_features))[rfe_accuracy.support_]])[0]

    print('Making predictions...')
    mean_probs = useful_test.groupby('user_id').apply(lambda x: predict_accuracy(x['prob'].values, user_quality_features.loc[x.name])).rename('predicted')

    # One row per held-out user: prediction, ground truth and fold number.
    comparison = pd.DataFrame((mean_probs, y_test, pd.Series(np.full((len(y_test)), split_num), name='split_num', index=y_test.index))).T
    comparison['prob_hist'] = useful_test.groupby('user_id').apply(lambda x: prob_hist(x['prob'].values))
    comparison['probs'] = useful_test.groupby('user_id').apply(lambda x: x['prob'].values)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    comparisons = pd.concat([comparisons, comparison])

    split_num += 1

    # sys.stderr.write(f'{split_num} / 5\n')

if mode == 'classification':
    # Score only the users that have both a ground truth and a prediction.
    mask = ~pd.isna(comparisons[['accuracy', 'predicted']]).any(axis=1)
    # The columns hold Python objects after the transpose above; cast them so
    # the metrics receive plain boolean targets.
    y_true = comparisons['accuracy'][mask].astype(bool)
    y_pred = comparisons['predicted'][mask].astype(bool)
    print(precision_score(y_true, y_pred))
    print(recall_score(y_true, y_pred))
    print(accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))


 If running a regression, use this to view the results

In [0]:
# Keep only the users that have both a prediction and an actual accuracy.
comparisons = comparisons[~pd.isna(comparisons[['predicted', 'accuracy']]).any(axis=1)]



In [0]:
comparisons['predicted'].max()



In [0]:
sum(users['labels_validated'] > 25)



In [0]:
# Scatter of predicted vs. actual accuracy with a fitted line; the legend
# shows the Pearson correlation coefficient.
fig, ax = plt.subplots(figsize=(5, 5))
predicted = comparisons['predicted']
actual = comparisons['accuracy']
r2 = scipy.stats.pearsonr(actual, predicted)
ax.scatter(predicted, actual)
z = np.polyfit(predicted, actual, 1)
w = np.poly1d(z)
ax.plot(predicted, w(predicted), color='red', label=f'R={r2[0]:.3f}\n')
ax.set_xlim((0, 100))
ax.set_ylim((0, 100))
ax.set_xlabel('predicted accuracy')
ax.set_ylabel('actual accuracy')
ax.legend()


 View the results of RFE using this. You can change `rfe` to either `rfe_labels` or `rfe_accuracy`

In [0]:
# Report what recursive feature elimination selected in the last fold.
rfe = rfe_accuracy
print("Optimal number of features : %d" % rfe.n_features_)
# support_ is the boolean mask of the selected features, not a count.
print("Selected feature mask : %s" % str(rfe.support_))

# Newer scikit-learn exposes the per-subset scores through cv_results_;
# older releases only have grid_scores_.
if hasattr(rfe, 'cv_results_'):
    rfe_scores = rfe.cv_results_['mean_test_score']
else:
    rfe_scores = rfe.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfe_scores) + 1), rfe_scores)
plt.show()


 # Analysis of min labels vs. precision

 This takes about 30 minutes to run, but some generated plots are available on GitHub. We found that requiring more than 45 validated labels is optimal for the performance of the classifier.

In [0]:
def get_p_r(users, label_correctness):
    """Cross-validate the two-stage user classifier and return (precision, recall).

    For each of 5 shuffled user-level folds:

    1. A label-level random forest (with RFECV feature selection) is trained on
       the labels of a random 10% of the training users to predict ``correct``.
    2. That model scores the labels of the remaining training users; each
       user's probabilities are summarised with ``prob_hist`` and joined with
       ``user_quality_features``.
    3. A user-level classifier (again with RFECV) is trained on those summaries
       to predict whether the user's accuracy exceeds 65.
    4. Both stages are applied to the held-out users of the fold.

    Relies on the notebook-level names ``prob_hist``, ``dearray`` and
    ``user_quality_features``. Reseeds NumPy's global RNG with 0 as a side effect.

    Parameters
    ----------
    users : pandas.DataFrame
        Indexed by user id, with an ``accuracy`` column.
    label_correctness : pandas.DataFrame
        One row per label, with ``user_id``, ``correct`` and the columns named
        in ``features`` below.

    Returns
    -------
    tuple
        ``(precision, recall)`` of the "accuracy > 65" prediction, pooled over
        all folds and restricted to users that have both a true value and a
        prediction.
    """
    features = ['label_type', 'sv_image_y', 'canvas_x', 'canvas_y', 'heading', 'pitch', 'zoom', 'lat', 'lng',]
    # Share of the training users whose labels train the label-level model.
    proportion_labels = 0.1
    fold_comparisons = []
    split_num = 0
    np.random.seed(0)

    for train_index, test_index in KFold(n_splits=5, shuffle=True, random_state=0).split(users.index):
        # X_* hold user ids; the actual model inputs are built from the labels below.
        X_train, X_test = users.index[train_index], users.index[test_index]
        # KFold yields positions, so select by position rather than by label.
        y_train, y_test = users['accuracy'].iloc[train_index], users['accuracy'].iloc[test_index]

        # Randomly split the training users: the first 10% train the label
        # model, the rest provide training data for the user-level model.
        mask = np.random.permutation(np.arange(len(X_train)))
        users_labels_train = X_train[mask[:int(proportion_labels * len(mask))]]
        users_labels_test = X_train[mask[int(proportion_labels * len(mask)):]]

        train_labels = label_correctness[label_correctness['user_id'].isin(X_train)]
        test_labels = label_correctness[label_correctness['user_id'].isin(X_test)]
        # The ground truth is not used on the held-out side.
        test_labels = test_labels.drop(columns='correct')

        # TODO don't eliminate all nans
        train_labels = train_labels[~pd.isna(train_labels['correct'])]
        train_labels = train_labels[~(pd.isna(train_labels[features]).any(axis=1))]

        rfe_labels = RFECV(estimator=RandomForestClassifier(n_estimators=10), step=1, cv=StratifiedKFold(5),
               scoring='precision')
        clf_labels = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=10)
        rfe_accuracy = RFECV(estimator=RandomForestClassifier(n_estimators=10), step=1, cv=StratifiedKFold(5), scoring='f1')
        clf_accuracy = BalancedBaggingClassifier(random_state=0, n_jobs=-1, n_estimators=20)

        # Stage 1: select features, then fit the label-correctness model.
        stage1_rows = train_labels[train_labels['user_id'].isin(users_labels_train)]
        stage1_inputs = stage1_rows[features].values
        stage1_target = stage1_rows['correct'].astype(int)
        rfe_labels.fit(stage1_inputs, stage1_target)
        clf_labels.fit(stage1_inputs[:, rfe_labels.support_], stage1_target)

        # Attach P(correct) to the labels of the remaining training users;
        # rows of the stage-1 users get NaN in 'prob' through the outer join.
        stage2_rows = train_labels[train_labels['user_id'].isin(users_labels_test)]
        train_labels = train_labels.join(pd.Series(
            data=clf_labels.predict_proba(stage2_rows[features].values[:, rfe_labels.support_])[:, 1],
            index=stage2_rows.index).rename('prob'), how='outer')

        # One summary of the label probabilities per user, plus the user-level features.
        prob_hist_predictions = pd.DataFrame(train_labels[train_labels['user_id'].isin(users_labels_test)]
            .groupby('user_id').apply(lambda x:
            prob_hist(x['prob'].values)).rename('prob'))
        prob_hist_predictions = prob_hist_predictions.join(user_quality_features)

        # Stage 2: select features, then fit the "accuracy > 65" model.
        stage2_inputs = np.concatenate((dearray(prob_hist_predictions['prob']),
            prob_hist_predictions.drop(columns='prob').values), axis=1)
        stage2_target = (y_train.loc[prob_hist_predictions.index] > 65).astype(int)
        rfe_accuracy.fit(stage2_inputs, stage2_target)
        clf_accuracy.fit(stage2_inputs[:, rfe_accuracy.support_], stage2_target)

        # Probability of correctness for every usable held-out label.
        useful_test = test_labels[~pd.isna(test_labels[features]).any(axis=1)].copy()  # TODO don't eliminate all nans
        useful_test.loc[:, 'prob'] = clf_labels.predict_proba(useful_test[features].values[:, rfe_labels.support_])[:, 1]

        def predict_accuracy(probs, user_features):
            # Same input layout as in training: probability summary followed by
            # the user-level features, reduced to the RFE-selected columns.
            return clf_accuracy.predict([np.concatenate((prob_hist(probs), user_features))[rfe_accuracy.support_]])[0]

        mean_probs = useful_test.groupby('user_id').apply(lambda x: predict_accuracy(x['prob'].values, user_quality_features.loc[x.name])).rename('predicted')

        comparison = pd.DataFrame((mean_probs, y_test, pd.Series(np.full((len(y_test)), split_num), name='split_num', index=y_test.index))).T
        comparison['prob_hist'] = useful_test.groupby('user_id').apply(lambda x: prob_hist(x['prob'].values))
        comparison['probs'] = useful_test.groupby('user_id').apply(lambda x: x['prob'].values)
        fold_comparisons.append(comparison)

        split_num += 1

    # DataFrame.append was removed in pandas 2.0; concatenate the folds once.
    comparisons = pd.concat(fold_comparisons)
    comparisons['accuracy'] = (comparisons['accuracy'] > 65).astype(int)

    # Users without any usable held-out label have no prediction; skip them.
    mask = ~pd.isna(comparisons[['accuracy', 'predicted']]).any(axis=1)

    return precision_score(comparisons['accuracy'][mask], comparisons['predicted'][mask]), recall_score(comparisons['accuracy'][mask], comparisons['predicted'][mask])



In [0]:
# Timed cell: silences warnings, defines run_range(min_labels) and maps it over
# min_labels = 1..74 on a pool of 4 processes. run_range keeps the users with
# more than min_labels validated labels and returns get_p_r's (precision, recall).
# NOTE: any exception inside run_range is swallowed and reported as (-1, -1).
get_ipython().run_cell_magic('time', '', "warnings.filterwarnings('ignore')\ndef run_range(min_labels):\n    try:\n        print(min_labels)\n        users_for_analysis = users.index[users['labels_validated'] > min_labels]\n        users2 = users.loc[users_for_analysis]\n        return get_p_r(users2, label_correctness)\n    except Exception:\n        return (-1, -1)\n    \nwith mp.Pool(4) as p:\n    results = p.map(run_range, np.arange(1,75))")



In [0]:
# Turn the pool's list of (precision, recall) pairs into an array.
results = np.array(results)



In [0]:
# Prepend the minimum-label column (2..75) to the (precision, recall) rows.
results_with_min = np.column_stack((np.arange(2, 76), results))
# Despite its name, results_invalid keeps the rows that do NOT contain the -1 failure marker.
results_invalid = results_with_min[(results != -1).all(axis=1)]
results_df = pd.DataFrame(
    data=results_invalid,
    columns=('min', 'precision', 'recall'),
).set_index('min')



In [0]:
# Keep only the rows whose 'min' index is below 45.
results_df = results_df.iloc[results_df.index < 45]



In [0]:
# Scatter recall against the minimum number of labels.
plt.figure()
# Alternative series (disabled): precision.
# plt.scatter(results_df.index, results_df['precision'])
plt.scatter(results_df.index, results_df['recall'])
# Alternative series (disabled): harmonic mean of precision and recall.
# plt.scatter(results_df.index, 2/(1/results_df['precision'] + 1/results_df['recall']))
plt.xlabel('min number of labels')
plt.ylabel('recall')



In [0]:
# (precision, recall) at position 44 of the sweep, i.e. min_labels = 45 in np.arange(1, 75).
results[44]



In [0]:
# Echo the get_p_r function object.
get_p_r



In [0]:
# One vertical line per user, its height being the user's number of validated
# labels, ordered from the most to the fewest labels; saved as 'a.svg'.
fig = plt.figure()
labels_descending = np.sort(users['labels_validated'])[::-1]
user_positions = np.arange(len(labels_descending))
plt.vlines(user_positions, 0, labels_descending, color='C0')
plt.axis((0, None, 0, None))
plt.xlabel('user number')
plt.ylabel('number of validated labels')
fig.savefig('a.svg')



In [0]:
# Validated-label counts sorted from largest to smallest.
np.sort(users['labels_validated'])[::-1]


 # DC Classification

 Our attempts at classifying the DC data. They did not go very well (~55% precision/recall using the same features as for Seattle).

In [0]:
# Load the DC feature table and the DC per-user outputs table
# (the 'precision' and 'recall' columns are read from dc_users further down).
dc_features = pd.read_csv('all_users_dc_with_interaction.csv')
dc_users = pd.read_csv('all_users_outputs_dc_with_interaction.csv')



In [0]:
# Use the "Unnamed: 0" column of each table as its index (modifies both frames in place).
dc_features.set_index("Unnamed: 0", inplace=True)
dc_users.set_index("Unnamed: 0", inplace=True)





In [0]:
# Format the first element of `pears` to two decimals.
f'R = {pears[0]:.2f}'



In [0]:
# For every DC feature whose name does not contain 'Tag': scatter it against the
# per-user F1, overlay a degree-1 polynomial fit and print the Pearson correlation.
for x in (name for name in dc_features.columns if 'Tag' not in name):
    feature_values = dc_features[x]
    f1_values = dc_users['f1']

    plt.figure()
    plt.title(x)
    z = np.polyfit(feature_values, f1_values, 1)
    w = np.poly1d(z)
    pears = scipy.stats.pearsonr(feature_values, f1_values)
    plt.scatter(feature_values, f1_values)
    plt.plot(feature_values, w(feature_values))
    print(f'R = {pears[0]:.2f}')



In [0]:
# Per-user F1: the harmonic mean of precision and recall.
dc_users['f1'] = 2 / (1/dc_users['precision'] + 1/dc_users['recall'])



In [0]:
# Boolean target: a DC user is 'good' when their F1 exceeds 0.65.
dc_users['good'] = (dc_users['f1'] > 0.65)



In [0]:
# Median F1 over the DC users.
np.median(dc_users['f1'])



In [0]:
# 5-fold cross-validation of a balanced random forest that predicts the 'good'
# flag of the DC users; collects (predicted, actual) pairs over all folds.
# NOTE(review): dc_features and dc_users are paired by row position here —
# confirm both tables list the users in the same order.
fold_frames = []
for train_index, test_index in KFold(n_splits=5, shuffle=True, random_state=0).split(dc_users.index):
    X_train, X_test = dc_features.iloc[train_index], dc_features.iloc[test_index]
    y_train, y_test = dc_users['good'].iloc[train_index], dc_users['good'].iloc[test_index]

    # Recursive feature elimination was tried and is disabled, so this cell
    # does not define `rfe`:
#     rfe = RFECV(estimator=RandomForestClassifier(n_estimators=10), step=1, cv=StratifiedKFold(5),
#            scoring='f1')
#     rfe.fit(X_train, y_train)
#     clf.fit(X_train.values[:, rfe.support_], y_train)
    clf = BalancedRandomForestClassifier(random_state=0, n_estimators=10)
    clf.fit(X_train.values, y_train)

    fold_frames.append(pd.DataFrame(
        data=np.stack((clf.predict(X_test.values).reshape(-1), y_test.values.reshape(-1)), axis=1),
        columns=('predicted', 'actual'))
    )

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# iteration; concatenate once and renumber the rows instead.
comparisons = pd.concat(fold_frames, ignore_index=True)



In [0]:
# Print precision, recall, accuracy and the confusion matrix for the DC
# predictions, using only the rows where both columns are present.
mask = comparisons[['actual', 'predicted']].notna().all(axis=1)
y_true = comparisons['actual'][mask]
y_pred = comparisons['predicted'][mask]
for metric in (precision_score, recall_score, accuracy_score, confusion_matrix):
    print(metric(y_true, y_pred))



In [0]:
# NOTE(review): the RFE step in the DC cross-validation cell is commented out,
# so `rfe` here is whatever selector an earlier cell left behind — confirm it is
# the selector you mean to inspect.
# support_ is the boolean mask of the selected features, not a count.
print("Selected feature mask : %s" % str(rfe.support_))

# `grid_scores_` was removed in scikit-learn 1.2; newer releases expose the
# mean cross-validation score per step in `cv_results_`. Support both.
if hasattr(rfe, 'cv_results_'):
    cv_scores = rfe.cv_results_['mean_test_score']
else:
    cv_scores = rfe.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()


 # Analysis of number of panos vs. classifier precision

In [0]:
# Load the pickled table 'all_users_lists_new'.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
all_users_lists = pd.read_pickle('all_users_lists_new')



In [0]:
# Convert every entry of the 'Average Pitch' column to a list.
all_users_lists['Average Pitch'] = all_users_lists['Average Pitch'].apply(list)



In [0]:
# Convert every cell of every column to a NumPy array.
all_users_lists = all_users_lists.apply(lambda x: x.apply(np.array))



In [0]:
def generate_features_first_n_panos(all_users, n):
    """Summarise the first ``n`` entries of every cell of ``all_users``.

    Each cell of ``all_users`` is a sequence. For every original column the
    result holds five NaN-ignoring statistics of ``cell[:n]``, in column groups
    prefixed ``std_``, ``mean_``, ``median_``, ``25%ile_`` and ``75%ile_``
    (in that order), joined on the index of ``all_users``.
    """
    statistics = (
        ('std_', np.nanstd),
        ('mean_', np.nanmean),
        ('median_', lambda values: np.nanpercentile(values, 50)),
        ('25%ile_', lambda values: np.nanpercentile(values, 25)),
        ('75%ile_', lambda values: np.nanpercentile(values, 75)),
    )

    def summarise(statistic):
        # Apply `statistic` to the first n entries of every cell.
        return all_users.apply(
            lambda column: column.apply(lambda values: statistic(values[:n]))
        )

    combined = None
    for prefix, statistic in statistics:
        group = summarise(statistic).add_prefix(prefix)
        combined = group if combined is None else combined.join(group)
    return combined



In [0]:
# Summary features over each user's first 50 entries; missing values become 0.
X = generate_features_first_n_panos(all_users, 50).fillna(0)
# Align the accuracy column to the feature rows, then drop users without an accuracy.
y = users['accuracy'].reindex(X.index)
user_mask = ~pd.isna(y)
X = X.loc[user_mask]
# Binary target: 1 when accuracy exceeds 65, else 0.
y = (y.loc[user_mask] > 65).astype(int)



In [0]:
# True if any value in X is missing.
pd.isna(X).any().any()





In [0]:
def run_analysis(n):
    """Cross-validate a "accuracy > 65" classifier built from the first ``n`` panos.

    Features come from ``generate_features_first_n_panos(all_users, n)`` (NaN
    replaced by 0) and are restricted to users whose 'Zoom' entry has at least
    ``n`` elements and who have an accuracy in ``users``. In each of 5 shuffled
    folds, RFECV with a decision tree selects features and a linear SVC is
    trained on the selected columns.

    Relies on the notebook-level names ``all_users`` and ``users``.

    Parameters
    ----------
    n : int
        Number of leading entries of each per-user sequence to use.

    Returns
    -------
    tuple
        ``(precision, recall, confusion_matrix)`` pooled over all folds.
    """
    X = generate_features_first_n_panos(all_users, n).fillna(0)
    # Drop users with fewer than n entries.
    X = X[all_users['Zoom'].apply(len) >= n]
    y = users['accuracy'].reindex(X.index)
    user_mask = ~pd.isna(y)
    X = X.loc[user_mask]
    y = (y.loc[user_mask] > 65).astype(int)

    fold_frames = []
    for train_index, test_index in KFold(n_splits=5, shuffle=True, random_state=0).split(X.index):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Feature selection is fitted on the training fold only.
        rfe = RFECV(estimator=DecisionTreeClassifier(), step=1, cv=StratifiedKFold(5),
               scoring='f1')
        rfe.fit(X_train, y_train)

        clf = SVC(kernel='linear')
        clf.fit(X_train.values[:, rfe.support_], y_train)

        fold_frames.append(pd.DataFrame(
            data=np.stack((clf.predict(X_test.values[:, rfe.support_]).reshape(-1), y_test.values.reshape(-1)), axis=1),
            columns=('predicted', 'actual'))
        )

    # DataFrame.append was removed in pandas 2.0; concatenate the folds once
    # and renumber the rows.
    comparisons = pd.concat(fold_frames, ignore_index=True)

    return (sklearn.metrics.precision_score(comparisons['actual'], comparisons['predicted']),
            sklearn.metrics.recall_score(comparisons['actual'], comparisons['predicted']),
            sklearn.metrics.confusion_matrix(comparisons['actual'], comparisons['predicted']))



In [0]:
# Values of n to sweep: 5, 25, ..., 485.
analysis_range = np.arange(5, 500, 20)



In [0]:
# Run run_analysis for every value in analysis_range on a pool of 4 processes.
with mp.Pool(4) as p:
    analysis_results = p.map(run_analysis, analysis_range)



In [0]:
# Each result is a (precision, recall, confusion_matrix) tuple. np.array() on
# such ragged tuples raises ValueError on NumPy >= 1.24, so fill an object
# array cell by cell; columns 0 and 1 stay the precision and recall scores.
analysis_array = np.empty((len(analysis_results), 3), dtype=object)
for _row, _result in enumerate(analysis_results):
    for _col, _value in enumerate(_result):
        analysis_array[_row, _col] = _value



In [0]:
# Scatter precision (column 0) and recall (column 1) against the number of panos.
plt.figure()
plt.scatter(analysis_range, analysis_array[:, 0], label='precision')
plt.scatter(analysis_range, analysis_array[:, 1], label='recall')
plt.xlabel('number of panos')
plt.legend()
