### This notebook computes the relative importances from a conjoint analysis of how combinations of project tags, at two cost levels (low and high), contribute to the budget utilization achieved by Equal Shares (MES) and Utilitarian Greedy (UG)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import csv

In [None]:
# Notebook display settings: show every column, cap the rows shown at 100, never truncate cell text
pd.options.display.max_columns = None  # or 1000
pd.options.display.max_rows = 100  # or 1000
pd.options.display.max_colwidth = None  # or 199

# render floats with six decimal places
pd.set_option('display.float_format', '{:.6f}'.format)

In [None]:
# Read the election metadata CSV (semicolon-delimited) and drop exact duplicate rows
pbsummary_df = pd.read_csv('../metadata.csv', delimiter=';').drop_duplicates()

# Rows without a subunit get the placeholder value 'all'.
# The filled column is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and does
# not update the parent frame once Copy-on-Write is enabled.
pbsummary_df['subunit'] = pbsummary_df['subunit'].fillna('all')

pbsummary_df

In [None]:
# Keep only the election id and its vote type, so approval and score votings can be segregated later
pbsummary_with_vote_type = pbsummary_df.loc[:, ['election_id', 'vote_type']]
print(pbsummary_with_vote_type.head())

# number of elections per vote type
vote_type_counts = pbsummary_with_vote_type['vote_type'].value_counts()
print(vote_type_counts)

In [None]:
# Load the projects CSV (semicolon-delimited) into a dataframe and discard exact duplicate rows
pbprojects_df = pd.read_csv('../projects.csv', delimiter=';')
pbprojects_df = pbprojects_df.drop_duplicates()
print(pbprojects_df.shape)

# Attach each election's vote_type; the inner join keeps only projects whose election_id is present in the metadata subset
pbprojects_df = pbprojects_df.merge(pbsummary_with_vote_type, on='election_id', how='inner')
print(pbprojects_df.shape)

pbprojects_df.head()

In [None]:
# Citywide (unit-level) elections have no subunit; label those rows with the subunit value 'all'.
# The filled column is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and does
# not update the parent frame once Copy-on-Write is enabled.
pbprojects_df['subunit'] = pbprojects_df['subunit'].fillna('all')

# Cross-check: number of projects whose subunit is now 'all'
print(pbprojects_df[pbprojects_df['subunit'] == 'all'].shape)
print("Valid PB projects are: ", pbprojects_df.shape)

In [None]:
def _aggregate_per_election(frame, value_col, how, new_name):
    """Aggregate ``value_col`` per election_id with ``how`` and return it in a column called ``new_name``."""
    per_election = frame.groupby(['election_id'])[value_col].agg(how).reset_index()
    return per_election.rename(columns={value_col: new_name})


# we want to select only those instances for which there are category tags
valid_pbprojects_df = pbprojects_df[pbprojects_df['category'].notna()]

# Every aggregate below is merged back with an inner join, so an election that has
# no row in an aggregate (e.g. no winner among its tagged projects) is dropped.

# total cost of all tagged projects of the election
total_election_projects_cost = _aggregate_per_election(valid_pbprojects_df, 'cost', 'sum', 'total_election_projects_cost')
valid_pbprojects_df = valid_pbprojects_df.merge(total_election_projects_cost, on='election_id', how='inner')

# total cost of the MES winners of the election
_mes_winner_rows = valid_pbprojects_df[valid_pbprojects_df['is_mes_winner'] == True]
total_mes_winners_cost = _aggregate_per_election(_mes_winner_rows, 'cost', 'sum', 'total_mes_winners_cost')
valid_pbprojects_df = valid_pbprojects_df.merge(total_mes_winners_cost, on='election_id', how='inner')

# total cost of the UG winners of the election
_ug_winner_rows = valid_pbprojects_df[valid_pbprojects_df['is_greedy_winner'] == True]
total_ug_winners_cost = _aggregate_per_election(_ug_winner_rows, 'cost', 'sum', 'total_ug_winners_cost')
valid_pbprojects_df = valid_pbprojects_df.merge(total_ug_winners_cost, on='election_id', how='inner')

# number of tagged projects of the election
num_projects = _aggregate_per_election(valid_pbprojects_df, 'project_id', 'count', 'num_projects')
valid_pbprojects_df = valid_pbprojects_df.merge(num_projects, on='election_id', how='inner')

# number of MES winners of the election (winner rows re-selected from the current frame)
_mes_winner_rows = valid_pbprojects_df[valid_pbprojects_df['is_mes_winner'] == True]
num_mes_winners = _aggregate_per_election(_mes_winner_rows, 'project_id', 'count', 'num_mes_winners')
valid_pbprojects_df = valid_pbprojects_df.merge(num_mes_winners, on='election_id', how='inner')

# number of UG winners of the election (winner rows re-selected from the current frame)
_ug_winner_rows = valid_pbprojects_df[valid_pbprojects_df['is_greedy_winner'] == True]
num_ug_winners = _aggregate_per_election(_ug_winner_rows, 'project_id', 'count', 'num_ug_winners')
valid_pbprojects_df = valid_pbprojects_df.merge(num_ug_winners, on='election_id', how='inner')

In [None]:
# Spot check: show the project rows of a single election (election_id 332)
valid_pbprojects_df.loc[valid_pbprojects_df['election_id'] == 332]

In [None]:
## Label every project as low or high cost relative to the cost margin of its election
# cost_margin is the 0.5 quantile (the median) of the project costs within the election
q2_df = valid_pbprojects_df.groupby(['election_id'])['cost'].quantile(0.5).reset_index()
q2_df = q2_df.rename(columns={'cost': 'cost_margin'})
valid_pbprojects_df = valid_pbprojects_df.merge(q2_df, on='election_id', how='inner')


# Two complementary 0/1 label columns, computed column-wise instead of with a
# row-by-row iterrows() loop. A project at or below the margin is low cost; every
# other row (including one where the comparison is not true, e.g. a missing cost)
# is high cost, exactly as the loop's if/else decided.
is_low_cost = valid_pbprojects_df['cost'] <= valid_pbprojects_df['cost_margin']
valid_pbprojects_df['low_cost'] = is_low_cost.astype(int)
valid_pbprojects_df['high_cost'] = (~is_low_cost).astype(int)

In [None]:
# Display the current state of the project table for inspection
valid_pbprojects_df

In [None]:
# dependent variables, built the same way for both aggregation rules
# each pair is (prefix of the new columns, tag used in the winners' aggregate column names)
for _rule, _tag in (('mes', 'mes'), ('greedy', 'ug')):
    _winners_cost = 1.000000 * valid_pbprojects_df[f'total_{_tag}_winners_cost']
    # winners' cost as a share of the election budget
    valid_pbprojects_df[f'{_rule}_budget_utilization'] = _winners_cost / valid_pbprojects_df['total_budget']
    # winners' cost as a share of the cost of all (tagged) projects
    valid_pbprojects_df[f'{_rule}_cost_representation'] = _winners_cost / valid_pbprojects_df['total_election_projects_cost']
    # winners as a share of the number of (tagged) projects
    valid_pbprojects_df[f'{_rule}_winning_rate'] = 1.000000 * valid_pbprojects_df[f'num_{_tag}_winners'] / valid_pbprojects_df['num_projects']

In [None]:
# Function to normalise the category tags of one row
def unifyCategories(row):
    """Return the row's category tags as a sorted, de-duplicated, comma-joined string.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose
            'category' entry is a comma-separated string of tags.

    Returns:
        The distinct tags in sorted order, joined by ','. Whitespace around
        each tag is stripped and empty tags are dropped, so 'welfare, culture'
        and 'culture,welfare' both give 'culture,welfare'.
    """
    tags = {tag.strip() for tag in row['category'].split(',')}
    tags.discard('')  # left behind by stray or trailing commas
    return ','.join(sorted(tags))

# The same tag combination can be recorded in different orders (e.g. "culture,welfare" and "welfare,culture");
# the new column sorted_categories stores each project's distinct tags in one canonical order
valid_pbprojects_df['sorted_categories'] = valid_pbprojects_df.apply(unifyCategories, axis=1)

# how often each canonical tag combination occurs
distinct_category_counts = valid_pbprojects_df['sorted_categories'].value_counts()

print(valid_pbprojects_df.columns)

print(valid_pbprojects_df.shape)

In [None]:
## Tag combinations whose presence in the winning outcome of MES or UG is examined
## the top five combinations are selected such that all kinds of projects are covered
# the list itself is fetched from a different notebook; please contact if details required
top_five_categories_set = [
    'culture,education',
    'culture,education,welfare',
    'environmental protection,public space,urban greenery',
    'health,public space,sport',
    'public space,public transit and roads',
]

print(top_five_categories_set)

In [None]:
## Indicator columns for winners nested by tag combination and cost level
# three way check - the project is a winner; the project has exactly the tag combination;
# the project lies on the given side (low / high) of its election's cost margin
_combo_tags = (
    ('culture_education', 'culture,education'),
    ('env_pspace_ugreen', 'environmental protection,public space,urban greenery'),
    ('health_pspace_sport', 'health,public space,sport'),
    ('pspace_ptransit', 'public space,public transit and roads'),
)

# MES columns are created first, then the UG ones; per combination the low cost column precedes the high cost one
for _rule in ('mes', 'greedy'):
    for _short_name, _tags in _combo_tags:
        for _level in ('low_cost', 'high_cost'):
            _is_winner = valid_pbprojects_df[f'is_{_rule}_winner'] == True
            _has_combo = valid_pbprojects_df['sorted_categories'] == _tags
            _at_level = valid_pbprojects_df[_level] == 1
            valid_pbprojects_df[f'is_{_rule}_winner_combo_{_short_name}_and_{_level}'] = (_is_winner & _has_combo & _at_level).astype(int)


print(valid_pbprojects_df.shape) # 16 new columns added; 10863 x 54

In [None]:
def _flag_values_per_election(frame, flag_col, set_col):
    """Collect, per election_id, the set of values found in ``flag_col`` and return it in a column called ``set_col``."""
    grouped = frame.groupby(['election_id'])[flag_col].agg(set).reset_index()
    return grouped.rename(columns={flag_col: set_col})


# For MES: for each of the 8 (tag combination, cost level) indicators, gather the set of
# indicator values seen in the election and merge it back, adding 8 set-valued columns
mes_winner_combo_culture_education_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_culture_education_and_low_cost', 'mes_combo_culture_education_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_culture_education_and_low_cost_set, on='election_id', how='inner')

mes_winner_combo_culture_education_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_culture_education_and_high_cost', 'mes_combo_culture_education_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_culture_education_and_high_cost_set, on='election_id', how='inner')
##
mes_winner_combo_env_pspace_ugreen_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_env_pspace_ugreen_and_low_cost', 'mes_combo_env_pspace_ugreen_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_env_pspace_ugreen_and_low_cost_set, on='election_id', how='inner')

mes_winner_combo_env_pspace_ugreen_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_env_pspace_ugreen_and_high_cost', 'mes_combo_env_pspace_ugreen_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_env_pspace_ugreen_and_high_cost_set, on='election_id', how='inner')
##
mes_winner_combo_health_pspace_sport_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_health_pspace_sport_and_low_cost', 'mes_combo_health_pspace_sport_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_health_pspace_sport_and_low_cost_set, on='election_id', how='inner')

mes_winner_combo_health_pspace_sport_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_health_pspace_sport_and_high_cost', 'mes_combo_health_pspace_sport_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_health_pspace_sport_and_high_cost_set, on='election_id', how='inner')
##
mes_winner_combo_pspace_ptransit_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_pspace_ptransit_and_low_cost', 'mes_combo_pspace_ptransit_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_pspace_ptransit_and_low_cost_set, on='election_id', how='inner')

mes_winner_combo_pspace_ptransit_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_mes_winner_combo_pspace_ptransit_and_high_cost', 'mes_combo_pspace_ptransit_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(mes_winner_combo_pspace_ptransit_and_high_cost_set, on='election_id', how='inner')

print(valid_pbprojects_df.shape) # 8 new columns added for MES winners combo with quartile set; 10863 x 62

In [None]:
def _flag_values_per_election(frame, flag_col, set_col):
    """Collect, per election_id, the set of values found in ``flag_col`` and return it in a column called ``set_col``."""
    grouped = frame.groupby(['election_id'])[flag_col].agg(set).reset_index()
    return grouped.rename(columns={flag_col: set_col})


# For UG: for each of the 8 (tag combination, cost level) indicators, gather the set of
# indicator values seen in the election and merge it back, adding 8 set-valued columns
greedy_winner_combo_culture_education_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_culture_education_and_low_cost', 'greedy_combo_culture_education_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_culture_education_and_low_cost_set, on='election_id', how='inner')

greedy_winner_combo_culture_education_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_culture_education_and_high_cost', 'greedy_combo_culture_education_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_culture_education_and_high_cost_set, on='election_id', how='inner')
##
greedy_winner_combo_env_pspace_ugreen_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_env_pspace_ugreen_and_low_cost', 'greedy_combo_env_pspace_ugreen_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_env_pspace_ugreen_and_low_cost_set, on='election_id', how='inner')

greedy_winner_combo_env_pspace_ugreen_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_env_pspace_ugreen_and_high_cost', 'greedy_combo_env_pspace_ugreen_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_env_pspace_ugreen_and_high_cost_set, on='election_id', how='inner')
##
greedy_winner_combo_health_pspace_sport_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_health_pspace_sport_and_low_cost', 'greedy_combo_health_pspace_sport_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_health_pspace_sport_and_low_cost_set, on='election_id', how='inner')

greedy_winner_combo_health_pspace_sport_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_health_pspace_sport_and_high_cost', 'greedy_combo_health_pspace_sport_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_health_pspace_sport_and_high_cost_set, on='election_id', how='inner')
##
greedy_winner_combo_pspace_ptransit_and_low_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_pspace_ptransit_and_low_cost', 'greedy_combo_pspace_ptransit_low_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_pspace_ptransit_and_low_cost_set, on='election_id', how='inner')

greedy_winner_combo_pspace_ptransit_and_high_cost_set = _flag_values_per_election(
    valid_pbprojects_df, 'is_greedy_winner_combo_pspace_ptransit_and_high_cost', 'greedy_combo_pspace_ptransit_high_cost_set')
valid_pbprojects_df = valid_pbprojects_df.merge(greedy_winner_combo_pspace_ptransit_and_high_cost_set, on='election_id', how='inner')

print(valid_pbprojects_df.shape) # 8 new columns added for UG winners combo with quartile set; 10863 x 70

In [None]:
## Election-level independent variables: 8 for MES and 8 for UG (16 in total)
# Each variable is 1 when the value 1 occurs in the matching per-election *_set column,
# i.e. when at least one project of the election is flagged for that rule, tag
# combination and cost level, and 0 otherwise.
# The flags are derived column-wise with Series.map instead of zero-initialising the
# 16 columns and patching them row by row through iterrows() / .at.
_combo_level_names = [
    'culture_education_low_cost', 'culture_education_high_cost',
    'env_pspace_ugreen_low_cost', 'env_pspace_ugreen_high_cost',
    'health_pspace_sport_low_cost', 'health_pspace_sport_high_cost',
    'pspace_ptransit_low_cost', 'pspace_ptransit_high_cost',
]

# MES columns are created first, then the UG ones (same column order as before)
for _rule in ('mes', 'greedy'):
    for _name in _combo_level_names:
        _flag_col = f'{_rule}_combo_{_name}'
        _contains_one = valid_pbprojects_df[f'{_flag_col}_set'].map(lambda values: 1 in values)
        valid_pbprojects_df[_flag_col] = _contains_one.astype(int)

print(valid_pbprojects_df.shape) ## 16 new columns added (8 per rule)

## OLS Fitting for Conjoint Analysis

### Preparing dataset for MES and UG

In [None]:
## Fitting should be done separately for MES and UG, even if we are predicting the same type of variable;
## because the number of rows for which the independent variables have values differs between the two

_combo_level_names = [
    'culture_education_low_cost', 'culture_education_high_cost',
    'env_pspace_ugreen_low_cost', 'env_pspace_ugreen_high_cost',
    'health_pspace_sport_low_cost', 'health_pspace_sport_high_cost',
    'pspace_ptransit_low_cost', 'pspace_ptransit_high_cost',
]
_mes_combo_cols = [f'mes_combo_{name}' for name in _combo_level_names]  ## independent variables
_greedy_combo_cols = [f'greedy_combo_{name}' for name in _combo_level_names]  ## independent variables

## column layout: election_id, the 8 independent variables, then the 3 dependent variables
mes_conjoint_dataset = valid_pbprojects_df[
    ['election_id'] + _mes_combo_cols
    + ['mes_budget_utilization', 'mes_winning_rate', 'mes_cost_representation']
]

greedy_conjoint_dataset = valid_pbprojects_df[
    ['election_id'] + _greedy_combo_cols
    + ['greedy_budget_utilization', 'greedy_winning_rate', 'greedy_cost_representation']
]

print(mes_conjoint_dataset.shape) ## should be 10863 x 12
print(greedy_conjoint_dataset.shape) ## 10863 x 12

## we want to take in only those elections in which at least one of the combos yields a result
## (at least one of the 8 flags equals 1), collapsed to the distinct rows that remain.
## drop_duplicates() returns a new frame here; calling it with inplace=True on the
## filtered selection mutates a derived frame, which pandas may flag as setting on a copy.
_mes_has_combo = (mes_conjoint_dataset[_mes_combo_cols] == 1).any(axis=1)
mes_conjoint_dataset = mes_conjoint_dataset[_mes_has_combo].drop_duplicates()

_greedy_has_combo = (greedy_conjoint_dataset[_greedy_combo_cols] == 1).any(axis=1)
greedy_conjoint_dataset = greedy_conjoint_dataset[_greedy_has_combo].drop_duplicates()

print(mes_conjoint_dataset.shape) ## should be 322 x 12
print(greedy_conjoint_dataset.shape) ## should be 316 x 12



In [None]:
# Preview the first rows of the MES conjoint dataset
mes_conjoint_dataset.head()

In [None]:
# Preview the first rows of the UG conjoint dataset
greedy_conjoint_dataset.head()

### Residuals Summary For Dependent-Independent variables by MES

In [None]:
# setting this column to zero, to avoid the issues of collinearity; the column stays in the
# feature list below, so each fit still reports eight coefficients.
# assign() returns a new frame, so nothing is written into a frame that was derived by filtering.
# NOTE(review): with statsmodels' default pinv-based fit an all-zero regressor should end up
# with a coefficient of 0 - confirm in the printed summaries.
mes_conjoint_dataset = mes_conjoint_dataset.assign(mes_combo_health_pspace_sport_high_cost=0)

mes_feature_columns = [ 'mes_combo_culture_education_low_cost', 'mes_combo_culture_education_high_cost',
                        'mes_combo_env_pspace_ugreen_low_cost', 'mes_combo_env_pspace_ugreen_high_cost', 
                        'mes_combo_health_pspace_sport_low_cost', 'mes_combo_health_pspace_sport_high_cost',
                        'mes_combo_pspace_ptransit_low_cost', 'mes_combo_pspace_ptransit_high_cost'
                    ]

## independent variables set (no constant column is added)
X_features = mes_conjoint_dataset[mes_feature_columns]

# dependent variables
mes_cr = mes_conjoint_dataset['mes_cost_representation']
mes_bu = mes_conjoint_dataset['mes_budget_utilization']
mes_wr = mes_conjoint_dataset['mes_winning_rate']

# OLS fitting for each dependent variable for MES aggregation.
# OLS has no `family` argument (that option belongs to GLM); the Binomial family that
# used to be passed here was not used by the fit, so it is no longer passed.
mes_cr_res = sm.OLS(mes_cr, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on MES Cost Representation ++++++++++++++++++++++++++")
print(mes_cr_res.summary())

mes_bu_res = sm.OLS(mes_bu, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on MES Budget Utilization ++++++++++++++++++++++++++")
print(mes_bu_res.summary())

mes_wr_res = sm.OLS(mes_wr, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on MES Winning Rate ++++++++++++++++++++++++++")
print(mes_wr_res.summary())

### Residuals Summary for Dependent-Independent Variables by UG

In [None]:
# The health / public space / sport high-cost column is set to zero here as well; it stays in
# the feature list below, so each fit still reports eight coefficients.
# assign() returns a new frame, so nothing is written into a frame that was derived by filtering.
# NOTE(review): with statsmodels' default pinv-based fit an all-zero regressor should end up
# with a coefficient of 0 - confirm in the printed summaries.
greedy_conjoint_dataset = greedy_conjoint_dataset.assign(greedy_combo_health_pspace_sport_high_cost=0)
greedy_feature_columns = [ 'greedy_combo_culture_education_low_cost', 'greedy_combo_culture_education_high_cost',
                        'greedy_combo_env_pspace_ugreen_low_cost', 'greedy_combo_env_pspace_ugreen_high_cost', 
                        'greedy_combo_health_pspace_sport_low_cost', 'greedy_combo_health_pspace_sport_high_cost', 
                        'greedy_combo_pspace_ptransit_low_cost', 'greedy_combo_pspace_ptransit_high_cost'
                    ]

## independent variables set (no constant column is added)
X_features = greedy_conjoint_dataset[greedy_feature_columns]

# dependent variables
greedy_cr = greedy_conjoint_dataset['greedy_cost_representation']
greedy_bu = greedy_conjoint_dataset['greedy_budget_utilization']
greedy_wr = greedy_conjoint_dataset['greedy_winning_rate']

# OLS fitting for each dependent variable for UG aggregation.
# OLS has no `family` argument (that option belongs to GLM); the Binomial family that
# used to be passed here was not used by the fit, so it is no longer passed.
greedy_cr_res = sm.OLS(greedy_cr, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on UG Cost Representation ++++++++++++++++++++++++++")
print(greedy_cr_res.summary())

greedy_bu_res = sm.OLS(greedy_bu, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on UG Budget Utilization ++++++++++++++++++++++++++")
print(greedy_bu_res.summary())

greedy_wr_res = sm.OLS(greedy_wr, X_features).fit()
print("++++++++++++++++++++++++++ Regression Analysis on UG Winning Rate ++++++++++++++++++++++++++")
print(greedy_wr_res.summary())

In [None]:
# Store metrics, parameters, r-square values and other params of all six fits together in a single dataframe:
# one row per (fit, coefficient)
res_arr = [mes_cr_res, mes_bu_res, mes_wr_res, greedy_cr_res, greedy_bu_res, greedy_wr_res]
cj_columns = ['metric_name', 'metric', 'rsquare_value', 'result_type', 'num_instances', 'var_name', 'pval', 'param_coeff', 'range', 'avg_coeff']

# Rows are gathered in a list and the frame is built once at the end; growing an empty
# frame with .loc[len(df)] copies data on every insert and leaves every column as object dtype.
cj_rows = []

for res in res_arr:
    metric_name = res.model.endog_names
    rsquare_value = res.rsquared
    result_type = "mes" if str(metric_name).startswith("mes") else "greedy"
    num_instances = res.nobs
    prefix = f'{result_type}_'
    # removing the prefix of mes or greedy in front of the metric name; maxsplit=1 keeps
    # the whole remainder even if the prefix text shows up again later in the name
    metric = metric_name.split(prefix, 1)[1]
    params_names_set = res.params.keys()
    param_w_values_set = res.params.values
    p_values_set = res.pvalues

    # some secondary variables used for calculation of relative importances
    min_param_value = np.min(param_w_values_set)
    max_param_value = np.max(param_w_values_set)
    param_range = max_param_value - min_param_value
    avg_param_value = np.average(param_w_values_set)

    assert ((len(params_names_set) == len(param_w_values_set)) and (len(param_w_values_set) == len(p_values_set))), "Unequal length of parameters dataset values"

    for param_name, param_w_value, p_value in zip(params_names_set, param_w_values_set, p_values_set):
        param_name = param_name.split(prefix, 1)[1]
        row = {'metric_name': metric_name, 'metric': metric, 'rsquare_value': rsquare_value, 'result_type': result_type, 'num_instances': num_instances, 'var_name': param_name, 'pval': p_value, 'param_coeff': param_w_value, 'range': param_range, 'avg_coeff': avg_param_value}
        cj_rows.append(row)

# passing columns= keeps the column layout fixed even when no rows were collected
cj_dataset = pd.DataFrame(cj_rows, columns=cj_columns)

cj_dataset

In [None]:
# flag the coefficients whose p-value is below 0.05
cj_dataset['is_sig_95'] = cj_dataset['pval'] < 0.05

# relative importance: the coefficient's offset from the average coefficient of its fit,
# expressed as a percentage of that fit's coefficient range
_coeff_offset = cj_dataset['param_coeff'] - cj_dataset['avg_coeff']
cj_dataset['relative_importance'] = 100 * _coeff_offset / cj_dataset['range']

cj_dataset

### For the line plot, we only consider the budget utilization metric, since it has the better fit for both UG and ES and all of its p-values are significant

In [None]:
# keep only the coefficient rows that belong to the budget utilization fits
budget_utilization_df = cj_dataset.loc[cj_dataset['metric'] == 'budget_utilization']

In [None]:
# One x tick per tag combination: (combination name as it appears in var_name, tick label)
combo_labels = (
    ('combo_culture_education', 'Culture\nEducation'),
    ('combo_env_pspace_ugreen', 'Env. Protection\nPublic Space\nUrban Greenery'),
    ('combo_health_pspace_sport', 'Health\nPublic Space\nSport'),
    ('combo_pspace_ptransit', 'Public Space\nPublic Transit'),
)
xlabels = [label for _, label in combo_labels]
plt.figure(figsize=(10, 8))


def _relative_importances(result_type, cost_level):
    """Return the relative importances of one rule at one cost level, ordered like xlabels.

    Values are looked up by var_name instead of by row position (even rows = low,
    odd rows = high), so the plot no longer depends on the order of the rows in
    budget_utilization_df. A missing var_name raises KeyError.
    """
    rule_rows = budget_utilization_df[budget_utilization_df['result_type'] == result_type]
    by_var_name = rule_rows.set_index('var_name')['relative_importance']
    return [by_var_name[f'{combo}_{cost_level}_cost'] for combo, _ in combo_labels]


y_mes_low = _relative_importances('mes', 'low')
y_mes_high = _relative_importances('mes', 'high')
y_ug_low = _relative_importances('greedy', 'low')
y_ug_high = _relative_importances('greedy', 'high')

# Plotting the data
plt.plot(xlabels, y_mes_low, marker='*', linewidth=3, markersize=18, color='#5e3c99', label='ES Low Cost', alpha=0.8)
plt.plot(xlabels, y_mes_high, marker='*', linewidth=3, markersize=18, color='#b2abd2', label='ES High Cost', alpha=0.8)
plt.plot(xlabels, y_ug_low, marker='o', linewidth=3, markersize=12, color='#e66101', label='UG Low Cost', alpha=0.8)
plt.plot(xlabels, y_ug_high, marker='o', linewidth=3, markersize=12, color='#fdb863', label='UG High Cost', alpha=0.8)

# Adding axis labels
plt.xlabel('Frequent Combination of Impact Areas', fontsize=16, fontweight='bold', labelpad=12)
plt.ylabel('Relative Importance (%)', fontsize=16, fontweight='bold')

plt.gca().xaxis.set_tick_params(pad=8)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(axis='both', which='major', color='gray', alpha=0.1)


# Customize legend
handles, labels = plt.gca().get_legend_handles_labels()

# Frameless legend in a single row, anchored at the top right of the axes
plt.legend(handles, labels, loc='upper right', bbox_to_anchor=(1.01, 1.06), ncols=8, frameon=False)

# Displaying the plot

plt.tight_layout()
plt.show()
