In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Data directories, relative to the notebook's working directory.
processed_dir = "../../data/processed"
interim_dir = "../../data/interim"

# Both the input table and the tall summary output live in processed_dir.
data_model_file, output_filename = (
    os.path.join(processed_dir, csv_name)
    for csv_name in (
        "data_model_output.csv",
        "data_model_output_select_tall_summaries.csv",
    )
)

In [3]:
# Load the data-model output. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk. The saved
# output of this cell shows a warning raised at this read_csv call
# (presumably the mixed-type DtypeWarning -- TODO confirm); chunked inference
# can leave one column holding values of several types.
data_model_df = pd.read_csv(data_model_file, low_memory=False)

# Row filter: is_completed == 1, record_type_synthetic == 0,
# initial_etc_check is True, and marketsegment_label == 'PASSENGER'.
keep_rows = (
    (data_model_df['is_completed'] == 1)
    & (data_model_df['record_type_synthetic'] == 0)
    & (data_model_df['initial_etc_check'] == True)  # noqa: E712 -- element-wise comparison, not identity
    & (data_model_df['marketsegment_label'] == 'PASSENGER')
)
df = data_model_df[keep_rows]

  data_model_df = pd.read_csv(data_model_file)


In [4]:
# Keep every column whose name contains the substring "sp_feature".
sp_feature_df = df.filter(axis="columns", like="sp_feature")
sp_feature_df.columns

Index(['sp_feature_early_morning', 'sp_feature_early_morning_label',
       'sp_feature_late_night', 'sp_feature_late_night_label',
       'sp_feature_luggage_rack', 'sp_feature_luggage_rack_label',
       'sp_feature_no_delay', 'sp_feature_no_delay_label',
       'sp_feature_seats_transit', 'sp_feature_seats_transit_label',
       'sp_feature_seats_transit_stop', 'sp_feature_seats_transit_stop_label',
       'sp_feature_short_wait', 'sp_feature_short_wait_label',
       'sp_feature_weekend_frequency', 'sp_feature_weekend_frequency_label'],
      dtype='object')

In [5]:
# Keep the "reasons_no_transit" columns. The full prefix is used (rather than
# the looser substring "reason") so the selection matches exactly the columns
# that the melt cell classifies and strips by that same prefix; any unrelated
# column whose name merely contains "reason" is no longer pulled in.
reason_df = df.filter(like="reasons_no_transit", axis=1)
reason_df.columns

Index(['reasons_no_transit_dislike_crowded_trains_buses',
       'reasons_no_transit_dislike_public_transport',
       'reasons_no_transit_dislike_public_transport_with_luggage',
       'reasons_no_transit_does_not_run_when_needed',
       'reasons_no_transit_dont_know_how', 'reasons_no_transit_list',
       'reasons_no_transit_no_good_options',
       'reasons_no_transit_not_convenient',
       'reasons_no_transit_not_economical', 'reasons_no_transit_not_flexible',
       'reasons_no_transit_not_reliable', 'reasons_no_transit_not_safe',
       'reasons_no_transit_other', 'reasons_no_transit_prefer_other_mode',
       'reasons_no_transit_ride_too_long', 'reasons_no_transit_stop_too_far',
       'reasons_no_transit_too_complicated',
       'reasons_no_transit_too_many_transfers',
       'reasons_no_transit_too_much_walking_stairs',
       'reasons_no_transit_wait_too_long'],
      dtype='object')

In [6]:
# unique_id followed by every column whose name contains "weight",
# built in a single expression so the name is assigned only once.
weight_df = pd.concat(
    [df["unique_id"], df.filter(like="weight", axis="columns")],
    axis="columns",
)
weight_df.columns

Index(['unique_id', 'weight_departing_and_arriving', 'weight_departing_only',
       'weight_departing_only_model_respondents',
       'weight_non_sas_departing_only',
       'weight_departing_only_with_time_of_day'],
      dtype='object')

In [7]:
# Wide working frame: the id column, then the reason columns, then the
# sp_feature columns, aligned on the shared row index.
working_parts = [df[["unique_id"]], reason_df, sp_feature_df]
working_df = pd.concat(working_parts, axis="columns")
working_df.head()

Unnamed: 0,unique_id,reasons_no_transit_dislike_crowded_trains_buses,reasons_no_transit_dislike_public_transport,reasons_no_transit_dislike_public_transport_with_luggage,reasons_no_transit_does_not_run_when_needed,reasons_no_transit_dont_know_how,reasons_no_transit_list,reasons_no_transit_no_good_options,reasons_no_transit_not_convenient,reasons_no_transit_not_economical,...,sp_feature_no_delay,sp_feature_no_delay_label,sp_feature_seats_transit,sp_feature_seats_transit_label,sp_feature_seats_transit_stop,sp_feature_seats_transit_stop_label,sp_feature_short_wait,sp_feature_short_wait_label,sp_feature_weekend_frequency,sp_feature_weekend_frequency_label
756,757,False,False,False,,False,not convenient,False,True,False,...,,,,,,,,,,
757,758,,,,,,,,,,...,,,,,,,,,,
758,759,,,,,,,,,,...,,,,,,,,,,
759,760,,,,,,,,,,...,,,,,,,,,,
760,761,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Wide -> tall: one row per (unique_id, original column name) pair.
melted = pd.melt(working_df, id_vars=["unique_id"], var_name="question", value_name="answer")

# Classify each row from its original column name. The sp_feature test is the
# outer one so it takes precedence, matching the original order of overwrites.
raw_names = melted["question"]
is_reason = raw_names.str.contains("reasons_no_transit")
is_sp_feature = raw_names.str.contains("sp_feature")
melted["question_type"] = np.where(
    is_sp_feature,
    "SP Feature",
    np.where(is_reason, "Reasons No Transit", "Missing"),
)

# Human-readable question label: drop the prefix, underscores -> spaces, title case.
melted["question"] = (
    raw_names
    .str.replace("reasons_no_transit_", "")
    .str.replace("sp_feature_", "")
    .str.replace("_", " ")
    .str.title()
)

# Discard rows with no answer and renumber the index from zero.
tall_df = melted.dropna(subset=["answer"]).reset_index(drop=True)
tall_df.head()

Unnamed: 0,unique_id,question,answer,question_type
0,757,Dislike Crowded Trains Buses,False,Reasons No Transit
1,765,Dislike Crowded Trains Buses,False,Reasons No Transit
2,766,Dislike Crowded Trains Buses,False,Reasons No Transit
3,767,Dislike Crowded Trains Buses,False,Reasons No Transit
4,768,Dislike Crowded Trains Buses,False,Reasons No Transit


In [9]:
# Attach the weight columns to every tall row by unique_id.
# validate="many_to_one" makes pandas raise MergeError if unique_id is
# duplicated in weight_df; without the check, duplicate keys would silently
# multiply answer rows and inflate every weighted sum computed downstream.
output_df = pd.merge(
    tall_df,
    weight_df,
    on="unique_id",
    how="left",
    validate="many_to_one",
)
output_df.head()

Unnamed: 0,unique_id,question,answer,question_type,weight_departing_and_arriving,weight_departing_only,weight_departing_only_model_respondents,weight_non_sas_departing_only,weight_departing_only_with_time_of_day
0,757,Dislike Crowded Trains Buses,False,Reasons No Transit,3.651478,,,,
1,765,Dislike Crowded Trains Buses,False,Reasons No Transit,2.385938,,,,
2,766,Dislike Crowded Trains Buses,False,Reasons No Transit,2.385938,,,,
3,767,Dislike Crowded Trains Buses,False,Reasons No Transit,1.910884,,,,
4,768,Dislike Crowded Trains Buses,False,Reasons No Transit,1.910884,,,,


In [10]:
# Write the weighted tall table to disk; index=False leaves the row index out of the CSV.
output_df.to_csv(path_or_buf=output_filename, index=False)

In [11]:
# Sum of weight_departing_only per (question_type, question) row,
# with one column per distinct answer value.
pd.pivot_table(
    output_df,
    values='weight_departing_only',
    index=['question_type', 'question'],
    columns=['answer'],
    aggfunc='sum',
)

Unnamed: 0_level_0,answer,False,True,2.0,3.0,4.0,5.0,Already had refugee van waiting,Because I had a package deal,Better option,Can't go on base,...,"too many transfers, stop too far","too many transfers, too much walking stairs",too much walking stairs,"too much walking stairs, dislike public transport, prefer other mode",wait too long,"wait too long, dislike crowded trains buses, dislike public transport","wait too long, dislike public transport with luggage","wait too long, does not run when needed","wait too long, not economical","wait too long, prefer other mode"
question_type,question,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Reasons No Transit,Dislike Crowded Trains Buses,9790.062352,162.825966,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Dislike Public Transport,9762.774101,190.114217,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Dislike Public Transport With Luggage,9762.176494,190.711824,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Does Not Run When Needed,8119.37462,193.605601,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Dont Know How,8126.141835,2309.467891,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,List,,,,,,,,,,,...,9.18452,1.790733,1.790733,1.790733,174.466562,3.206725,5.603054,1.790733,3.581466,5.3722
Reasons No Transit,No Good Options,10107.508235,328.101491,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Not Convenient,4513.408339,5922.201387,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Not Economical,10270.363457,165.246269,,,,,,,,,...,,,,,,,,,,
Reasons No Transit,Not Flexible,10057.413191,378.196535,,,,,,,,,...,,,,,,,,,,


In [12]:
# Long-form summary: sum of weight_departing_only for each
# (question_type, question, answer) combination, flattened back to columns.
# NOTE(review): margins_name='answer' is the label pandas uses for the totals
# entry when margins=True -- confirm that reusing the column name is intended.
(
    output_df
    .pivot_table(
        values='weight_departing_only',
        index=['question_type', 'question', 'answer'],
        aggfunc='sum',
        margins=True,
        margins_name='answer',
    )
    .reset_index()
)

Unnamed: 0,question_type,question,answer,weight_departing_only
0,Reasons No Transit,Dislike Crowded Trains Buses,False,9790.062352
1,Reasons No Transit,Dislike Crowded Trains Buses,True,162.825966
2,Reasons No Transit,Dislike Public Transport,False,9762.774101
3,Reasons No Transit,Dislike Public Transport,True,190.114217
4,Reasons No Transit,Dislike Public Transport With Luggage,False,9762.176494
...,...,...,...,...
506,SP Feature,Weekend Frequency Label,IMPORTANT,458.841844
507,SP Feature,Weekend Frequency Label,NOT_IMPORTANT,55.176398
508,SP Feature,Weekend Frequency Label,SLIGHTLY_IMPORTANT,104.347100
509,SP Feature,Weekend Frequency Label,VERY_IMPORTANT,493.501510
