In [35]:
import pandas as pd
import numpy as np
import json
import collections

In [36]:
# Customer ids and their tariffs, pulled from SQL and saved as a CSV export.
# The survey data is later left-joined to this table on `customerAccountNumber`.
tariffs = pd.read_csv("customers_tariffs.csv")

In [30]:
# Short dataset key -> workbook file name inside `datasets_raw/`.
file_names = dict(
    initial_commcare="Commcare INITIAL Survey updated w Renewvia Numbers - Copy.xlsx",
    initial_msform="MS Form - INITIAL SURVEY Updated w meter numbers 230216.xlsx",
    ci_post_commcare="Commcare - POST Connection - COMMERCIAL & INSTITUTION Connectin Impact Survey.xlsx",
    ci_post_msform="MS Form - POST Connection - COMMERCIAL & INSTITUTION Connection Impact Survey.xlsx",
    hs_post_commcare="Commcare POST Connection HOUSEHOLD.xlsx",
    hs_post_msform="MS Form - POST Connection HOUSEHOLD.xlsx",
)

# Read every workbook once and keep the raw frames addressable by their key.
datasets = {}
for name, file in file_names.items():
    datasets[name] = pd.read_excel(f"datasets_raw/{file}")

In [41]:
# Build the {original survey header -> short column name} lookup from
# `col_rename.json`.
with open('col_rename.json', 'r') as f:
    col_rename_map = json.load(f)

names = col_rename_map['col_rename']
n_col = len(names)

name_mapping = {}
for i, entry in enumerate(names):
    # 'orignal_name' (sic) is the key as it is spelled in the JSON file.
    old, new = entry['orignal_name'], entry['new_name']
    name_mapping[old] = new

In [60]:
# Load the per-column answer translations from `col_answers_mapping.json` and
# print one {formatted answer -> original answer} dict per configured column.
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['mapping']
n_col = len(answers)

map_list = []
for i, answer in enumerate(answers):
    name = answer['col_new_name']
    original, formatted = answer['original_answers'], answer['formatted_answers']
    # Pair the two lists positionally: formatted value -> original value.
    indv_map = dict(zip(formatted, original))
    map_list.append(indv_map)

print(map_list)

[{}, {}, {}, {'yes': 'Yes', 'no': 'No', 'not_sure': 'Not Sure'}, {'months': '1-3 months', 'copy-1-of-months': '3-6 months', 'copy-2-of-months': '6-12 months', 'copy-3-of-months': '12-24 months', 'copy-4-of-months': '24-36 months', 'over_36_months': 'over 36 months'}, {}, {}, {'nigeria': 'Nigeria', 'kenya': 'Kenya'}, {'akipelai': 'Akipelai', 'oloibiri': 'Oloibiri', 'ozuzu': 'Ozuzu', 'opu': 'Opu', 'balep': 'Balep', 'bendeghe-afi': 'Bendeghe-Afi', 'ekong_anaku': 'Ekong Anaku', 'emeroke': 'Emeroke'}, {'kalobeyei_settlement': 'Kalobeyei Settlement', 'kalobeyei_town': 'Kalobeyei Town', 'ndeda': 'Ndeda', 'ngurunit': 'Ngurunit', 'ringiti': 'Ringiti', 'oyamo': 'Oyamo', 'olkiramatian': 'Olkiramatian', 'lomekwi': 'Lomekwi', 'katiko': 'Katiko', 'locheremoit': 'Locheremoit', 'kangitankori': 'Kangitankori', 'kapelbok': 'Kapelbok', 'lorengelup': 'Lorengelup', 'nakukulas': 'Nakukulas'}, {}, {}, {'male': 'Male', 'female': 'Female'}, {'yes': 'Yes', 'no': 'No'}, {}, {'no_it_is_the_same': 'No, it is the s

In [53]:
# Clean the household post-connection CommCare survey: rename the headers to
# their short names, blank out placeholder account numbers, parse the
# timestamps and translate coded answers using the answer mapping.
df = datasets["hs_post_commcare"]

# Some headers contain non-breaking spaces ('\xa0'). Look each header up in
# `name_mapping` by its cleaned form (falling back to the raw form), but key
# the rename dict on the header exactly as it appears in the frame:
# `DataFrame.rename` silently ignores keys that are not existing labels, so
# keying on the cleaned text would skip precisely the columns that needed it.
cols = list(df.columns)
cols_clean = [col.replace('\xa0', '') for col in cols]
df_rename_dict = dict()
for raw, clean in zip(cols, cols_clean):
    new_name = name_mapping.get(clean, name_mapping.get(raw))
    if new_name is not None:
        df_rename_dict[raw] = new_name

df = df.rename(columns=df_rename_dict)

# Placeholder strings that stand for "no account number given".
missing = ['         ', '    ', '   ',
           '******', 'months', 'nan']
# Cast to str first so a float NaN becomes the literal 'nan' listed above.
# The result is assigned back instead of using `inplace=True` on `df[...]`,
# which acts on a temporary under pandas Copy-on-Write and leaves `df` as is.
df['renewvia_id'] = df['renewvia_id'].apply(str).replace(missing, np.nan)
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])
df['end_date'] = df['end'].dt.date

# Iterate the mapping entries directly rather than via the global `n_col`,
# which is also assigned by the column-rename cell and therefore depends on
# which cell happened to run last.
for answer in answers:
    name = answer['col_new_name']
    if name not in df.columns:
        continue
    vals_original = answer['original_answers']
    vals_formatted = answer['formatted_answers']
    df[name] = df[name].replace(vals_formatted, vals_original)

In [54]:
# Show the distinct values of every column that has fewer than 15 of them,
# as a quick check on the categorical answers.
for col in df.columns:
    cats = df[col].unique()
    if len(cats) >= 15:
        continue
    print(col, cats)

interviewed_before [nan 'No' 'Yes' 'Not Sure']
connection_period [nan '12-24 months' '6-12 months' '24-36 months' '3-6 months' '1-3 months']
country [nan 'Nigeria' 'Kenya']
nigeria_community [nan 'Ozuzu' 'Emeroke' 'Ekong Anaku' 'Bendeghe-Afi' 'Opu' 'Balep'
 'Oloibiri']
kenya_community [nan 'Kalobeyei Settlement']
gender [nan 'Male' 'Female']
Has your (or your spouse's) occupation changed since you've been connected to minigrid power? [nan 'no' 'yes']
Has your household income changed since your  connection to minigrid power? [nan 'yes_it_has_decreased' 'yes_it_has_increased' 'no_it_is_the_same']
household_headcount_change [nan 'No, it has stayed the same' "Yes, it's increased"
 "Yes, it's decreased"]
female_schooling_change [nan "No, it's the same" "Yes, it's increased" "Yes, it's decreased"]
male_schooling_change [nan "No, it's the same" "Yes, it's increased" "Yes, it's decreased"]
school_performance_change [nan "Yes, it's gotten better" "No, it's the same"
 "Yes, it's gotten worse"]


In [None]:
# For each raw dataset, pick whichever of the first two candidate ID column
# names it contains; stop at the first dataset that has neither.
# NOTE(review): `id_col_names` is not defined in the visible cells, and this
# loop rebinds the notebook-level `df` and `col` -- confirm both are intended.
for df in datasets.values():
    col = ''
    cols = list(df.columns)
    if id_col_names[0] not in cols and id_col_names[1] not in cols:
        print("Couldn't find the Renewvia Account number under the following column names:")
        print(id_col_names)
        break
    col = id_col_names[0] if id_col_names[0] in cols else id_col_names[1]
      
    
    # df.rename()

In [58]:
# Join the initial CommCare survey to the SQL tariff table on the Renewvia
# account number (left join: every survey row is kept).
status = ['Pre-Connection', 'Post-Connection']
initial_commcare = datasets['initial_commcare']
df_merged = initial_commcare.merge(tariffs,
                                   left_on='What is your Renewvia Minigrid account number?',
                                   right_on='customerAccountNumber', how='left')

# Keep the pre-connection respondents under their own name: the filter used
# to be a bare expression whose result was discarded. The comparison is
# case-insensitive because the exported answers are lower-case
# ('pre-connection') while `status` holds the title-case labels.
connection_status = df_merged['Select your Minigrid Connection Status'].astype(str).str.lower()
df_merged_pre = df_merged[connection_status == status[0].lower()]

df_merged.head()
# initial_msform
# # Importing the commercial & institution post connection
# ci_post_commcare
# ci_post_msform
# # Importing the household post connection
# hs_post_commcare
# hs_post_msform

Unnamed: 0,Start Time,Completed time,Select your Minigrid Connection Status,"If Post-Connection, select how long your have been using the mingrid",What is your Renewvia Minigrid account number?,First Name,Last Name,Country,Nigeria Community,Kenya Community,...,How far must you travel to obtain your water supply?,How long does the water collection process take on a daily basis?,Who is mainly responsible for water collection on a daily basis?,What is the average age of the person in charge of water collection?,How much do you pay per month for water?,Does your household spend time doing any of the following? Select all that apply,How close is the nearest Health Center / Clinic,Does your Health Center have access to electricity?,What hours is the Health Center / Clinic open?,Does your Health Center / Clinic have access to refrigeration?
0,2021-09-28 20:21:53,2021-09-28 20:25:03,pre-connection,,501121.0,cgf,fggg,nigeria,akipelai,,...,,,,,,,,,,
1,2021-10-24 09:49:35,2021-10-24 09:55:40,pre-connection,3-6_months,570063.0,Agbor,Egbeyip,nigeria,balep,,...,less_than_1_km,less_than_1_hour,child_male,,nkes,other_food_processing,less_than_1_km,no,12 hours,no
2,2021-10-24 11:09:10,2021-10-24 11:16:09,pre-connection,3-6_months,570028.0,Agbor,Obi,nigeria,balep,,...,less_than_1_km,,child_male,or_older,i_dont_pay_its_free,processing_ugaligari,less_than_1_km,no,12 hours,no
3,2021-10-24 11:16:50,2021-10-24 11:22:41,pre-connection,3-6_months,,Agbor,Obi,nigeria,balep,,...,less_than_1_km,less_than_1_hour,adult_female,or_older,i_dont_pay_its_free,other_food_processing,less_than_1_km,no,12 hours,no
4,2021-10-25 19:43:41,2021-10-25 19:50:12,pre-connection,3-6_months,570097.0,Agbor,Eki,nigeria,balep,,...,km,less_than_1_hour,child_female,copy-2-of-years_old,i_dont_pay_its_free,processing_palm_oil,less_than_1_km,no,12 hours,no


In [80]:
# Inventory of column headers across all raw datasets: total count vs. number
# of distinct headers, then the distinct headers in sorted order.
# NOTE(review): the loop variable rebinds the notebook-level `df`.
all_cols = []
for df in datasets.values():
    all_cols += list(df.columns)

unique_cols = list(set(all_cols))
n_total, n_unique = len(all_cols), len(unique_cols)
print(n_total, n_unique)

cols_sorted = sorted(unique_cols)
cols_sorted

389 212


['Age',
 'Approximately how many hours a day do you use kerosene lamps?',
 'Are any household members business owners?',
 'Are you the original person assigned to this Renewvia ID?\xa0',
 'Are you the primary provider for your household?',
 'Are you the primary provider of Household?',
 'Completed Time',
 'Completed time',
 'Completion time',
 'Country',
 'Current Source(s) of Power (after mini-grid connection) Select all that apply',
 'Current Source(s) of Power (before mini-grid connection) Select all that apply',
 'Current Source(s) of Power (select all that apply)',
 'Do you feel that having access to minigrid power has made you more safe? Please explain',
 'Do you feel that having access to minigrid power has made you more safe? Please explain->',
 'Do you feel that you have better access to health services because of connection to minigrid?',
 'Do you have a source for clean drinking water?',
 'Do you have any other sources of electricity other than Renewvia Minigrid?',
 'Does th

In [74]:
# Headers of the merged frame that also occur in the cross-dataset inventory
# (multiset intersection, so order follows the merged frame's columns).
cols_merged_list = list(df_merged.columns)
merged_counts = collections.Counter(cols_merged_list)
inventory_counts = collections.Counter(cols_sorted)
result = merged_counts & inventory_counts
list(result.elements())

['Start time',
 'Completion time',
 'Select your Minigrid Connection Status',
 'If Post-Connection, select how long your have been using the mingrid',
 'What is your Renewvia Minigrid account number?',
 'What is your Kenya or Nigeria ID Number?',
 'First Name',
 'Last Name',
 'Country',
 'Nigeria Community',
 'Kenya Community',
 'Location/Neighborhood Name',
 'Age',
 'Gender',
 'What is your occupation?',
 'Are you the primary provider for your household?',
 'If you are NOT the Primary Provider of Household, what is Occupation of Primary Provider?',
 'If you selected "other" in the question above, please explain: (if not, please skip)',
 'Occupation of Secondary Income Provider of Household',
 'If you selected "other" in the question above, please explain: (if not, please skip)2',
 'Type of employment for Primary Provider',
 'What is your average monthly household income:',
 'How many people live in your household, including yourself?',
 'How many adults?',
 'How many female children?'

In [51]:
# Rows of the merged frame whose `col` value is 'Commercial'.
# NOTE(review): `col` is whatever an earlier cell left bound -- presumably the
# tariff column; confirm and name it explicitly.
commercial_mask = df_merged[col] == 'Commercial'
df_merged[commercial_mask]

Unnamed: 0,Start time,Completion time,Select your Minigrid Connection Status,"If Post-Connection, select how long your have been using the mingrid",What is your Renewvia Minigrid account number?,What is your Kenya or Nigeria ID Number?,First Name,Last Name,Country,Nigeria Community,...,Who is mainly responsible for water collection on a daily basis?,What is the average age of the person in charge of water collection?,How much do you pay per month for water?,Does your household spend time doing any of the following? Select all that apply,How close is the nearest Health Center/Clinic?,Does your Health Center have access to electricity?,What hours is the Health Center / Clinic open?,Does your Health Center / Clinic have access to refrigeration?,customerAccountNumber,tariff


In [23]:
# Compare the distinct values of `df[col]` with the customer account numbers
# in the SQL tariff table: print how many distinct values the survey column
# has, then how many of them also occur in `tariffs`.
unique_ids = df[col].unique()
print(len(unique_ids))
initial_ids = list(unique_ids)
ids_sql_tariffs = list(tariffs['customerAccountNumber'].unique())

survey_counts = collections.Counter(initial_ids)
tariff_counts = collections.Counter(ids_sql_tariffs)
result = survey_counts & tariff_counts

# Size of the overlap between the two ID lists.
print(sum(result.values()))
# df_pre = hs_pre[list(result.elements())]
# df_post = hs_post[list(result.elements())]

327
321
