In [5]:
import collections
import os

import numpy as np
import pandas as pd

In [8]:
def overview(df, file_name):
    """Print a short diagnostic report for ``df`` and drop its duplicate rows.

    The row/column counts, the number of missing cells and the number of
    duplicated rows are all measured before the duplicates are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is modified in place: duplicated rows are dropped.
    file_name : str
        Label used for the dataset in the printed report.

    Returns
    -------
    None
    """
    # Report template; the placeholders are filled positionally below.
    data_check = """
    - The dataset {0} has {1} rows and {2} columns
    - There are {3} null/na values and {4} duplicate rows which were removed. 
    """

    # Dimensions of the frame as it was handed in.
    row_total, col_total = df.shape

    # Missing cells, summed per column and then across columns.
    missing_total = df.isna().sum().sum()

    # Count the rows flagged as repeats first, then remove them from the
    # caller's frame.
    repeated_total = df.duplicated().sum()
    df.drop_duplicates(inplace=True)

    report = data_check.format(file_name, row_total, col_total,
                               missing_total, repeated_total)
    print(report)
    
    
#     print("All duplicated rows were removed")
    
def loading_datasets(fold, file_names):
    """Read several CSV files from one folder and return the cleaned frames.

    For every file, rows without a ``renewvia_id`` are dropped, a stray
    ``'Unnamed: 0'`` column is removed when present, and ``overview`` is
    called on the frame (it prints diagnostics and drops duplicate rows in
    place) before the frame is collected.

    Parameters
    ----------
    fold : str
        Folder that contains the files. A trailing path separator is optional.
    file_names : iterable of str
        File names (or paths relative to ``fold``) to load.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``file_names``, in the same order.
    """
    # Loading the datasets
    datasets = []
    for name in file_names:
        # os.path.join only inserts a separator when `fold` does not already
        # end with one, so 'dir/' and 'dir' both resolve to 'dir/<name>'.
        data = pd.read_csv(os.path.join(fold, name))
        # Remove those with no renewvia_id
        data.dropna(subset=['renewvia_id'], inplace=True)
        # Left behind when a CSV was written together with its index.
        if 'Unnamed: 0' in data.columns:
            data.drop('Unnamed: 0', axis=1, inplace=True)

        overview(data, name)
        datasets.append(data)

    return datasets

In [None]:
# NOTE(review): this cell has no execution count and looks like an unfinished draft.
# `loading_datasets` prepends `fold` to every entry, so both paths below resolve
# under 'datasets_encoded/datasets_clean/...', and the second entry ends with '/'
# and names no file. Confirm the intended folder and file names before running.
# Running it also rebinds `surveys_enc`, which the following cell assigns again.
surveys_enc = loading_datasets(fold= 'datasets_encoded/',
                               file_names = [
                                    'datasets_clean/hs_post_initial_annotated.csv',
                                   'datasets_clean/'
                               ])

In [9]:
# Load the encoded survey files. The order of `file_names` matters: the
# returned list is indexed positionally further down (index 0 is used as the
# pre survey, index 1 as the post survey).
surveys_enc = loading_datasets(
    fold='datasets_encoded/',
    file_names=[
        'hs_subset_pre_survey_encoded.csv',
        'hs_subset_post_survey_encoded.csv',
        'commercial_subset_post_survey_encoded.csv',
    ],
)


    - The dataset hs_subset_pre_survey_encoded.csv has 572 rows and 26 columns
    - There are 3299 null/na values and 0 duplicate rows which were removed. 
    

    - The dataset hs_subset_post_survey_encoded.csv has 2602 rows and 40 columns
    - There are 14720 null/na values and 12 duplicate rows which were removed. 
    

    - The dataset commercial_subset_post_survey_encoded.csv has 470 rows and 15 columns
    - There are 728 null/na values and 3 duplicate rows which were removed. 
    


In [12]:
# Isolate columns present in both pre and post survey for paired testing

# Cast the join key to str on every loaded frame so both sides of the merge
# compare values of the same type.
for data in surveys_enc:
    data['renewvia_id'] = data['renewvia_id'].astype(str)

hs_pre, hs_post = surveys_enc[0], surveys_enc[1]
pre_cols = list(hs_pre.columns)
post_cols = list(hs_post.columns)

# Counter intersection keeps the column names that occur in both surveys.
result = collections.Counter(pre_cols) & collections.Counter(post_cols)
common_cols = list(result.elements())

# Filter the datasets accordingly
df_pre = hs_pre[common_cols]
df_post = hs_post[common_cols]

# Merging the dataset on 'renewvia_id'; every other shared column appears
# twice in the result, suffixed with '_pre' and '_post'.
# NOTE(review): the displayed head shows the same renewvia_id on two rows, so
# ids are not unique in at least one survey and the inner join emits one row
# per matching combination -- confirm this is acceptable for paired testing.
df_paired = df_pre.merge(df_post, on='renewvia_id',
                         how='inner', suffixes=('_pre', '_post'))

# index=False: the default integer index carries no information, and writing
# it would add an unnamed first column that reads back as 'Unnamed: 0'.
df_paired.to_csv("datasets_encoded/hs_subset_paired_data.csv", index=False)
df_paired.head()

162


Unnamed: 0,renewvia_id,avg_household_income_pre,appliances_count_pre,cellphones_count_pre,light_hours_current_pre,kerosene_lamps_count_pre,business_owners_count_pre,business_owners_female_pre,community_lights_pre,home_exterior_lights_pre,...,cooking_fuel_collection_time_post,cooking_energy_cost_post,feel_safe_dark_post,feel_safe_if_exterior_lights_post,water_source_post,water_collection_travel_distance_post,water_collection_time_post,water_cost_post,clinic_travel_distance_post,phone_charge_cost_post
0,141223,1000.0,3.0,2.0,5.0,0.0,1,0,1.0,1.0,...,1.0,2.0,1.0,5.0,3.0,1.0,1.0,3.0,,
1,131268,0.0,2.0,1.0,5.0,0.0,0,0,0.0,0.0,...,1.0,1.0,5.0,5.0,4.0,1.0,1.0,1.0,,
2,131268,0.0,2.0,1.0,5.0,0.0,0,0,0.0,0.0,...,1.0,3.0,1.0,5.0,4.0,2.0,1.0,1.0,2.0,
3,131427,1.0,24.0,3.0,2.0,3.0,0,0,1.0,1.0,...,1.0,1.0,5.0,5.0,4.0,1.0,1.0,1.0,,
4,141140,15000.0,3.0,3.0,5.0,0.0,1,0,1.0,1.0,...,2.0,2.0,1.0,5.0,3.0,1.0,1.0,3.0,,


In [None]:
# Build the paired pre/post frame from the columns both surveys share.
# NOTE(review): this cell repeats the pairing steps of an earlier cell in this
# notebook, with the CSV export disabled; consider keeping a single copy.

# Join key as str on every loaded survey frame.
for data in surveys_enc:
    data['renewvia_id'] = data['renewvia_id'].astype(str)

# The first two loaded frames serve as the pre and post surveys.
hs_pre, hs_post = surveys_enc[0], surveys_enc[1]
pre_cols = hs_pre.columns.tolist()
post_cols = hs_post.columns.tolist()

# Counter intersection -> column names occurring in both surveys.
result = collections.Counter(pre_cols) & collections.Counter(post_cols)

# Restrict each survey to those shared columns.
df_pre, df_post = (frame[list(result.elements())] for frame in (hs_pre, hs_post))

# Inner join on 'renewvia_id'; the remaining shared columns get the
# '_pre' / '_post' suffixes.
df_paired = pd.merge(df_pre, df_post, how='inner', on='renewvia_id',
                     suffixes=('_pre', '_post'))
# Export intentionally left disabled in this cell:
# df_paired.to_csv("datasets_encoded/hs_subset_paired_data.csv")
df_paired.head()

In [None]:
# Pair the pre and post surveys on the columns they have in common.
# NOTE(review): same pairing logic as an earlier cell of this notebook, minus
# the CSV export; one of the copies can probably be removed.

for data in surveys_enc:
    # String keys on every frame, so the merge below compares like with like.
    data['renewvia_id'] = data['renewvia_id'].astype(str)

hs_pre, hs_post = surveys_enc[0], surveys_enc[1]
pre_cols, post_cols = list(hs_pre.columns), list(hs_post.columns)

# Column names present in both surveys.
result = collections.Counter(pre_cols) & collections.Counter(post_cols)

# Keep only the shared columns on each side.
df_pre = hs_pre.loc[:, list(result.elements())]
df_post = hs_post.loc[:, list(result.elements())]

# Inner join keyed on 'renewvia_id', with '_pre' / '_post' suffixes on the
# other shared columns.
df_paired = df_pre.merge(
    right=df_post,
    how='inner',
    on='renewvia_id',
    suffixes=('_pre', '_post'),
)
# Export kept disabled here:
# df_paired.to_csv("datasets_encoded/hs_subset_paired_data.csv")
df_paired.head()