In [1]:
import pandas as pd
import numpy as np

import collections

In [2]:
def overview(df, file_name):
    """Print a short data-quality report for ``df`` and de-duplicate it in place.

    The row/column counts, the total number of null cells and the number of
    duplicated rows are all measured before anything is removed. The
    duplicated rows are then dropped from ``df`` itself, so the caller's
    frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; mutated in place (duplicate rows are dropped).
    file_name
        Label inserted into the printed report to identify the dataset.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    n_rows, n_cols = df.shape

    # Total number of missing cells, summed over every column.
    n_missing = df.isnull().sum().sum()

    # Number of rows flagged by ``duplicated()``, taken before they are dropped.
    n_dupes = int(df.duplicated().sum())
    df.drop_duplicates(inplace=True)

    # Spelled out with explicit newlines so the exact whitespace of the
    # message (including the trailing space after "removed.") stays visible.
    report_template = (
        "\n"
        "    - The dataset {0} has {1} rows and {2} columns\n"
        "    - There are {3} null/na values and {4} duplicate rows which were removed. \n"
        "    "
    )
    print(report_template.format(file_name, n_rows, n_cols, n_missing, n_dupes))
    
    
#     print("All duplicated rows were removed")
    
def loading_datasets(fold, file_names):
    """Read several CSV files from one folder and return them as cleaned frames.

    For every file, rows without a ``renewvia_id`` are dropped, an
    ``'Unnamed: 0'`` column is removed when present, and ``overview`` is
    called on the frame (it prints a summary and drops duplicate rows in
    place).

    Parameters
    ----------
    fold : str or os.PathLike
        Folder that contains the files. A trailing path separator is
        optional: the path is built with ``os.path.join`` rather than by
        string concatenation.
    file_names : iterable of str
        File names relative to ``fold``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``file_names``, in the same order.

    Raises
    ------
    KeyError
        If a file has no ``renewvia_id`` column (raised by ``dropna``).
    """
    # Local import so this cell does not depend on an extra top-level import.
    import os

    datasets = []
    for name in file_names:
        # ``fold + name`` only worked when ``fold`` ended with a separator and
        # failed for pathlib.Path folders; os.path.join handles both and
        # produces the same path when the separator is already there.
        data = pd.read_csv(os.path.join(fold, name))

        # Remove those with no renewvia_id
        data.dropna(subset=['renewvia_id'], inplace=True)

        # Drop the 'Unnamed: 0' column when the file has one.
        if 'Unnamed: 0' in data.columns:
            data.drop('Unnamed: 0', axis=1, inplace=True)

        # Prints the diagnostics and removes duplicate rows from ``data``.
        overview(data, name)
        datasets.append(data)

    return datasets

In [3]:
# Load the encoded survey exports. The order of the list matters: the
# household pre- and post-surveys are picked out by position further down.
surveys_enc = loading_datasets(
    fold='data_encoded/',
    file_names=[
        'household_pre_survey_encoded.csv',
        'household_post_survey_encoded.csv',
        'commercial_post_survey_encoded.csv',
    ],
)


    - The dataset household_pre_survey_encoded.csv has 1562 rows and 37 columns
    - There are 4358 null/na values and 4 duplicate rows which were removed. 
    

    - The dataset household_post_survey_encoded.csv has 2602 rows and 51 columns
    - There are 19427 null/na values and 12 duplicate rows which were removed. 
    

    - The dataset commercial_post_survey_encoded.csv has 470 rows and 20 columns
    - There are 850 null/na values and 3 duplicate rows which were removed. 
    


In [4]:
# Keep only the columns that the household pre- and post-surveys have in
# common, so the two frames can be compared in paired tests.
hs_pre = surveys_enc[0]
hs_post = surveys_enc[1]
pre_cols = hs_pre.columns.tolist()
post_cols = hs_post.columns.tolist()

# Multiset intersection of the two column lists.
result = collections.Counter(pre_cols) & collections.Counter(post_cols)

# Restrict both frames to the shared columns.
df_pre, df_post = (
    frame[list(result.elements())] for frame in (hs_pre, hs_post)
)

# Pair the surveys on 'renewvia_id'; the remaining shared columns get a
# '_pre' / '_post' suffix.
# NOTE(review): an inner merge yields one row per matching pre/post
# combination, so an ID that occurs more than once in either frame appears
# several times in the result (the preview shows 252102 twice) - confirm this
# is intended for paired testing.
df_paired = pd.merge(
    df_pre,
    df_post,
    how='inner',
    on='renewvia_id',
    suffixes=('_pre', '_post'),
)

# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if consumers of the file do not expect it.
df_paired.to_csv("data_encoded/household_paired_data.csv")
df_paired.head()

Unnamed: 0,renewvia_id,avg_monthly_household_income_pre,electronics_count_pre,cellphones_count_pre,light_hours_current_pre,kerosene_lamp_usage_count_pre,cooking_fuel_collection_time_pre,cooking_energy_monthly_cost_pre,feel_safe_dark_pre,community_lights_0_pre,...,water_source_post,clean_drinking_water_source_post,water_collection_travel_distance_post,water_collection_time_post,water_monthly_cost_post,clinic_travel_distance_post,clinic_refrigeration_access_0_post,clinic_refrigeration_access_1_post,business_owners_count_post,business_owners_female_post
0,252102,5000.0,3.0,2.0,4.0,0.0,2.0,2.0,4.0,0.0,...,3.0,,3.0,3.0,2.0,3.0,1.0,0.0,0,0
1,252102,5000.0,3.0,2.0,4.0,0.0,2.0,2.0,4.0,0.0,...,3.0,treated__filtered_water,2.0,1.0,2.0,1.0,1.0,0.0,0,0
2,nd2224,3500.0,2.0,,2.0,3.0,4.0,2.0,1.0,0.0,...,,,,,,,,,0,0
3,161148,8000.0,3.0,,6.0,2.0,2.0,3.0,1.0,0.0,...,2.0,,,,,,,,0,0
4,161078,5000.0,7.0,,7.0,3.0,2.0,3.0,1.0,0.0,...,2.0,,,,,,,,0,0
