In [32]:
import pandas as pd
from IPython.display import display, Markdown
import re
import json
import itertools
import difflib
from slugify import slugify

In [33]:
### Indicator-categorisation data copied from the front-end collection:
### https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal/ExploratorySurveyIndicatorsCollection.js
###
### Each entry describes one indicator with the keys 'id', 'name', 'category',
### 'defaultChart' and 'visible', plus the optional keys 'isFullWidth' and
### 'preferedOrder' (spelled as in the upstream file; a list of answer labels).
### Note that 'visible' and 'isFullWidth' hold the string 'true', not a Python bool.
list_vars = [{ 'id': 'geographic_area', 'name': 'Geographic Area', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Rural', 'Urban'] },
             { 'id': 'age', 'name': 'Age', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['18-24', '25-34', '35-44', '45-54', '55+'] },
             { 'id': 'i2i_Marital_Status', 'name': 'Marital Status', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true' },
             { 'id': 'household_size', 'name': 'Mean household size', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Above mean size', 'Below mean size', 'Mean'] },
            { 'id': 'i2i_Education', 'name': 'Level of education', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'isFullWidth': 'true', 'preferedOrder': ['Higher education', 'Primary education', 'Secondary education', 'No formal education', 'Other'] },
            { 'id': 'main_lan', 'name': 'Main language', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'isFullWidth': 'true' },
            { 'id': 'english_literacy', 'name': 'English literacy', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Excellent', 'Fair', 'Good', 'Not at all', 'Poorly'] }, 
            { 'id': 'interview_lan', 'name': 'Language of interview', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true' },
            { 'id': 'poverty_line', 'name': 'Poverty line', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Above poverty line', 'Below poverty line'] },
            { 'id': 'migrant_work', 'name': 'Migrant worker identity', 'category': 'categories.DEMOGRAPHICS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Me', 'No-one, all live together', 'Other family', 'Spouse'] },
            { 'id': 'land_decission', 'name': 'Land decision maker', 'category': 'categories.OWNERSHIP_AND_CONTROL', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Decide together with others', 'Family/Friends without me', 'Me only'] },
            { 'id': 'land_owner', 'name': 'Land ownership', 'category': 'categories.OWNERSHIP_AND_CONTROL', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Dont own', 'Own by self', 'Own with others'] },
            { 'id': 'own_phone', 'name': 'Own a phone', 'category': 'categories.OWNERSHIP_AND_CONTROL', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Dont own phone', 'Own phone'] },
            { 'id': 'who_phone', 'name': 'Phone ownership', 'category': 'categories.OWNERSHIP_AND_CONTROL', 'defaultChart': 'grouped bar', 'visible': 'true' },
            { 'id': 'phone_use', 'name': 'Independent phone use', 'category': 'categories.OWNERSHIP_AND_CONTROL', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Dont use mobile phone', 'No', 'Yes', 'Somewhat'] },
            { 'id': 'saving_goal_deccision', 'name': 'Savings goal decision maker', 'category': 'categories.AGENCY_AND_DECISION_MAKING', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Decide together', 'No savings goal', 'Self alone', 'Spouse/Family/Others on behalf'] },
            { 'id': 'saving_goal_influence', 'name': 'Savings goal influence', 'category': 'categories.AGENCY_AND_DECISION_MAKING', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Always accepted', 'Always rejected', 'Dont consult', 'No savings goal', 'Sometimes accepted/Rejected'] },
            { 'id': 'spending_decission', 'name': 'Spending decision maker', 'category': 'categories.AGENCY_AND_DECISION_MAKING', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Fully on own', 'Help from someone else', 'Someone else decides'] },
            { 'id': 'raise_gni', 'name': 'Possibility to raise 1/20 GNI (Gross National Income)', 'category': 'categories.INCOME', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Not at all possible', 'Not very possible', 'Somewhat possible', 'Very possible'] },
            { 'id': 'freq_inconme_spouse', 'name': 'Income frequency of spouse', 'category': 'categories.INCOME', 'defaultChart': 'grouped bar', 'visible': 'true', 'isFullWidth': 'true', 'preferedOrder': ['Daily', 'Monthly', 'Weekly/Biweekly', 'Dont work', 'Don’t know', 'No spouse', 'Work unpaid', 'When they get work'] }, 
            { 'id': 'main_income_earned', 'name': 'Main income earner', 'category': 'categories.INCOME', 'defaultChart': 'grouped bar', 'visible': 'true' },
            { 'id': 'earning_freq', 'name': 'Earning frequency', 'category': 'categories.INCOME', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Daily', 'I dont work', 'Monthly', 'Weekly/Biweekly', 'When i get work', 'Work unpaid'] },
            { 'id': 'allow_spouse_work', 'name': 'Allow spouse to work', 'category': 'categories.GENDER_NORMS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['No', 'Yes', 'Not married', 'Spouse already works for money'] },
            { 'id': 'permission_work', 'name': 'Permission to work', 'category': 'categories.GENDER_NORMS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Already working', 'No', 'Yes'] },
            { 'id': 'children_decission', 'name': 'Children decision maker', 'category': 'categories.GENDER_NORMS', 'defaultChart': 'grouped bar', 'visible': 'true', 'isFullWidth': 'true', 'preferedOrder': ['Decide alone', 'Decide together', 'Not in relationship', 'Spouse decides', 'We do not talk about it'] },
            { 'id': 'bank_permission', 'name': 'Permission to open bank', 'category': 'categories.GENDER_NORMS', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['No', 'Yes'] },
            { 'id': 'phone_use_financial', 'name': 'Informal financial usage', 'category': 'categories.FINANCIAL_INC', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Yes', 'No'] },
            { 'id': 'mobile_money', 'name': 'Mobile money', 'category': 'categories.FINANCIAL_INC', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Dont use mm', 'Dont use mobile phone', 'Own', 'Share/Use others'] },
            { 'id': 'bank', 'name': 'Bank', 'category': 'categories.FINANCIAL_INC', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Other', 'Other fam', 'Own', 'Spouse', 'Unbanked'] },
             { 'id': 'account_money_taken', 'name': 'Money taken from account', 'category': 'categories.FINANCIAL_INC', 'defaultChart': 'grouped bar', 'visible': 'true', 'preferedOrder': ['Dont have account', 'Money not taken', 'Money taken'] }
            ]

In [34]:
# API indicator catalogue (indicator id -> display name), copied from:
# https://github.com/Vizzuality/i2i-api/blob/feature/mobile-surveys-endpoint/app/src/data/ms-indicators.json
api_ind = {
    "geographic_area": "Geographic Area",
    "gender": "Gender",
    "age": "Age",
    "access_to_resources": "Access to Resources",
    "dwelling_type": "Dwelling type: roof/dwelling",
    "i2i_Marital_Status": "Marital Status",
    "i2i_Education": "Level of education",
    "i2i_Income_Sources": "Sources of income",
    "toilet_type": "Sanitation type",
    "cooking_energy_type": "Cooking energy source",
    "electricity_access_type": "Electricity access",
    "usd_per_day": "USD per day",
    "poverty_line": "Poverty line",
    "mobile_money": "Mobile Money",
    "water_source_type": "Water source type",
    "english_literacy": "English literacy",
    "interview_lan": "Language of interview",
    "own_phone": "Own a phone",
    "who_phone": "Phone ownership",
    "phone_use": "Independent phone use",
    "phone_use_financial": "Informal financial usage",
    "main_lan": "Main language",
    "household_size": "Mean household size",
    "bank": "Bank",
    "bank_permission": "Permission to open bank",
    "account_money_taken": "Money taken from account",
    "saving_goal_deccision": "Savings goal decision maker",
    "saving_goal_influence": "Savings goal influence",
    "land_owner": "Land ownership",
    "land_decission": "Land decision maker",
    "children_decission": "Children decision maker",
    "earning_freq": "Earning frequency",
    "spending_decission": "Spending decision maker",
    "permission_work": "Permission to work",
    "main_income_earned": "Main income earner",
    "migrant_work": "Migrant worker identity",
    "raise_gni": "Possibility to raise 1/20 GNI (Gross National Income)",
    "freq_inconme_spouse": "Income frequency of spouse",
    "allow_spouse_work": "Allow spouse to work",
    "region": "Province / Region",
    "lang_literacy": "Read & write in any language",
    "division": "Division",
    "swahili_literacy": "Swahili literacy",
}

In [35]:
# useJson_t (indicator id -> display name) as copied from the upload_mobile_data notebook.
#
# The copied JSON listed "mobile_money" twice with the same value; json.loads keeps
# only one entry per key, so the redundant second occurrence has been dropped here.
#
# NOTE(review): this table uses the key "language_literacy", while api_ind and the
# useJson column mapping in this notebook use "lang_literacy" for the same label.
# Left unchanged because it mirrors the source notebook; confirm which id is intended.
useJson_t = json.loads("""
{
    "geographic_area": "Geographic Area",
    "gender": "Gender",
    "age": "Age",
    "access_to_resources": "Access to Resources",
    "dwelling_type": "Dwelling type: roof/dwelling",
    "i2i_Marital_Status": "Marital Status",
    "i2i_Education": "Level of education",
    "i2i_Income_Sources": "Sources of income",
    "toilet_type": "Sanitation type",
    "cooking_energy_type": "Cooking energy source",
    "electricity_access_type": "Electricity access",
    "usd_per_day": "USD per day",
    "poverty_line": "Poverty line",
    "mobile_money": "Mobile Money",
    "water_source_type": "Water source type",
    "english_literacy": "English literacy",
    "interview_lan": "Language of interview",
    "own_phone": "Own a phone",
    "who_phone": "Phone ownership",
    "phone_use": "Independent phone use",
    "phone_use_financial": "Informal financial usage",
    "main_lan": "Main language",
    "household_size": "Mean household size",
    "bank": "Bank",
    "bank_permission": "Permission to open bank",
    "account_money_taken": "Money taken from account",
    "saving_goal_deccision": "Savings goal decision maker",
    "saving_goal_influence": "Savings goal influence",
    "land_owner": "Land ownership",
    "land_decission": "Land decision maker",
    "children_decission": "Children decision maker",
    "earning_freq": "Earning frequency",
    "spending_decission": "Spending decision maker",
    "permission_work": "Permission to work",
    "main_income_earned": "Main income earner",
    "migrant_work": "Migrant worker identity",
    "raise_gni": "Possibility to raise 1/20 GNI (Gross National Income)",
    "freq_inconme_spouse": "Income frequency of spouse",
    "allow_spouse_work": "Allow spouse to work",
    "region": "Province / Region",
    "language_literacy": "Read & write in any language",
    "division": "Division",
    "swahili_literacy": "Swahili literacy"
}
""")

In [36]:
# useJson mapping, taken from the upload_mobile_data notebook.
#
# This is used to generate the matching json to upload a file: the column names
# should match with each object. Every key under "indicators" is a dataset column
# name; its value names the indicator id, the child indicator id (always None
# here) and the answer id that the column is uploaded as.
#
# Each row below is (column name, indicatorId, answerId).
useJson = {
    "weightColumn": "finalweight",
    "indicators": {
        columnName: {
            "indicatorId": indicatorId,
            "childIndicatorId": None,
            "answerId": answerId,
        }
        for columnName, indicatorId, answerId in (
            ("Urb_Rur", "geographic_area", None),
            ("Urbanicity", "geographic_area", None),
            ("Province", "jurisdiction", None),
            ("M_D3", "gender", None),
            ("Gender", "gender", None),
            ("i2i_Age", "age", None),
            ("Age Group", "age", None),
            ("i2i_Marital_Status", "i2i_Marital_Status", None),
            ("Relationship status", "i2i_Marital_Status", None),
            ("i2i_Education", "i2i_Education", None),
            ("Education", "i2i_Education", None),
            ("English literacy", "english_literacy", None),
            ("Language of interview", "interview_lan", None),
            ("Main language", "main_lan", None),
            ("Mean household size", "household_size", None),
            ("Own a phone", "own_phone", None),
            ("Phone ownership", "who_phone", None),
            ("Independent phone use", "phone_use", None),
            ("Informal financial usage", "phone_use_financial", None),
            ("i2i_Source_of_Income", "i2i_Income_Sources", None),
            ("Water_source_type", "water_source_type", None),
            ("Toilet_type", "toilet_type", None),
            ("Cooking_energy", "cooking_energy_type", None),
            ("Electricity_access", "electricity_access_type", None),
            ("USD_per_day", "usd_per_day", None),
            ("Poverty_line", "poverty_line", None),
            ("$2.50 PPP Poverty line", "poverty_line", None),
            ("Land ownership", "land_owner", None),
            ("Land decision maker", "land_decission", None),
            ("Children decision maker", "children_decission", None),
            ("Earning frequency", "earning_freq", None),
            ("Spending decision maker", "spending_decission", None),
            ("Permission to work", "permission_work", None),
            ("Main income earner", "main_income_earned", None),
            ("Migrant worker identity", "migrant_work", None),
            ("Possibility to raise 1/20 GNI (Gross National Income)", "raise_gni", None),
            ("Income frequency of spouse", "freq_inconme_spouse", None),
            ("Allow spouse to work", "allow_spouse_work", None),
            ("Bank", "bank", 1),
            ("Permission to open bank", "bank_permission", 1),
            ("Money taken from account", "account_money_taken", 1),
            ("Savings goal decision maker", "saving_goal_deccision", 1),
            ("Savings goal influence", "saving_goal_influence", 1),
            ("Banked", "fas_strand", 1),
            ("Other_formal", "fas_strand", 2),
            ("Informal", "fas_strand", 3),
            ("fas_access", "fas_strand", 4),
            ("Saving_B", "savings_strand", 1),
            ("Saving_F", "savings_strand", 2),
            ("Saving_Inf", "savings_strand", 3),
            ("Saving_AH", "savings_strand", 4),
            ("saving_access", "savings_strand", 5),
            ("Remittances_B", "remittances_strand", 1),
            ("Remittances_F", "remittances_strand", 2),
            ("Remittances_Inf", "remittances_strand", 3),
            ("Remittances_FF", "remittances_strand", 4),
            ("remittances_access", "remittances_strand", 5),
            ("Insurance_F", "insurance_strand", 1),
            ("Insurance_Inf", "insurance_strand", 2),
            ("Insurance_B", "insurance_strand", 3),
            ("insurance_access", "insurance_strand", 4),
            ("Credit_B", "credit_strand", 1),
            ("Credit_F", "credit_strand", 2),
            ("Credit_Inf", "credit_strand", 3),
            ("Credit_FF", "credit_strand", 4),
            ("credit_access", "credit_strand", 5),
            ("Weight_Ind", "weight_ind", None),
            ("FAS", "total_fas_strand", 1),
            ("Saving_Strand", "total_saving_strand", 2),
            ("Remittances_Strand", "total_remittances_strand", 3),
            ("Insurance_Strand", "total_insurance_strand", 4),
            ("Credit_Strand", "total_credit_strand", 5),
            ("Mobile_money", "mobile_money", None),
            ("Mobile Money", "mobile_money", None),
            ("Marital Status", "i2i_Marital_Status", None),
            ("Geographic Area", "geographic_area", None),
            ("Age", "age", None),
            ("Level of education", "i2i_Education", None),
            ("Poverty line", "poverty_line", None),
            ("Province / Region", "region", None),
            ("Read & write in any language", "lang_literacy", None),
            ("Division", "division", None),
            ("Swahili literacy", "swahili_literacy", None),
        )
    },
}

In [37]:
def capitalize(matchobj):
    """
    Return the whole text of a regex match in upper case.

    Meant to be passed as the replacement callable of a regex substitution,
    so that every matched fragment is replaced by its upper-cased form.
    """
    matched_text = matchobj.group(0)
    return matched_text.upper()

def _cleanStringColumn(values):
    """
    Apply the preliminary text clean-up to one string column and return the result.

    Steps, in order: trim whitespace, trim commas (and trailing backslashes),
    lower-case then capitalise the first letter, apply the regex rewrites,
    and finally apply the literal rewrites.
    """
    regex_rules = [
        (re.compile(r'( years)$', flags=re.IGNORECASE), ''),
        (re.compile(r'(55)$', flags=re.IGNORECASE), '55+'),
        (re.compile(r'(Urbain)', flags=re.IGNORECASE), 'Urban'),
        (re.compile(r'(Above 60)', flags=re.IGNORECASE), '>60'),
        (re.compile(r'(Under 18)', flags=re.IGNORECASE), '<18'),
        # Upper-case the letter that follows a slash ("a/b" -> "a/B").
        (re.compile(r'/[a-z]', flags=re.IGNORECASE), capitalize),
        (re.compile(r'( i )'), ' I '),
    ]
    literal_rules = [
        ('dont', 'don’t'),
        ('Dont', 'Don’t'),
        ('Informally', 'Informal'),
    ]

    cleaned = values.str.strip() \
                    .str.strip(',') \
                    .str.rstrip('\\,') \
                    .str.lower() \
                    .str.capitalize()
    # regex=True is mandatory for compiled patterns: since pandas 2.0 the default
    # is regex=False, which rejects compiled patterns and callable replacements.
    for pattern, replacement in regex_rules:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    for old, new in literal_rules:
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned


def checkDataset(datasetPath, iso, year, kargs=None, clean=True):
    """
    Open a dataset, display some initial checks and optionally clean its text columns.

    The checks shown are: the first rows, a describe() summary, and for every
    column its dtype and (when it has fewer than 50 distinct values) its unique
    values before and after cleaning.

    Parameters
    ----------
    datasetPath : str
        Path to a ``.csv`` or ``.xlsx`` file; the extension is matched
        case-insensitively.
    iso : str
        Country code; it is lower-cased and joined with ``year`` to label the dataset.
    year : str or int
        Survey year used in the label.
    kargs : dict, optional
        Keyword arguments forwarded to the pandas reader. Defaults to
        ``{'decimal': ',', 'sep': ';'}``. ``sep`` / ``delimiter`` are not
        forwarded to ``pd.read_excel`` because it does not accept them.
    clean : bool
        When True, string columns are normalised in place in the loaded frame.

    Returns
    -------
    list
        ``[dataset, dataset_columns, label]`` where ``dataset`` is the loaded
        DataFrame, ``dataset_columns`` the set of its column names and
        ``label`` is ``f'{iso.lower()}{year}'``.

    Raises
    ------
    ValueError
        If the file extension is neither ``csv`` nor ``xlsx``.
    """
    if kargs is None:
        kargs = {'decimal': ',', 'sep': ';'}

    extension = str(datasetPath).rsplit('.', 1)[-1].lower()
    if extension == 'xlsx':
        excel_kargs = {key: value for key, value in kargs.items()
                       if key not in ('sep', 'delimiter')}
        dataset = pd.read_excel(datasetPath, **excel_kargs) \
                .convert_dtypes()
    elif extension == 'csv':
        # low_memory=False stays the default but may be overridden through kargs.
        dataset = pd.read_csv(datasetPath, **{'low_memory': False, **kargs}) \
                .convert_dtypes()
    else:
        raise ValueError('format not allowed')

    dataset_columns = set(dataset.columns.sort_values(ascending=True).values)
    display(Markdown(f'## {iso.lower()}{year}'))
    display(dataset.head(2))
    display(dataset.describe(include='all').transpose())
    display(Markdown(f'### Columns descriptions'))
    for col in dataset.columns.values:
        display(Markdown(f'#### **column:** {col} ----- **dtype**: {dataset[col].dtype}'))
        if len(dataset[col].unique()) < 50:
            display(Markdown(f'##### Before clean applied'))
            print(dataset[col].unique())
        ## This is for preliminary clean
        if pd.api.types.is_string_dtype(dataset[col]) and clean:
            dataset[col] = _cleanStringColumn(dataset[col])
            # These two columns are re-capitalised after the generic clean, which
            # reverts the "/X" and " I " upper-casing for them.
            if col in ('Land decision maker', 'Spending decision maker'):
                dataset[col] = dataset[col].str.strip() \
                                .str.lower() \
                                .str.capitalize()
            if col == 'Land decision maker':
                dataset[col] = dataset[col].replace('', 'Don’t own land')
            display(Markdown(f'##### After clean applied'))
            print(dataset[col].unique())

    return [dataset, dataset_columns, f'{iso.lower()}{year}']

def calculatePopulation(data):
    """
    Print the weighted population of a dataset, in total and split by gender.

    ``data`` is a ``[dataset, columns, label]`` list; the dataset must have the
    columns 'finalweight' and 'Gender'. Weights are summed over all rows, over
    rows whose Gender is 'Female' and over rows whose Gender is 'Male'.
    Nothing is returned.
    """
    print('Population over which it was estimated')
    frame = data[0]
    total_weight = sum(frame['finalweight'])
    female_weight = sum(frame.loc[frame['Gender'] == 'Female', 'finalweight'])
    male_weight = sum(frame.loc[frame['Gender'] == 'Male', 'finalweight'])

    separator = '_________________ '
    print(separator, f'{data[2]}')
    print('Mujeres: ', female_weight)
    print('Hombres: ', male_weight)
    print('Total: ', total_weight)
    print(separator)
    

def compareDatasets(dfvaris:list, column_remap = {'stratum': 'Stratum',
                                              #'D3': 'M_D3',
                                              'Allow spouse to work,,':'Allow spouse to work',
                                              ' Phone ownership':'Phone ownership',
                                              #'Gender': 'gender',
                                              'Urban_Rural': 'Urb_Rur',
                                                'Marital / relationship status': 'Marital Status',
                                             'Urbanicity': 'Geographic Area',
                                             'Age Group': 'Age',
                                             'Education': 'Level of education',
                                             '$2.50 PPP Poverty line': 'Poverty line'}):
    """
    Compare every pair of datasets and report column and value differences.

    Parameters
    ----------
    dfvaris : list
        List of lists ``[[dataset, datasetCols, 'isoYEAR'], ...]``. Each dataset
        is renamed IN PLACE with ``column_remap`` and its ``datasetCols`` entry
        is replaced by the set of renamed column names.
    column_remap : dict
        Mapping ``old column name -> new column name`` applied to every dataset.

    For every pair the function prints: the columns present in both datasets
    (flagging dtype differences and listing the unique values of columns with
    fewer than 20 distinct values), the lower-cased column names that are not
    shared, and, for unmatched columns of the first dataset, the most similar
    unmatched column of the second one when the similarity ratio exceeds 0.8.

    Returns nothing; all results are printed or displayed.
    """
    for i, data in enumerate(dfvaris):
        ## Rename columns if needed
        data[0].rename(inplace=True, columns=column_remap)

        dfvaris[i][1]=set(data[0].columns.sort_values(ascending=True).values)
        print(dfvaris[i][1])
        # info() prints its report itself and returns None, so wrapping it in
        # display() only added a stray "None" to the output.
        data[0].info()

    valuesCols={}
    closetsCols={}
    #################### this code will compare pairs of column outputs to homogenise the data.
    for a, b in itertools.combinations(dfvaris, 2):
        al=set(map(str.lower,a[1]))
        bl=set(map(str.lower,b[1]))
        cl=al-bl
        dl=bl-al
        symCols = a[1] & b[1]
        print('=========================  ', a[2], ' - ', b[2])
        print('Good matched columns')
        for i in symCols:
            if str(a[0].dtypes[i]) != str(b[0].dtypes[i]):
                print('+++++++++++++++++++++')
                print(a[2],a[0][i].dtype,' different dtype ',b[2],b[0][i].dtype)
                print('+++++++++++++++++++++')

            if len(a[0][i].unique()) < 20 and len(b[0][i].unique()) < 20:
                print('-',i)

                print('> {0}: {1}'.format(a[2],a[0][i].unique()))
                print('> {0}: {1}'.format(b[2],b[0][i].unique()))
                r = difflib.get_close_matches(i, possibilities = valuesCols.keys(), n = 1, cutoff = 0.98)
                if not r:
                    valuesCols[i]=set()
                    closetsCols[i]=set()
                else:
                    print('*********')
                    print(r)
                    print('*********')

                    valuesCols[r[0]].update(a[0][i].unique())
                    valuesCols[r[0]].update(b[0][i].unique())
                    print(r[0])
                    print(i)
                    print(round(difflib.SequenceMatcher(None, r[0], i).ratio(),3))
                    # The close match may be a different key than i, in which case
                    # i has no entry yet; create it instead of raising KeyError.
                    closetsCols.setdefault(i, set()).update(r)

        print("Columns that doesn't have a full match from both datasets", al^bl)
        if not dl:
            # The second dataset has no unmatched columns to suggest as candidates.
            continue
        for search in cl:
            # cl and dl are already lower-cased, so candidates are ranked case-insensitively.
            matches = sorted(dl, key=lambda x: difflib.SequenceMatcher(None, x.lower(), search.lower()).ratio(), reverse=True)
            bestRatio = round(difflib.SequenceMatcher(None, matches[0], search).ratio(),3)
            if bestRatio > 0.8:
                print('--------')
                display(Markdown("**{0}** se compara con {1} el más parecido es {2} con un ratio de: {3}".format(search, matches, matches[0], bestRatio)))


In [38]:
### Requirements that need to be set:
### base folder, input/output folders and the [path, iso code, year] of every dataset.
basePath = 'i2i_ms'
InPath = f'{basePath}/data'
OutPath = f'{basePath}/data/out'
DatasetList = [
    [f'{InPath}/{fileName}', isoCode, surveyYear]
    for fileName, isoCode, surveyYear in (
        ('200928 Kenya CAPI weighted outputs label update_revised poverty and labels.csv', 'ken', '2020'),
        ('200928 Pakistan CAPI weighted outputs.csv', 'pak', '2020'),
        ('200928 Uganda CAPI weighted outputs.csv', 'uga', '2020'),
        ('200928 Tanzania CAPI weighted outputs.csv', 'tza', '2020'),
    )
]

## 'vizzuality', '<check Last Pass>'
#user = getpass.getpass(prompt='User: ')
#password = getpass.getpass(prompt='Password: ')
#auth =requests.auth.HTTPBasicAuth(user, password)

In [None]:
### Run the initial check of every dataset listed in DatasetList.
### If you need to add new checks and cleaning options, for now add them in the checkDataset function.
### Each element of `datasets` is the [dataset, dataset_columns, label] list returned by checkDataset.
datasets = [checkDataset(*entry, clean=True) for entry in DatasetList]

In [43]:
# Empty summary frame used to spot inconsistencies among datasets:
# one 'fields' column plus one column per dataset.
overview_columns = ['fields', 'ken2020', 'pak2020', 'uga2020', 'tza2020']
data_overview = pd.DataFrame(columns=overview_columns)

In [44]:
# Harmonise column names across the loaded datasets, then record for every field the
# distinct values found in each dataset (or 'MISSING' when the column is absent).
issueCols = set()

# Source column name -> harmonised column name, applied to every dataset.
column_remap = {
    'stratum': 'Stratum',  # 'D3': 'M_D3',
    'Allow spouse to work,,': 'Allow spouse to work',
    ' Phone ownership': 'Phone ownership',
    # 'Gender': 'gender',
    'Urban_Rural': 'Urb_Rur',
    'Marital / relationship status': 'Marital Status',
    'Urbanicity': 'Geographic Area',
    'Age Group': 'Age',
    'Education': 'Level of education',
    '$2.50 PPP Poverty line': 'Poverty line',
}

# Overview column that receives the values of each dataset, in the order of `datasets`.
dfcols = ['ken2020', 'pak2020', 'uga2020', 'tza2020']

fields = []
for data, dfcol in zip(datasets, dfcols):
    # Renamed in place on purpose: later cells read the harmonised names from `datasets`.
    data[0].rename(inplace=True, columns=column_remap)
    fields = fields + list(data[0].keys())
# dict.fromkeys drops duplicates while keeping first-seen order.
fields = list(dict.fromkeys(fields))

# The first two collected columns are left out of the overview
# (presumably the InstanceID / finalweight bookkeeping columns - TODO confirm).
data_overview['fields'] = fields[2:]

for data, dfcol in zip(datasets, dfcols):
    for field in fields[2:]:
        is_field = data_overview['fields'] == field
        try:
            uni_vals = list(data[0][field].unique())
        except KeyError:
            # Only an absent column counts as missing; any other error should surface
            # instead of being silently recorded as 'MISSING'.
            data_overview.loc[is_field, dfcol] = 'MISSING'
        else:
            data_overview.loc[is_field, dfcol] = str(uni_vals)

In [45]:
# Preview the first rows of the overview table.
data_overview.head()

Unnamed: 0,fields,ken2020,pak2020,uga2020,tza2020
0,Gender,"['Male', 'Female']","['Male', 'Female']","['Female', 'Male']","['Female', 'Male']"
1,Marital Status,"['Married/Relationship', 'Not married/Relation...","['Married', 'Not married']","['Married/Relationship', 'Not married/Relation...","['Married/Relationship', 'Not married/Relation..."
2,Geographic Area,"['Urban', 'Rural']","['Rural', 'Urban']","['Urban', 'Rural']","['Urban', 'Rural']"
3,Age,"['25-34', '35-44', '18-24', '55+', '45-54']","['55+', '45-54', '35-44', '18-24', '25-34']","['18-24', '35-44', '25-34', '55+', '45-54']","['25-34', '18-24', '35-44', '45-54', '55+']"
4,Level of education,"['Secondary education', 'Primary education', '...","['No formal education', 'Secondary education',...","['Secondary education', 'No formal education',...","['Primary education', 'Secondary education', '..."


In [172]:
#data_overview.to_excel('i2i_ms_overview_29092020.xlsx')

In [None]:
## Compare the columns of the different datasets to help harmonise the results.
compareDatasets(datasets)

In [272]:
# Check useJson_t: list the collected fields that are not among its values.
known_names = list(useJson_t.values())
not_in_usejsont = [field for field in fields if field not in known_names]
print(not_in_usejsont)

['InstanceID', 'finalweight']


In [205]:
# Fields taken from not_in_usejsont that already exist in useJson_t under a different
# name, mapped to the name they should be renamed to.
correct_dict = dict([
    ('Marital / relationship status', 'Marital Status'),
    ('Urbanicity', 'Geographic Area'),
    ('Age Group', 'Age'),
    ('Education', 'Level of education'),
    ('$2.50 PPP Poverty line', 'Poverty line'),
])

['InstanceID',
 'finalweight',
 'Marital / relationship status',
 'Urbanicity',
 'Age Group',
 'Education',
 '$2.50 PPP Poverty line']

In [256]:
# Update useJson: build an entry for every data field (skipping the first two collected
# columns) that is not yet a key of useJson['indicators'].

# Inverse of useJson_t (value -> key), built once instead of once per field.
name_to_indicator = {name: indicator_id for indicator_id, name in useJson_t.items()}

# The accumulator must exist before the loop; the cell previously initialised a
# different name and failed with NameError on a fresh kernel.
updating_fields = {}
for field in fields[2:]:
    if field not in useJson['indicators']:
        updating_fields[str(field)] = {
            # Raises KeyError when the field has no entry in useJson_t.
            "indicatorId": name_to_indicator[field],
            # NOTE(review): these are the strings 'null', not None - confirm that is
            # what the consumer of this structure expects.
            "childIndicatorId": 'null',
            "answerId": 'null'
        }

# Keep the name the cell originally initialised bound to the same result.
updating_useJson = updating_fields
print(updating_fields)

{'Marital Status': {'indicatorId': 'i2i_Marital_Status', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Geographic Area': {'indicatorId': 'geographic_area', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Age': {'indicatorId': 'age', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Level of education': {'indicatorId': 'i2i_Education', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Poverty line': {'indicatorId': 'poverty_line', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Province / Region': {'indicatorId': 'region', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Read & write in any language': {'indicatorId': 'lang_literacy', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Division': {'indicatorId': 'division', 'childIndicatorId': 'null', 'answerId': 'null'}, 'Swahili literacy': {'indicatorId': 'swahili_literacy', 'childIndicatorId': 'null', 'answerId': 'null'}}


In [230]:
# Update indicator-categorisation: cross-check the names declared in list_vars against
# the fields present in the data.
existing_vars = [item['name'] for item in list_vars]
overview_names = data_overview['fields'].unique()

# Declared names with no matching field in the overview table.
print('MISSING FIELDS')
for name in existing_vars:
    if name not in overview_names:
        print(name)

print('')
# Data fields with no declared name; the first two hits are not printed.
print('NEW FIELDS')
new_fields = [field for field in fields if field not in existing_vars]
print(new_fields[2:])

MISSING FIELDS
Mobile money

NEW FIELDS
['Gender', 'Province / Region', 'Mobile Money', 'Read & write in any language', 'Division', 'Swahili literacy']


In [271]:
# Update API indicator dict: two-way comparison of api_ind and useJson_t, key by key.

# Pass 1: keys of api_ind that are absent from useJson_t or carry a different value.
for key_id, api_name in api_ind.items():
    if key_id not in useJson_t:
        print('missing', key_id)
    elif api_name != useJson_t[key_id]:
        print('wrong name, api:', api_name, 'useJson_t:', useJson_t[key_id])

# Pass 2: keys of useJson_t that are absent from api_ind, printed as ready-to-paste
# "key": "value", lines, plus any value mismatch.
print('MISSING IN API_IND')
for key_id, json_name in useJson_t.items():
    if key_id not in api_ind:
        print(f"\"{key_id}\": \"{json_name}\",")
    elif json_name != api_ind[key_id]:
        print('wrong name, useJson_t:', json_name, 'api_ind:', api_ind[key_id])

MISSING IN API_IND


In [290]:
# Load the Kenya export; the file is parsed with ';' as separator and ',' as decimal mark.
ken = pd.read_csv(
    'i2i_ms/data/200928 Kenya CAPI weighted outputs label update_revised poverty and labels.csv',
    sep=';',
    decimal=',',
)

In [298]:
# Print, for every Urbanicity x Gender group, its summed finalweight as a percentage
# of the summed finalweight of the whole frame.
total_weight = sum(ken['finalweight'])
for ur in ken['Urbanicity'].unique():
    area_rows = ken[ken['Urbanicity'] == ur]
    for gen in area_rows['Gender'].unique():
        group_weights = area_rows.loc[area_rows['Gender'] == gen, 'finalweight']
        print(ur, gen, sum(list(group_weights)) * 100 / total_weight)

Urban Male 21.231272580423852
Urban Female 19.433059251425455
Rural Female 31.84122993939189
Rural Male 27.49443822875889


In [8]:
import pandas as pd

In [9]:
# Load the processed Kenya output. The path is relative to the notebook's working
# directory, like the other data paths in this notebook, instead of an absolute path
# that only exists on one machine.
newken = pd.read_csv('i2i_ms/data/out/ken2020.csv')

In [10]:
# List the column labels of the processed Kenya output.
newken.keys()

Index(['Unnamed: 0', 'Main language', 'Level of education', 'Bank',
       'Mobile Money', 'Money taken from account', 'Spending decision maker',
       'Earning frequency', 'Poverty line', 'Phone ownership',
       'Income frequency of spouse', 'Marital Status', 'Land ownership',
       'Language of interview', 'Permission to work',
       'Possibility to raise 1/20 GNI (Gross National Income)',
       'Land decision maker', 'Allow spouse to work', 'Independent phone use',
       'Gender', 'Mean household size', 'Informal financial usage',
       'English literacy', 'Geographic Area', 'Main income earner',
       'Permission to open bank', 'Province / Region', 'finalweight',
       'Savings goal decision maker', 'Age', 'Migrant worker identity',
       'Own a phone', 'Children decision maker', 'Savings goal influence'],
      dtype='object')

In [28]:
# Print, for every (value of `field`) x Gender group, its summed finalweight as a
# percentage of the summed finalweight of the whole frame.
field = 'Mobile Money'
total_weight = sum(newken['finalweight'])
for answer in newken[field].unique():
    answer_rows = newken[newken[field] == answer]
    for gen in answer_rows['Gender'].unique():
        group_weights = answer_rows.loc[answer_rows['Gender'] == gen, 'finalweight']
        print(answer, gen, sum(list(group_weights)) * 100 / total_weight)

Yes Male 45.978449008888106
Yes Female 47.69013486267771
No Male 2.7472618002944738
No Female 3.584154328139649


In [29]:
# Show the distinct values of the column selected by `field`.
newken[field].unique()

array(['Yes', 'No'], dtype=object)