# Process to upload data.

## Steps to follow to make an import:
1.- Check the data: format, column labeling, data types, typos inside data categories for each indicator etc.  
2.- Once we are happy with the data we need to produce the matchJson for the indicators; this JSON maps the indicator column names to the harmonized ones we have in the DB.


In [1]:
import os
import zipfile
import pandas as pd
import requests
import numpy as np
import itertools
import difflib
import json
import re
import getpass

import pycountry
from IPython.display import display, Markdown
# Show every row and every column when a DataFrame is displayed in the notebook.
# The full option names are used on purpose: pandas resolves abbreviated names
# by pattern matching, which can stop working if a similarly named option is
# added in a later release.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Directory the notebook was started from.
basePath = os.getcwd()

#### management tips
* Not all datasets will have the same columns.  
* Always trim the values and replace empty values with null ones.  

#### Functions for management

In [2]:
def capitalize(matchobj):
    """Return the full text matched by ``matchobj`` in upper case.

    Used as the replacement callable of a regex substitution.
    """
    matched_text = matchobj.group(0)
    return matched_text.upper()

def checkDataset(datasetPath, iso, year, kargs=None, clean=True):
    """
    Open a dataset, display some initial checks and optionally apply a
    preliminary clean to its string columns.

    Displayed checks: the first two rows, ``describe(include='all')`` and, for
    every column, its dtype plus its unique values (before the clean only when
    the column has fewer than 50 distinct values; after the clean for every
    cleaned column).

    Parameters
    ----------
    datasetPath : str
        Path to an ``xlsx`` or ``csv`` file (extension is matched
        case-insensitively).
    iso : str
        Country code; it is lower-cased and joined with ``year`` to build the
        dataset label.
    year : str or int
        Survey year, only used in the dataset label.
    kargs : dict, optional
        Keyword arguments forwarded to the pandas reader. Defaults to
        ``{'decimal': ',', 'sep': ';'}``. ``sep`` / ``delimiter`` are not
        forwarded to ``pd.read_excel`` because it does not accept them.
    clean : bool
        When True, every string column is normalised (trimmed, capitalised and
        a set of known label variants harmonised).

    Returns
    -------
    list
        ``[dataset, dataset_columns, label]`` where ``dataset_columns`` is the
        set of column names read from the file and ``label`` is
        ``f'{iso.lower()}{year}'``.

    Raises
    ------
    ValueError
        If the file extension is neither ``xlsx`` nor ``csv``.
    """
    if kargs is None:
        kargs = {'decimal': ',', 'sep': ';'}

    extension = datasetPath.split('.')[-1].lower()
    if extension == 'xlsx':
        # read_excel has no delimiter option; passing one raises a TypeError.
        excelKargs = {key: value for key, value in kargs.items()
                      if key not in ('sep', 'delimiter')}
        dataset = pd.read_excel(datasetPath, **excelKargs).convert_dtypes()
    elif extension == 'csv':
        dataset = pd.read_csv(datasetPath, **kargs, low_memory=False).convert_dtypes()
    else:
        raise ValueError('format not allowed')

    dataset_columns = set(dataset.columns.sort_values(ascending=True).values)
    label = f'{iso.lower()}{year}'

    # Harmonisation rules, applied in this order. They are compiled once here
    # instead of once per column.
    regexFixes = [
        (re.compile(r'( years)$', flags=re.IGNORECASE), ''),
        (re.compile(r'(55)$', flags=re.IGNORECASE), '55+'),
        (re.compile(r'(Urbain)', flags=re.IGNORECASE), 'Urban'),
        (re.compile(r'(Above 60)', flags=re.IGNORECASE), '>60'),
        (re.compile(r'(Under 18)', flags=re.IGNORECASE), '<18'),
        # Upper-case the letter that follows a slash ("a/b" -> "a/B").
        (re.compile(r'/[a-z]', flags=re.IGNORECASE), capitalize),
        (re.compile(r'( i )'), ' I '),
    ]
    literalFixes = [
        ('dont', 'don’t'),
        ('Dont', 'Don’t'),
        ('Informally', 'Informal'),
    ]

    display(Markdown(f'## {label}'))
    display(dataset.head(2))
    display(dataset.describe(include='all').transpose())
    display(Markdown('### Columns descriptions'))
    for col in dataset.columns.values:
        display(Markdown(f'#### **column:** {col} ----- **dtype**: {dataset[col].dtype}'))
        if len(dataset[col].unique()) < 50:
            display(Markdown('##### Before clean applied'))
            print(dataset[col].unique())
        ## This is for preliminary clean
        if clean and pd.api.types.is_string_dtype(dataset[col]):
            # Trim whitespace, stray commas and trailing backslashes, then
            # normalise the casing to "Sentence case".
            cleaned = dataset[col].str.strip() \
                                  .str.strip(',') \
                                  .str.rstrip('\\,') \
                                  .str.lower() \
                                  .str.capitalize()
            # regex=True is explicit: since pandas 2.0 the default is
            # regex=False, which rejects compiled patterns.
            for pattern, replacement in regexFixes:
                cleaned = cleaned.str.replace(pattern, replacement, regex=True)
            for old, new in literalFixes:
                cleaned = cleaned.str.replace(old, new, regex=False)

            if col == 'Land decision maker':
                cleaned = cleaned.str.strip().str.lower().str.capitalize()
                # NOTE(review): only empty strings are replaced here; empty CSV
                # cells are normally read as <NA> and would be left untouched.
                # Confirm whether missing values should be filled too.
                cleaned = cleaned.replace('', 'Don’t own land')
            if col == 'Spending decision maker':
                cleaned = cleaned.str.strip().str.lower().str.capitalize()
                cleaned = cleaned.replace('Someone else decides for me', 'Someone else decides')

            dataset[col] = cleaned
            display(Markdown('##### After clean applied'))
            print(dataset[col].unique())

    return [dataset, dataset_columns, label]

def calculatePopulation(data):
    """
    Print the weighted population of a dataset: female total, male total and
    overall total, computed by adding up the 'finalweight' column.

    data is a [dataset, datasetCols, 'isoYEAR'] list; the dataset must have
    the 'finalweight' and 'Gender' columns. Nothing is returned.
    """
    print('Population over which it was estimated')
    frame = data[0]
    overall = sum(frame['finalweight'])

    # Weighted total for each gender label, in the order they are reported.
    totals = {}
    for gender in ('Female', 'Male'):
        subset = frame[frame['Gender'] == gender]
        totals[gender] = sum(subset['finalweight'])

    label = data[2]
    print('_________________ ', f'{label}')
    print('Mujeres: ', totals['Female'])
    print('Hombres: ', totals['Male'])
    print('Total: ', overall)
    print('_________________ ')
    

def compareDatasets(dfvaris:list, column_remap = {'stratum': 'Stratum',
                                              #'D3': 'M_D3',
                                              'Allow spouse to work,,':'Allow spouse to work',
                                              ' Phone ownership':'Phone ownership',
                                              #'Gender': 'gender',
                                              'Urban_Rural': 'Urb_Rur',
                                            'Marital / relationship status': 'Marital Status',
                                             'Urbanicity': 'Geographic Area',
                                             'Age Group': 'Age',
                                             'Education': 'Level of education',
                                             '$2.50 PPP Poverty line': 'Poverty line'}):
    """
    Rename the columns of every dataset and print a pairwise comparison of
    their columns to help homogenise the data.

    dfvaris: list of lists [[dataset, datasetCols, 'isoYEAR'], ...]. Each
        dataset is renamed in place and its datasetCols entry is replaced by
        the set of column names after the rename.
    column_remap: {old name: new name} mapping applied to every dataset. The
        default dictionary is only read, never modified.

    For every pair of datasets the function prints:
      * the shared columns whose dtypes differ,
      * the unique values of the shared columns that have fewer than 20
        distinct values in both datasets,
      * the lower-cased column names present in only one of the two datasets,
      * for each column only in the first dataset, the most similar column
        only in the second one when the similarity ratio is above 0.8.

    Nothing is returned.
    """
    for i, data in enumerate(dfvaris):
        ## Rename columns if needed
        data[0].rename(inplace=True, columns=column_remap)

        dfvaris[i][1] = set(data[0].columns.sort_values(ascending=True).values)
        print(dfvaris[i][1])
        # info() prints the summary itself and returns None, so it is not
        # wrapped in display() (that would only add a "None" to the output).
        data[0].info()

    valuesCols = {}
    closetsCols = {}
    #################### this code will compare pairs of column outputs to homogenise the data.
    for a, b in itertools.combinations(dfvaris, 2):
        al = set(map(str.lower, a[1]))
        bl = set(map(str.lower, b[1]))
        cl = al - bl  # lower-cased columns only in a
        dl = bl - al  # lower-cased columns only in b
        symCols = a[1] & b[1]
        print('=========================  ', a[2], ' - ', b[2])
        print('Good matched columns')
        for i in symCols:
            if str(a[0].dtypes[i]) != str(b[0].dtypes[i]):
                print('+++++++++++++++++++++')
                print(a[2], a[0][i].dtype, ' different dtype ', b[2], b[0][i].dtype)
                print('+++++++++++++++++++++')

            if len(a[0][i].unique()) < 20 and len(b[0][i].unique()) < 20:
                print('-', i)

                print('> {0}: {1}'.format(a[2], a[0][i].unique()))
                print('> {0}: {1}'.format(b[2], b[0][i].unique()))
                r = difflib.get_close_matches(i, possibilities=valuesCols.keys(), n=1, cutoff=0.98)
                if not r:
                    valuesCols[i] = set()
                    closetsCols[i] = set()
                else:
                    print('*********')
                    print(r)
                    print('*********')

                    valuesCols[r[0]].update(a[0][i].unique())
                    valuesCols[r[0]].update(b[0][i].unique())
                    print(r[0])
                    print(i)
                    print(round(difflib.SequenceMatcher(None, r[0], i).ratio(), 3))
                    # The close match may be a different (near-identical) key,
                    # in which case `i` has no entry yet.
                    closetsCols.setdefault(i, set()).update(r)

        print("Columns that doesn't have a full match from both datasets", al ^ bl)
        # Without unmatched columns in b there is nothing to compare against.
        if dl:
            for search in cl:
                matches = sorted(dl, key=lambda x: difflib.SequenceMatcher(None, x.lower(), search.lower()).ratio(), reverse=True)
                bestRatio = round(difflib.SequenceMatcher(None, matches[0], search).ratio(), 3)
                if bestRatio > 0.8:
                    print('--------')
                    display(Markdown("**{0}** se compara con {1} el más parecido es {2} con un ratio de: {3}".format(search, matches, matches[0], bestRatio)))


#### Functions to upload to the api

Indicators need to be in the API first; to add them you need to edit these files:
* [National surveys](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/indicators.json)
* [MSME](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/msme-indicators.json)
* [Mobile surveys](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/ms-indicators.json)

After editing the files above, make sure the indicators are categorized on the front end here:
* [National surveys](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal)
* [MSME](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal)
* [Mobile surveys](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal/ExploratorySurveyIndicatorsCollection.js)

In [3]:
"""
This is used to generate the matching json to upload a file, the column names should match with each object
"""
# Structure of the mapping below:
#   "weightColumn": set to "finalweight".
#   "indicators":   one entry per accepted dataset column name; every entry has
#                   an "indicatorId", a "childIndicatorId" (null for all of
#                   them) and an "answerId" (null or an integer).
# Several column-name variants map to the same indicatorId, e.g. "Urb_Rur",
# "Urbanicity" and "Geographic Area" all map to "geographic_area".
# NOTE(review): "Read & write in any language" uses indicatorId "lang_literacy"
# while the labels JSON (useJson_t) has the key "language_literacy" -- confirm
# which id the API expects.
useJson =json.loads("""{
    "weightColumn": "finalweight",
    "indicators":{
        "Urb_Rur": {
            "indicatorId": "geographic_area",
            "childIndicatorId": null,
            "answerId": null
        },
        "Urbanicity": {
            "indicatorId": "geographic_area",
            "childIndicatorId": null,
            "answerId": null
        },
        "Province":{
            "indicatorId": "jurisdiction",
            "childIndicatorId": null,
            "answerId": null
        },
        "M_D3": {
            "indicatorId": "gender",
            "childIndicatorId": null,
            "answerId": null
        },
        "Gender": {
            "indicatorId": "gender",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Age": {
            "indicatorId": "age",
            "childIndicatorId": null,
            "answerId": null
        },
        "Age Group": {
            "indicatorId": "age",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Marital_Status":{
            "indicatorId": "i2i_Marital_Status",
            "childIndicatorId": null,
            "answerId": null
        },
        "Relationship status":{
            "indicatorId": "i2i_Marital_Status",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Education": {
            "indicatorId": "i2i_Education",
            "childIndicatorId": null,
            "answerId": null
        },
        "Education": {
            "indicatorId": "i2i_Education",
            "childIndicatorId": null,
            "answerId": null
        },
        "English literacy": {
            "indicatorId": "english_literacy",
            "childIndicatorId": null,
            "answerId": null
        },
        "Language of interview": {
            "indicatorId": "interview_lan",
            "childIndicatorId": null,
            "answerId": null
        },
        "Main language": {
            "indicatorId": "main_lan",
            "childIndicatorId": null,
            "answerId": null
        },
        "Mean household size": {
            "indicatorId": "household_size",
            "childIndicatorId": null,
            "answerId": null
        },
        "Own a phone": {
            "indicatorId": "own_phone",
            "childIndicatorId": null,
            "answerId": null
        },
        "Phone ownership": {
            "indicatorId": "who_phone",
            "childIndicatorId": null,
            "answerId": null
        },
        "Independent phone use": {
            "indicatorId": "phone_use",
            "childIndicatorId": null,
            "answerId": null
        },
        "Informal financial usage": {
            "indicatorId": "phone_use_financial",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Source_of_Income": {
            "indicatorId": "i2i_Income_Sources",
            "childIndicatorId": null,
            "answerId": null
        },
        "Water_source_type":{
            "indicatorId": "water_source_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Toilet_type":{
            "indicatorId": "toilet_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Cooking_energy":{
            "indicatorId": "cooking_energy_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Electricity_access":{
            "indicatorId": "electricity_access_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "USD_per_day":{
            "indicatorId": "usd_per_day",
            "childIndicatorId": null,
            "answerId": null
        },
        "Poverty_line":{
            "indicatorId": "poverty_line",
            "childIndicatorId": null,
            "answerId": null
        },
        "$2.50 PPP Poverty line":{
            "indicatorId": "poverty_line",
            "childIndicatorId": null,
            "answerId": null
        },
        "Land ownership":{
            "indicatorId": "land_owner",
            "childIndicatorId": null,
            "answerId": null
        },
        "Land decision maker":{
            "indicatorId": "land_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Children decision maker":{
            "indicatorId": "children_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Earning frequency":{
            "indicatorId": "earning_freq",
            "childIndicatorId": null,
            "answerId": null
        },
        "Spending decision maker":{
            "indicatorId": "spending_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Permission to work":{
            "indicatorId": "permission_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Main income earner":{
            "indicatorId": "main_income_earned",
            "childIndicatorId": null,
            "answerId": null
        },
        "Migrant worker identity":{
            "indicatorId": "migrant_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Possibility to raise 1/20 GNI (Gross National Income)":{
            "indicatorId": "raise_gni",
            "childIndicatorId": null,
            "answerId": null
        },
        "Income frequency of spouse":{
            "indicatorId": "freq_inconme_spouse",
            "childIndicatorId": null,
            "answerId": null
        },
        "Allow spouse to work":{
            "indicatorId": "allow_spouse_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Bank": {
            "indicatorId": "bank",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Permission to open bank": {
            "indicatorId": "bank_permission",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Money taken from account": {
            "indicatorId": "account_money_taken",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Savings goal decision maker": {
            "indicatorId": "saving_goal_deccision",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Savings goal influence": {
            "indicatorId": "saving_goal_influence",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Banked": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Other_formal": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Informal": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "fas_access": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Saving_B":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Saving_F":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Saving_Inf":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Saving_AH":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "saving_access":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Remittances_B":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Remittances_F":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Remittances_Inf":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Remittances_FF":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "remittances_access":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Insurance_F":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Insurance_Inf":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Insurance_B":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "insurance_access":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Credit_B":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Credit_F":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Credit_Inf":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Credit_FF":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "credit_access":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Weight_Ind":{
            "indicatorId": "weight_ind",
            "childIndicatorId": null,
            "answerId": null
        },
        "FAS":{
            "indicatorId": "total_fas_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Saving_Strand":{
            "indicatorId": "total_saving_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Remittances_Strand":{
            "indicatorId": "total_remittances_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Insurance_Strand":{
            "indicatorId": "total_insurance_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Credit_Strand":{
            "indicatorId": "total_credit_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Mobile_money":{
            "indicatorId": "mobile_money",
            "childIndicatorId": null,
            "answerId": null
        },
        "Mobile Money":{
            "indicatorId": "mobile_money",
            "childIndicatorId": null,
            "answerId": null
        },
        "Marital Status": {
            "indicatorId": "i2i_Marital_Status", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Geographic Area": {
            "indicatorId": "geographic_area", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Age": {
            "indicatorId": "age", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Level of education": {
            "indicatorId": "i2i_Education", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Poverty line": {
            "indicatorId": "poverty_line", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Province / Region": {
            "indicatorId": "region", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Read & write in any language": {
            "indicatorId": "lang_literacy", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Division": {
            "indicatorId": "division", 
            "childIndicatorId": null, 
            "answerId": null
        }, 
        "Swahili literacy": {
            "indicatorId": "swahili_literacy", 
            "childIndicatorId": null, 
            "answerId": null
        }
    }
}""")

In [4]:
def saveData(data, dataOutPath, env='staging', section='mobile'):
    """
    Write the csv and the matching json of a dataset to ``dataOutPath`` and
    return the information needed to upload them.

    Only the columns that are either keys of ``useJson['indicators']`` or the
    'finalweight' column are exported; they keep the order they have in the
    dataset so the generated csv is the same on every run.

    Parameters
    ----------
    data : list
        ``[dataset, datasetCols, 'isoYEAR']``; the first three characters of
        the label are used as iso code and the rest as the year.
    dataOutPath : str
        Output directory; it is created when it does not exist.
    env : str
        Key of the target environment: 'staging' or 'production'.
    section : str
        Key of the target API: 'ns', 'msme' or 'mobile'.

    Returns
    -------
    dict
        Keys: iso, year, total, fileUrl, jsonUrl, baseUrl, url, dataUrl.

    Raises
    ------
    KeyError
        If ``env`` or ``section`` is not one of the known keys.
    """
    # NOTE(review): the production address uses plain http while the upload
    # helpers send basic-auth credentials -- confirm whether https is available.
    base = {
        'staging':'https://staging.i2ifacility.org',
        'production':'http://i2i.vizzuality.com'
        }
    sect = {
        'ns':  'api/v1',
        'msme':'msme-api/v1',
        'mobile': 'ms-api/v2'
        }
    AceptedColumns = set(useJson['indicators']) | {'finalweight'}
    calculatePopulation(data)

    # Dataset order (de-duplicated) instead of set order: set iteration order
    # of strings changes between interpreter runs.
    columns = list(dict.fromkeys(col for col in data[0].columns if col in AceptedColumns))
    selected = set(columns)

    info = {
        'iso': data[2][0:3].upper(),
        'year': int(data[2][3:]),
        'total': sum(data[0]['finalweight']),
        'fileUrl': f'{dataOutPath}/{data[2]}.csv',
        'jsonUrl': f'{dataOutPath}/{data[2]}.json',
        'baseUrl': f'{base[env]}/{sect[section]}/country'
        }
    info.update({'url': f"{info['baseUrl']}/{info['iso']}/{info['year']}/dataset",
                 'dataUrl':f"https://s3-us-west-2.amazonaws.com/i2ifacility.org/{info['iso']}/{sect[section]}/{info['year']}.zip"
                })

    if dataOutPath:
        os.makedirs(dataOutPath, exist_ok=True)
    #save the data
    data[0][columns].to_csv(info['fileUrl'])
    #save the key json
    outCol = {key: value for key, value in useJson["indicators"].items() if key in selected}
    myJson = {
        "weightColumn": "finalweight",
        "indicators": outCol}
    with open(info['jsonUrl'], 'w') as outfile:
        json.dump(myJson, outfile)

    return info

def upsertCountry(info, auth, op='POST'):
    """
    Create the country in the API, or update it when the creation is rejected.

    Receives info as a dict with:
    {
    iso:
    year:
    total:
    dataUrl:
    baseUrl:
    }
    The request body contains iso, year, total and dataUrl (those present in
    ``info``) plus the country name looked up from the iso code.

    With op='POST' the request goes to ``baseUrl``; with any other method it
    goes to ``baseUrl/iso``. When a POST is answered with status 400 the call
    is retried once as PATCH. Any other HTTP error (including a 400 on the
    PATCH itself) prints the response text.

    Returns the response of the last request made.

    Raises ValueError when ``info['iso']`` is not a known alpha_3 code.
    """
    wanted_keys = ('iso', 'year', 'total', 'dataUrl') # The keys sent to the API
    body = {k: info[k] for k in wanted_keys if k in info}
    countryName = pycountry.countries.get(alpha_3=info['iso'])
    if not countryName:
        raise ValueError(f"{info['iso']} not a country alpha_3 iso code")

    body.update({"name": countryName.name})

    url = f"{info['baseUrl']}" if op == 'POST' else f"{info['baseUrl']}/{info['iso']}"
    s = requests.request(op, url, auth=auth, json=body)
    try:
        s.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Fall back to PATCH only from POST; retrying a rejected PATCH with
        # another PATCH would recurse without end.
        if e.response.status_code == 400 and op == 'POST':
            return upsertCountry(info, auth, op='PATCH')
        print(e.response.text)
    else:
        print(f'Country {body["name"]} {("created" if op=="POST" else "updated")} succesfully')
    return s
    
def uploadDataFunction(info, auth):
    """
    Upload the json and csv files of a dataset with a multipart POST to
    ``info['url']`` and return the decoded JSON body of the response.

    Receives info as a dict with:
    {
    fileUrl: path of the csv file
    jsonUrl: path of the matching json file
    url: upload address
    }
    """
    # Both files are closed as soon as the request has finished.
    with open(info["jsonUrl"], 'rb') as jsonFile, open(info["fileUrl"], 'rb') as csvFile:
        files1 = {'json': jsonFile, 'csv': csvFile}
        r = requests.post(info["url"], auth=auth, files=files1)
    return r.json()


### FULL PIPELINE

In [35]:
### Settings that have to be adjusted before running the pipeline:
basePath = 'i2i_ms'
InPath = '/'.join([basePath, 'data'])
OutPath = '/'.join([InPath, 'out'])
# One [file path, iso code, year] entry per survey to import.
DatasetList = [
    [f'{InPath}/{fileName}', iso, '2020']
    for fileName, iso in (
        ('200928 Kenya CAPI weighted outputs label update_revised poverty and labels.csv', 'ken'),
        ('200928 Pakistan CAPI weighted outputs.csv', 'pak'),
        ('200928 Uganda CAPI weighted outputs.csv', 'uga'),
        ('200928 Tanzania CAPI weighted outputs.csv', 'tza'),
    )
]

## Credentials: user 'vizzuality', check LastPass for the password.
## Both are prompted for so they are never stored in the notebook.
user = getpass.getpass(prompt='User: ')
password = getpass.getpass(prompt='Password: ')
auth = requests.auth.HTTPBasicAuth(user, password)

User:  ··········
Password:  ············


In [36]:
# Labels shown for each indicator id; if there are new indicators don't forget
# to add them here too.
# The "mobile_money" entry used to be listed twice with the same label; only
# one is kept, so the parsed dictionary is unchanged.
# NOTE(review): this file uses the key "language_literacy" while the matching
# json (useJson) maps "Read & write in any language" to the indicatorId
# "lang_literacy" -- confirm which id the API expects.
useJson_t =json.loads("""
{
    "geographic_area": "Geographic Area",
    "gender": "Gender",
    "age": "Age",
    "access_to_resources": "Access to Resources",
    "dwelling_type": "Dwelling type: roof/dwelling",
    "i2i_Marital_Status": "Marital Status",
    "i2i_Education": "Level of education",
    "i2i_Income_Sources": "Sources of income",
    "toilet_type": "Sanitation type",
    "cooking_energy_type": "Cooking energy source",
    "electricity_access_type": "Electricity access",
    "usd_per_day": "USD per day",
    "poverty_line": "Poverty line",
    "mobile_money": "Mobile Money",
    "water_source_type": "Water source type",
    "english_literacy": "English literacy",
    "interview_lan": "Language of interview",
    "own_phone": "Own a phone",
    "who_phone": "Phone ownership",
    "phone_use": "Independent phone use",
  "phone_use_financial": "Informal financial usage",
    "main_lan": "Main language",
    "household_size": "Mean household size",
  "bank":"Bank",
  "bank_permission":"Permission to open bank",
  "account_money_taken":"Money taken from account",
  "saving_goal_deccision":"Savings goal decision maker",
  "saving_goal_influence":"Savings goal influence",
  "land_owner":"Land ownership",
  "land_decission":"Land decision maker",
  "children_decission":"Children decision maker",
  "earning_freq":"Earning frequency",
  "spending_decission":"Spending decision maker",
  "permission_work":"Permission to work",
  "main_income_earned":"Main income earner",
  "migrant_work":"Migrant worker identity",
  "raise_gni":"Possibility to raise 1/20 GNI (Gross National Income)",
  "freq_inconme_spouse":"Income frequency of spouse",
  "allow_spouse_work":"Allow spouse to work",
  "region":"Province / Region",
  "language_literacy":"Read & write in any language",
  "division":"Division",
  "swahili_literacy":"Swahili literacy"
}
 """)
# Make sure the output directory exists before writing into it.
os.makedirs(OutPath, exist_ok=True)
with open(f'{OutPath}/indicators.json', 'w') as outfile:
    json.dump(useJson_t, outfile, sort_keys=True, indent=4)

In [37]:
### Run the initial check (and the preliminary clean) on every entry of DatasetList.
### New checks and cleaning options have to be added, for now, inside the checkDataset function.
datasets = []
for datasetArgs in DatasetList:
    checked = checkDataset(*datasetArgs, clean=True)
    datasets.append(checked)

## ken2020

Unnamed: 0,InstanceID,finalweight,Gender,Marital / relationship status,Urbanicity,Age Group,Education,English literacy,Own a phone,$2.50 PPP Poverty line,Language of interview,Main language,Mean household size,Province / Region,Phone ownership,Independent phone use,Informal financial usage,Mobile Money,Bank,Permission to open bank,Money taken from account,Savings goal decision maker,Savings goal influence,Land ownership,Land decision maker,Children decision maker,Earning frequency,Spending decision maker,Permission to work,Main income earner,Migrant worker identity,Possibility to raise 1/20 GNI (Gross National Income),Income frequency of spouse,"Allow spouse to work,,"
0,770458,10776.66146,Male,Married/relationship,Urban,25-34,Secondary education,Fair,own phone,Above poverty line,English,Swahili,Below mean size,Nairobi,own phone/business/employer,Yes,no,Yes,unbanked,no,money not taken,others on behalf,sometimes accepted/rejected,dont own,,decide together,Daily,Help from someone else,Already working,respondent,"No-one, all live together",Not very possible,dont work,Yes
1,770459,9983.318465,Female,Married/relationship,Urban,25-34,Primary education,Good,own phone,Above poverty line,Swahili,Swahili,Below mean size,Nairobi,own phone/business/employer,Yes,no,Yes,unbanked,no,money not taken,self alone,sometimes accepted/rejected,own by self,Others decide without me,decide together,Weekly/biweekly,Help from someone else,Already working,spouse,"No-one, all live together",Not at all possible,Monthly,Yes


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InstanceID,3000,,,,775220.0,2804.73,770458.0,773092.0,774944.0,777020.0,788781.0
finalweight,3000,,,,7323.37,6821.96,119.348,3513.72,6031.08,8937.02,91545.8
Gender,3000,2.0,Female,1731.0,,,,,,,
Marital / relationship status,3000,2.0,Married/relationship,2101.0,,,,,,,
Urbanicity,3000,2.0,Rural,1920.0,,,,,,,
Age Group,3000,5.0,25-34,1021.0,,,,,,,
Education,3000,5.0,Secondary education,1182.0,,,,,,,
English literacy,3000,5.0,Good,971.0,,,,,,,
Own a phone,3000,2.0,own phone,2670.0,,,,,,,
$2.50 PPP Poverty line,3000,2.0,Below poverty line,1596.0,,,,,,,


### Columns descriptions

#### **column:** InstanceID ----- **dtype**: Int64

#### **column:** finalweight ----- **dtype**: float64

#### **column:** Gender ----- **dtype**: string

##### Before clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


#### **column:** Marital / relationship status ----- **dtype**: string

##### Before clean applied

<StringArray>
['Married/relationship', 'Not married/relationship']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Married/Relationship', 'Not married/Relationship']
Length: 2, dtype: string


#### **column:** Urbanicity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


#### **column:** Age Group ----- **dtype**: string

##### Before clean applied

<StringArray>
['25-34', '35-44', '18-24', '55', '45-54']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['25-34', '35-44', '18-24', '55+', '45-54']
Length: 5, dtype: string


#### **column:** Education ----- **dtype**: string

##### Before clean applied

<StringArray>
['Secondary education',   'Primary education',    'Higher education',
 'No formal education',               'Other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Secondary education',   'Primary education',    'Higher education',
 'No formal education',               'Other']
Length: 5, dtype: string


#### **column:** English literacy ----- **dtype**: string

##### Before clean applied

<StringArray>
['Fair', 'Good', 'Excellent', 'Not at all', 'Poorly']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Fair', 'Good', 'Excellent', 'Not at all', 'Poorly']
Length: 5, dtype: string


#### **column:** Own a phone ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone', 'dont own phone']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Own phone', 'Don’t own phone']
Length: 2, dtype: string


#### **column:** $2.50 PPP Poverty line ----- **dtype**: string

##### Before clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


#### **column:** Language of interview ----- **dtype**: string

##### Before clean applied

<StringArray>
[ 'English',  'Swahili',      'Luo',    'Kisii',   'Kikuyu',   'Somali',
     'Meru',    'Kamba',    'Luhya', 'Kalenjin']
Length: 10, dtype: string


##### After clean applied

<StringArray>
[ 'English',  'Swahili',      'Luo',    'Kisii',   'Kikuyu',   'Somali',
     'Meru',    'Kamba',    'Luhya', 'Kalenjin']
Length: 10, dtype: string


#### **column:** Main language ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Swahili',     'Luhya',       'Luo',  'Kalenjin',   'English', 'Congolese',
    'Kikuyu',      'Teso',     'Kisii',    'Somali',     'Kamba',      'Meru',
  'Kiduruma', 'Mijikenda',      'Gari',    'Chonyi',   'Turkana',   'Gariama',
    'Kidigo',    'Borana',   'Kiganda',    'Maasai',    'Kiembu',     'Taita',
  'Kimbeere',  'Kibajuni',   'Kikuria']
Length: 27, dtype: string


##### After clean applied

<StringArray>
[  'Swahili',     'Luhya',       'Luo',  'Kalenjin',   'English', 'Congolese',
    'Kikuyu',      'Teso',     'Kisii',    'Somali',     'Kamba',      'Meru',
  'Kiduruma', 'Mijikenda',      'Gari',    'Chonyi',   'Turkana',   'Gariama',
    'Kidigo',    'Borana',   'Kiganda',    'Maasai',    'Kiembu',     'Taita',
  'Kimbeere',  'Kibajuni',   'Kikuria']
Length: 27, dtype: string


#### **column:** Mean household size ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


#### **column:** Province / Region ----- **dtype**: string

##### Before clean applied

<StringArray>
[      'Nairobi',        'Nyanza',       'Central',   'Rift Valley',
         'Coast', 'North Eastern',       'Western',       'Eastern']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[      'Nairobi',        'Nyanza',       'Central',   'Rift valley',
         'Coast', 'North eastern',       'Western',       'Eastern']
Length: 8, dtype: string


#### **column:**  Phone ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone/business/employer',                     'Sibling',
       'Dont use mobile phone',               'Husband/wives',
             'Other man/women',                      'Parent',
                    'Children']
Length: 7, dtype: string


##### After clean applied

<StringArray>
['Own phone/Business/Employer',                     'Sibling',
      'Don’t use mobile phone',               'Husband/Wives',
             'Other man/Women',                      'Parent',
                    'Children']
Length: 7, dtype: string


#### **column:** Independent phone use ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No', 'Somewhat']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No', 'Somewhat']
Length: 3, dtype: string


#### **column:** Informal financial usage ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Mobile Money ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


#### **column:** Bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['unbanked', 'own', 'spouse', 'other', 'other fam']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Unbanked', 'Own', 'Spouse', 'Other', 'Other fam']
Length: 5, dtype: string


#### **column:** Permission to open bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Money taken from account ----- **dtype**: string

##### Before clean applied

<StringArray>
['money not taken', 'money taken', 'dont have account']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Money not taken', 'Money taken', 'Don’t have account']
Length: 3, dtype: string


#### **column:** Savings goal decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['others on behalf', 'self alone', 'no savings goal', 'decide together']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Others on behalf', 'Self alone', 'No savings goal', 'Decide together']
Length: 4, dtype: string


#### **column:** Savings goal influence ----- **dtype**: string

##### Before clean applied

<StringArray>
['sometimes accepted/rejected',                'dont consult',
             'no savings goal',             'always accepted',
             'always rejected']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Sometimes accepted/Rejected',               'Don’t consult',
             'No savings goal',             'Always accepted',
             'Always rejected']
Length: 5, dtype: string


#### **column:** Land ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['dont own', 'own by self', 'own with others']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Don’t own', 'Own by self', 'Own with others']
Length: 3, dtype: string


#### **column:** Land decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[' ', 'Others decide without me', 'Decide alone', 'Decide with others']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[          'Don’t own land', 'Others decide without me',
             'Decide alone',       'Decide with others']
Length: 4, dtype: string


#### **column:** Children decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[        'decide together',     'Not in relationship',
 'We do not talk about it',          'Spouse decides',
            'decide alone']
Length: 5, dtype: string


##### After clean applied

<StringArray>
[        'Decide together',     'Not in relationship',
 'We do not talk about it',          'Spouse decides',
            'Decide alone']
Length: 5, dtype: string


#### **column:** Earning frequency ----- **dtype**: string

##### Before clean applied

<StringArray>
[          'Daily', 'Weekly/biweekly',     'I dont work',         'Monthly',
 'When I get work',     'Work unpaid']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[          'Daily', 'Weekly/Biweekly',    'I don’t work',         'Monthly',
 'When I get work',     'Work unpaid']
Length: 6, dtype: string


#### **column:** Spending decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['Help from someone else', 'Fully on own', 'Someone else decides for me']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Help from someone else', 'Fully on own', 'Someone else decides']
Length: 3, dtype: string


#### **column:** Permission to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


#### **column:** Main income earner ----- **dtype**: string

##### Before clean applied

<StringArray>
['respondent', 'spouse', 'other fam', 'dont know', 'other', 'earn equally']
Length: 6, dtype: string


##### After clean applied

<StringArray>
['Respondent', 'Spouse', 'Other fam', 'Don’t know', 'Other', 'Earn equally']
Length: 6, dtype: string


#### **column:** Migrant worker identity ----- **dtype**: string

##### Before clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


#### **column:** Possibility to raise 1/20 GNI (Gross National Income) ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Not very possible', 'Not at all possible',   'Somewhat possible',
       'Very possible']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[  'Not very possible', 'Not at all possible',   'Somewhat possible',
       'Very possible']
Length: 4, dtype: string


#### **column:** Income frequency of spouse ----- **dtype**: string

##### Before clean applied

<StringArray>
[         'dont work',            'Monthly',          'no spouse',
              'Daily',    'Weekly/biweekly',          'dont know',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[        'Don’t work',            'Monthly',          'No spouse',
              'Daily',    'Weekly/Biweekly',         'Don’t know',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


#### **column:** Allow spouse to work,, ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Not married', 'Yes,', 'Not married,', 'No', 'No,']
Length: 6, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Not married', 'No']
Length: 3, dtype: string


## pak2020

Unnamed: 0,InstanceID,finalweight,Gender,Marital / relationship status,Urbanicity,Age Group,Education,Read & write in any language,Own a phone,$2.50 PPP Poverty line,Language of interview,Main language,Mean household size,Division,Province / Region,Phone ownership,Independent phone use,Informal financial usage,Mobile Money,Bank,Permission to open bank,Money taken from account,Savings goal decision maker,Savings goal influence,Land ownership,Land decision maker,Children decision maker,Earning frequency,Spending decision maker,Permission to work,Main income earner,Migrant worker identity,Possibility to raise 1/20 GNI (Gross National Income),Income frequency of spouse,Allow spouse to work
0,466613,43992.04426,Male,Married,Rural,55+,No formal education,Yes,Own phone/Business/Employer,Below poverty line,Urdu,Pashto,Below mean size,Peshawar,KP,own phone,Yes,no,yes,own,no,money not taken,self alone,always accepted,dont own,Dont own land,decide alone,Monthly,Fully on own,Already working,respondent,Other family,Somewhat possible,dont work,No
1,466614,16270.362,Male,Not married,Rural,45-54,Secondary education,Yes,dont own phone,Below poverty line,Urdu,Pashto,Above mean size,Peshawar,KP,Parent,Yes,no,no,unbanked,yes,dont have account,no savings goal,no savings goal,dont own,Dont own land,Not married,I dont work,Someone else decides for me,Yes,other fam,"No-one, all live together",Not at all possible,no spouse,Not married


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InstanceID,3668,,,,484321.0,6445.21,466613.0,480169.0,485426.0,489464.0,504335.0
finalweight,3668,,,,17927.9,15712.8,898.908,8506.39,13275.3,22398.3,274531.0
Gender,3668,2.0,Male,1838.0,,,,,,,
Marital / relationship status,3668,2.0,Married,2796.0,,,,,,,
Urbanicity,3668,2.0,Urban,1858.0,,,,,,,
Age Group,3668,5.0,25-34,1241.0,,,,,,,
Education,3668,5.0,Secondary education,1437.0,,,,,,,
Read & write in any language,3668,2.0,Yes,2787.0,,,,,,,
Own a phone,3668,2.0,Own phone/Business/Employer,2522.0,,,,,,,
$2.50 PPP Poverty line,3668,2.0,Below poverty line,2134.0,,,,,,,


### Columns descriptions

#### **column:** InstanceID ----- **dtype**: Int64

#### **column:** finalweight ----- **dtype**: float64

#### **column:** Gender ----- **dtype**: string

##### Before clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


#### **column:** Marital / relationship status ----- **dtype**: string

##### Before clean applied

<StringArray>
['Married', 'Not married']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Married', 'Not married']
Length: 2, dtype: string


#### **column:** Urbanicity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Rural', 'Urban']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Rural', 'Urban']
Length: 2, dtype: string


#### **column:** Age Group ----- **dtype**: string

##### Before clean applied

<StringArray>
['55+', '45-54', '35-44', '18-24', '25-34']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['55+', '45-54', '35-44', '18-24', '25-34']
Length: 5, dtype: string


#### **column:** Education ----- **dtype**: string

##### Before clean applied

<StringArray>
['No formal education', 'Secondary education',   'Primary education',
               'Other',    'Higher education']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['No formal education', 'Secondary education',   'Primary education',
               'Other',    'Higher education']
Length: 5, dtype: string


#### **column:** Read & write in any language ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


#### **column:** Own a phone ----- **dtype**: string

##### Before clean applied

<StringArray>
['Own phone/Business/Employer', 'dont own phone']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Own phone/Business/Employer', 'Don’t own phone']
Length: 2, dtype: string


#### **column:** $2.50 PPP Poverty line ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below poverty line', 'Above poverty line']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Below poverty line', 'Above poverty line']
Length: 2, dtype: string


#### **column:** Language of interview ----- **dtype**: string

##### Before clean applied

<StringArray>
['Urdu', 'English']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Urdu', 'English']
Length: 2, dtype: string


#### **column:** Main language ----- **dtype**: string

##### Before clean applied

<StringArray>
['Pashto', 'Punjabi', 'Urdu', 'Sindhi', 'Other', 'Balochi', 'Saraiki']
Length: 7, dtype: string


##### After clean applied

<StringArray>
['Pashto', 'Punjabi', 'Urdu', 'Sindhi', 'Other', 'Balochi', 'Saraiki']
Length: 7, dtype: string


#### **column:** Mean household size ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


#### **column:** Division ----- **dtype**: string

##### Before clean applied

<StringArray>
[                         'Peshawar',                        'Faisalabad',
                           'Karachi',                            'Lahore',
 'Sukkur incl. Shaheed Benazir Abad',                        'Gujranwala',
                         'Hyderabad',                         'Islamabad',
                        'Rawalpindi',                            'Quetta',
                            'Multan',                           'Larkana',
                            'Mardan',                            'Hazara',
                             'Sibbi',                         'Nasirabad',
                         'D.G. Khan',                          'Malakand',
                          'Sargodha',                            'Makran',
                             'Kalat',                              'Zhob',
                             'Kohat',                       'Mirpur Khas',
                        'Bahawalpur',                           'Sahiwal',
           

##### After clean applied

<StringArray>
[                         'Peshawar',                        'Faisalabad',
                           'Karachi',                            'Lahore',
 'Sukkur incl. shaheed benazir abad',                        'Gujranwala',
                         'Hyderabad',                         'Islamabad',
                        'Rawalpindi',                            'Quetta',
                            'Multan',                           'Larkana',
                            'Mardan',                            'Hazara',
                             'Sibbi',                         'Nasirabad',
                         'D.g. khan',                          'Malakand',
                          'Sargodha',                            'Makran',
                             'Kalat',                              'Zhob',
                             'Kohat',                       'Mirpur khas',
                        'Bahawalpur',                           'Sahiwal',
           

#### **column:** Province / Region ----- **dtype**: string

##### Before clean applied

<StringArray>
['KP', 'Punjab', 'Sindh', 'Balochistan']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Kp', 'Punjab', 'Sindh', 'Balochistan']
Length: 4, dtype: string


#### **column:** Phone ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
[            'own phone',                'Parent',         'Husband/wives',
 'Dont use mobile phone',               'Sibling',              'Children',
       'Other man/women']
Length: 7, dtype: string


##### After clean applied

<StringArray>
[             'Own phone',                 'Parent',          'Husband/Wives',
 'Don’t use mobile phone',                'Sibling',               'Children',
        'Other man/Women']
Length: 7, dtype: string


#### **column:** Independent phone use ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Somewhat', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Somewhat', 'No']
Length: 3, dtype: string


#### **column:** Informal financial usage ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Mobile Money ----- **dtype**: string

##### Before clean applied

<StringArray>
['yes', 'no']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


#### **column:** Bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['own', 'unbanked', 'spouse', 'other fam', 'other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Own', 'Unbanked', 'Spouse', 'Other fam', 'Other']
Length: 5, dtype: string


#### **column:** Permission to open bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Money taken from account ----- **dtype**: string

##### Before clean applied

<StringArray>
['money not taken', 'dont have account', 'money taken']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Money not taken', 'Don’t have account', 'Money taken']
Length: 3, dtype: string


#### **column:** Savings goal decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['self alone', 'no savings goal', 'others on behalf', 'decide together']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Self alone', 'No savings goal', 'Others on behalf', 'Decide together']
Length: 4, dtype: string


#### **column:** Savings goal influence ----- **dtype**: string

##### Before clean applied

<StringArray>
[            'always accepted',             'no savings goal',
             'always rejected', 'sometimes accepted/rejected',
                'dont consult']
Length: 5, dtype: string


##### After clean applied

<StringArray>
[            'Always accepted',             'No savings goal',
             'Always rejected', 'Sometimes accepted/Rejected',
               'Don’t consult']
Length: 5, dtype: string


#### **column:** Land ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['dont own', 'own by self', 'own with others']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Don’t own', 'Own by self', 'Own with others']
Length: 3, dtype: string


#### **column:** Land decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[           'Dont own land',       'Decide with others',
 'Others decide without me',             'Decide alone']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[          'Don’t own land',       'Decide with others',
 'Others decide without me',             'Decide alone']
Length: 4, dtype: string


#### **column:** Children decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[           'decide alone',             'Not married',
         'decide together', 'We do not talk about it',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[           'Decide alone',             'Not married',
         'Decide together', 'We do not talk about it',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


#### **column:** Earning frequency ----- **dtype**: string

##### Before clean applied

<StringArray>
[        'Monthly',     'I dont work',           'Daily', 'Weekly/biweekly',
    'Infrequently',     'Work unpaid']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[        'Monthly',    'I don’t work',           'Daily', 'Weekly/Biweekly',
    'Infrequently',     'Work unpaid']
Length: 6, dtype: string


#### **column:** Spending decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['Fully on own', 'Someone else decides for me', 'Help from someone else']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Fully on own', 'Someone else decides', 'Help from someone else']
Length: 3, dtype: string


#### **column:** Permission to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


#### **column:** Main income earner ----- **dtype**: string

##### Before clean applied

<StringArray>
['respondent', 'other fam', 'spouse', 'dont know', 'other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Respondent', 'Other fam', 'Spouse', 'Don’t know', 'Other']
Length: 5, dtype: string


#### **column:** Migrant worker identity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Other family', 'No-one, all live together', 'Spouse', 'Me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Other family', 'No-one, all live together', 'Spouse', 'Me']
Length: 4, dtype: string


#### **column:** Possibility to raise 1/20 GNI (Gross National Income) ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Somewhat possible', 'Not at all possible',   'Not very possible',
       'Very possible']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[  'Somewhat possible', 'Not at all possible',   'Not very possible',
       'Very possible']
Length: 4, dtype: string


#### **column:** Income frequency of spouse ----- **dtype**: string

##### Before clean applied

<StringArray>
[         'dont work',          'no spouse',              'Daily',
        'Work unpaid',            'Monthly',    'Weekly/biweekly',
          'dont know', 'When they get work']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[        'Don’t work',          'No spouse',              'Daily',
        'Work unpaid',            'Monthly',    'Weekly/Biweekly',
         'Don’t know', 'When they get work']
Length: 8, dtype: string


#### **column:** Allow spouse to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['No', 'Not married', 'Yes']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['No', 'Not married', 'Yes']
Length: 3, dtype: string


## uga2020

Unnamed: 0,InstanceID,finalweight,Gender,Marital / relationship status,Urbanicity,Age Group,Education,English literacy,Own a phone,$2.50 PPP Poverty line,Language of interview,Main language,Mean household size,Province / Region,Phone ownership,Independent phone use,Informal financial usage,Mobile Money,Bank,Permission to open bank,Money taken from account,Savings goal decision maker,Savings goal influence,Land ownership,Land decision maker,Children decision maker,Earning frequency,Spending decision maker,Permission to work,Main income earner,Migrant worker identity,Possibility to raise 1/20 GNI (Gross National Income),Income frequency of spouse,Allow spouse to work
0,309188,6212.304619,Female,Married/relationship,Urban,18-24,Secondary education,Excellent,own phone,Above poverty line,English,Luganda,Below mean size,Central,own phone/business/employer,Yes,no,Yes,unbanked,no,money not taken,no savings goal,no savings goal,dont own,dont own land,decide together,Work unpaid,Fully on own,Yes,other fam,"No-one, all live together",Very possible,Daily,Yes
1,309194,9193.278098,Female,Married/relationship,Urban,35-44,No formal education,Not at all,own phone,Above poverty line,English,Runyakole/Rukiga/Runyakitara/Runyoro/Rutooro,Below mean size,Central,own phone/business/employer,Yes,no,Yes,own,no,money not taken,self alone,dont consult,dont own,dont own land,decide alone,Daily,Fully on own,Already working,respondent,"No-one, all live together",Very possible,dont know,Yes


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InstanceID,3065,,,,312884.0,2337.89,309188.0,311054.0,312366.0,314158.0,320008.0
finalweight,3065,,,,11203.2,4946.02,1364.23,7468.77,10363.3,14123.9,39179.5
Gender,3065,2.0,Female,1863.0,,,,,,,
Marital / relationship status,3065,2.0,Married/relationship,2184.0,,,,,,,
Urbanicity,3065,2.0,Rural,2189.0,,,,,,,
Age Group,3065,5.0,25-34,950.0,,,,,,,
Education,3065,5.0,Primary education,1568.0,,,,,,,
English literacy,3065,5.0,Not at all,1110.0,,,,,,,
Own a phone,3065,2.0,own phone,1973.0,,,,,,,
$2.50 PPP Poverty line,3065,2.0,Below poverty line,1731.0,,,,,,,


### Columns descriptions

#### **column:** InstanceID ----- **dtype**: Int64

#### **column:** finalweight ----- **dtype**: float64

#### **column:** Gender ----- **dtype**: string

##### Before clean applied

<StringArray>
['Female', 'Male']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Female', 'Male']
Length: 2, dtype: string


#### **column:** Marital / relationship status ----- **dtype**: string

##### Before clean applied

<StringArray>
['Married/relationship', 'Not married/relationship']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Married/Relationship', 'Not married/Relationship']
Length: 2, dtype: string


#### **column:** Urbanicity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


#### **column:** Age Group ----- **dtype**: string

##### Before clean applied

<StringArray>
['18-24', '35-44', '25-34', '55+', '45-54']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['18-24', '35-44', '25-34', '55+', '45-54']
Length: 5, dtype: string


#### **column:** Education ----- **dtype**: string

##### Before clean applied

<StringArray>
['Secondary education', 'No formal education',    'Higher education',
   'Primary education',               'Other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Secondary education', 'No formal education',    'Higher education',
   'Primary education',               'Other']
Length: 5, dtype: string


#### **column:** English literacy ----- **dtype**: string

##### Before clean applied

<StringArray>
['Excellent', 'Not at all', 'Good', 'Fair', 'Poorly']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Excellent', 'Not at all', 'Good', 'Fair', 'Poorly']
Length: 5, dtype: string


#### **column:** Own a phone ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone', 'dont own phone']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Own phone', 'Don’t own phone']
Length: 2, dtype: string


#### **column:** $2.50 PPP Poverty line ----- **dtype**: string

##### Before clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


#### **column:** Language of interview ----- **dtype**: string

##### Before clean applied

<StringArray>
[                                     'English',
                                      'Luganda',
                                        'Ateso',
 'Runyakole/Rukiga/Runyakitara/Runyoro/Rutooro',
                                      'Lugbara',
                                         'Madi',
                                      'Lugishu',
                                          'Luo',
                                  'nkaromojong',
                                    'Kupsabiny']
Length: 10, dtype: string


##### After clean applied

<StringArray>
[                                     'English',
                                      'Luganda',
                                        'Ateso',
 'Runyakole/Rukiga/Runyakitara/Runyoro/Rutooro',
                                      'Lugbara',
                                         'Madi',
                                      'Lugishu',
                                          'Luo',
                                  'Nkaromojong',
                                    'Kupsabiny']
Length: 10, dtype: string


#### **column:** Main language ----- **dtype**: string

##### Before clean applied

<StringArray>
[                                     'Luganda',
 'Runyakole/Rukiga/Runyakitara/Runyoro/Rutooro',
                                      'English',
                                       'Lusoga',
                                        'Ateso',
                                      'Lukonzo',
                                      'Lugishu',
                                      'Lugbara',
                                      'Lugwere',
                                         'Madi',
                                        'Other',
                                          'Luo',
                            'Alur / Dhopadhola',
                                  'nkaromojong',
                                    'Kupsabiny']
Length: 15, dtype: string


##### After clean applied

<StringArray>
[                                     'Luganda',
 'Runyakole/Rukiga/Runyakitara/Runyoro/Rutooro',
                                      'English',
                                       'Lusoga',
                                        'Ateso',
                                      'Lukonzo',
                                      'Lugishu',
                                      'Lugbara',
                                      'Lugwere',
                                         'Madi',
                                        'Other',
                                          'Luo',
                            'Alur / dhopadhola',
                                  'Nkaromojong',
                                    'Kupsabiny']
Length: 15, dtype: string


#### **column:** Mean household size ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below mean size', 'Mean', 'Above mean size']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Below mean size', 'Mean', 'Above mean size']
Length: 3, dtype: string


#### **column:** Province / Region ----- **dtype**: string

##### Before clean applied

<StringArray>
['Central', 'Eastern', 'Western', 'Northern']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Central', 'Eastern', 'Western', 'Northern']
Length: 4, dtype: string


#### **column:** Phone ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone/business/employer',                     'Sibling',
       'Dont use mobile phone',               'Husband/wives',
                      'Parent',                    'Children',
             'Other man/women']
Length: 7, dtype: string


##### After clean applied

<StringArray>
['Own phone/Business/Employer',                     'Sibling',
      'Don’t use mobile phone',               'Husband/Wives',
                      'Parent',                    'Children',
             'Other man/Women']
Length: 7, dtype: string


#### **column:** Independent phone use ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No', 'Somewhat']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No', 'Somewhat']
Length: 3, dtype: string


#### **column:** Informal financial usage ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Mobile Money ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


#### **column:** Bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['unbanked', 'own', 'spouse', 'other fam', 'other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Unbanked', 'Own', 'Spouse', 'Other fam', 'Other']
Length: 5, dtype: string


#### **column:** Permission to open bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Money taken from account ----- **dtype**: string

##### Before clean applied

<StringArray>
['money not taken', 'money taken', 'dont have account']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Money not taken', 'Money taken', 'Don’t have account']
Length: 3, dtype: string


#### **column:** Savings goal decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['no savings goal', 'self alone', 'others on behalf', 'decide together']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['No savings goal', 'Self alone', 'Others on behalf', 'Decide together']
Length: 4, dtype: string


#### **column:** Savings goal influence ----- **dtype**: string

##### Before clean applied

<StringArray>
[            'no savings goal',                'dont consult',
             'always accepted', 'sometimes accepted/rejected',
             'always rejected']
Length: 5, dtype: string


##### After clean applied

<StringArray>
[            'No savings goal',               'Don’t consult',
             'Always accepted', 'Sometimes accepted/Rejected',
             'Always rejected']
Length: 5, dtype: string


#### **column:** Land ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['dont own', 'own by self', 'own with others']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Don’t own', 'Own by self', 'Own with others']
Length: 3, dtype: string


#### **column:** Land decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[           'dont own land',             'decide alone',
       'decide with others', 'others decide without me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[          'Don’t own land',             'Decide alone',
       'Decide with others', 'Others decide without me']
Length: 4, dtype: string


#### **column:** Children decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[        'decide together',            'decide alone',
     'Not in relationship', 'We do not talk about it',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[        'Decide together',            'Decide alone',
     'Not in relationship', 'We do not talk about it',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


#### **column:** Earning frequency ----- **dtype**: string

##### Before clean applied

<StringArray>
[    'Work unpaid',           'Daily', 'When I get work',         'Monthly',
 'Weekly/biweekly',     'I dont work']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[    'Work unpaid',           'Daily', 'When I get work',         'Monthly',
 'Weekly/Biweekly',    'I don’t work']
Length: 6, dtype: string


#### **column:** Spending decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['Fully on own', 'Help from someone else', 'Someone else decides for me']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Fully on own', 'Help from someone else', 'Someone else decides']
Length: 3, dtype: string


#### **column:** Permission to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Already working', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Already working', 'No']
Length: 3, dtype: string


#### **column:** Main income earner ----- **dtype**: string

##### Before clean applied

<StringArray>
['other fam', 'respondent', 'other', 'spouse', 'dont know']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Other fam', 'Respondent', 'Other', 'Spouse', 'Don’t know']
Length: 5, dtype: string


#### **column:** Migrant worker identity ----- **dtype**: string

##### Before clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


#### **column:** Possibility to raise 1/20 GNI (Gross National Income) ----- **dtype**: string

##### Before clean applied

<StringArray>
[      'Very possible', 'Not at all possible',   'Somewhat possible',
   'Not very possible']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[      'Very possible', 'Not at all possible',   'Somewhat possible',
   'Not very possible']
Length: 4, dtype: string


#### **column:** Income frequency of spouse ----- **dtype**: string

##### Before clean applied

<StringArray>
[             'Daily',          'dont know',          'no spouse',
            'Monthly',    'Weekly/biweekly',          'dont work',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[             'Daily',         'Don’t know',          'No spouse',
            'Monthly',    'Weekly/Biweekly',         'Don’t work',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


#### **column:** Allow spouse to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Not married', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Not married', 'No']
Length: 3, dtype: string


## tza2020

Unnamed: 0,InstanceID,finalweight,Gender,Marital / relationship status,Urbanicity,Age Group,Education,Swahili literacy,Own a phone,$2.50 PPP Poverty line,Language of interview,Main language,Mean household size,Province / Region,Phone ownership,Independent phone use,Informal financial usage,Mobile Money,Bank,Permission to open bank,Money taken from account,Savings goal decision maker,Savings goal influence,Land ownership,Land decision maker,Children decision maker,Earning frequency,Spending decision maker,Permission to work,Main income earner,Migrant worker identity,Possibility to raise 1/20 GNI (Gross National Income),Income frequency of spouse,Allow spouse to work
0,427138,7590.642044,Female,Married/relationship,Urban,25-34,Primary Education,Excellent,own phone,above poverty line,Swahili,Swahili,Below mean size,Dar es Salaam,own phone/business/employer,Yes,no,Yes,own,no,money not taken,self alone,always accepted,dont own,dont own land,decide together,Daily,Fully on own,Already working,respondent,Spouse,Somewhat possible,Weekly/biweekly,Yes
1,427140,5770.58158,Female,Not married/relationship,Urban,18-24,Secondary education,Excellent,own phone,below poverty line,Swahili,Swahili,Above mean size,Dar es Salaam,own phone/business/employer,Yes,no,Yes,unbanked,no,money not taken,no savings goal,no savings goal,dont own,dont own land,Not in relationship,I dont work,Fully on own,Yes,other fam,Other family,Somewhat possible,no spouse,Not married


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InstanceID,3016,,,,432994.0,2317.33,427138.0,431524.0,433202.0,434628.0,438248.0
finalweight,3016,,,,7418.42,12459.1,11.3819,963.197,3422.78,9058.52,168821.0
Gender,3016,2.0,Female,1656.0,,,,,,,
Marital / relationship status,3016,2.0,Married/relationship,2132.0,,,,,,,
Urbanicity,3016,2.0,rural,1961.0,,,,,,,
Age Group,3016,5.0,25-34,1089.0,,,,,,,
Education,3016,4.0,Primary Education,1445.0,,,,,,,
Swahili literacy,3016,5.0,Good,1550.0,,,,,,,
Own a phone,3016,2.0,own phone,2425.0,,,,,,,
$2.50 PPP Poverty line,3016,2.0,above poverty line,1708.0,,,,,,,


### Columns descriptions

#### **column:** InstanceID ----- **dtype**: Int64

#### **column:** finalweight ----- **dtype**: float64

#### **column:** Gender ----- **dtype**: string

##### Before clean applied

<StringArray>
['Female', 'Male']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Female', 'Male']
Length: 2, dtype: string


#### **column:** Marital / relationship status ----- **dtype**: string

##### Before clean applied

<StringArray>
['Married/relationship', 'Not married/relationship']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Married/Relationship', 'Not married/Relationship']
Length: 2, dtype: string


#### **column:** Urbanicity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Urban', 'rural']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


#### **column:** Age Group ----- **dtype**: string

##### Before clean applied

<StringArray>
['25-34', '18-24', '35-44', '45-54', '55+']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['25-34', '18-24', '35-44', '45-54', '55+']
Length: 5, dtype: string


#### **column:** Education ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Primary Education', 'Secondary education',    'Higher education',
 'No formal education']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[  'Primary education', 'Secondary education',    'Higher education',
 'No formal education']
Length: 4, dtype: string


#### **column:** Swahili literacy ----- **dtype**: string

##### Before clean applied

<StringArray>
['Excellent', 'Good', 'Fair', 'Not at all', 'Poorly']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Excellent', 'Good', 'Fair', 'Not at all', 'Poorly']
Length: 5, dtype: string


#### **column:** Own a phone ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone', 'dont own phone']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Own phone', 'Don’t own phone']
Length: 2, dtype: string


#### **column:** $2.50 PPP Poverty line ----- **dtype**: string

##### Before clean applied

<StringArray>
['above poverty line', 'below poverty line']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


#### **column:** Language of interview ----- **dtype**: string

##### Before clean applied

<StringArray>
['Swahili', 'English']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Swahili', 'English']
Length: 2, dtype: string


#### **column:** Main language ----- **dtype**: string

##### Before clean applied

<StringArray>
[   'Swahili',     'Chagga',   'Kiluguru',    'Kizigua',     'Others',
     'Kihaya',       'Kiha',  'Kimakonde',     'Sukuma',  'Kinyatura',
     'Kijita',     'Kigogo',    'English',   'Nyamwezi', 'Kinyakyusa',
    'Kisafwa',    'Kimasai',      'Kurya',  'Kimatengo',     'Kifipa',
     'Kiiraq',    'Kirangi',    'Kikinga',   'Kinyambo']
Length: 24, dtype: string


##### After clean applied

<StringArray>
[   'Swahili',     'Chagga',   'Kiluguru',    'Kizigua',     'Others',
     'Kihaya',       'Kiha',  'Kimakonde',     'Sukuma',  'Kinyatura',
     'Kijita',     'Kigogo',    'English',   'Nyamwezi', 'Kinyakyusa',
    'Kisafwa',    'Kimasai',      'Kurya',  'Kimatengo',     'Kifipa',
     'Kiiraq',    'Kirangi',    'Kikinga',   'Kinyambo']
Length: 24, dtype: string


#### **column:** Mean household size ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


#### **column:** Province / Region ----- **dtype**: string

##### Before clean applied

<StringArray>
['Dar es Salaam',      'Morogoro',        'Mtwara',         'Tanga',
         'Rukwa',        'Mwanza',   'Kilimanjaro',        'Tabora',
        'Kigoma',        'Arusha',          'Mara',     'Shinyanga',
         'Geita',        'Simiyu',  'Mbeya/Songwe',        'Iringa',
        'Dodoma',         'Lindi',        'Katavi',        'Kagera',
        'Ruvuma',        'Njombe',    'Urban West',       'Manyara',
         'Pwani',   'South Pemba',       'Singida',   'North Pemba',
  'South Unguja',  'North Unguja']
Length: 30, dtype: string


##### After clean applied

<StringArray>
['Dar es salaam',      'Morogoro',        'Mtwara',         'Tanga',
         'Rukwa',        'Mwanza',   'Kilimanjaro',        'Tabora',
        'Kigoma',        'Arusha',          'Mara',     'Shinyanga',
         'Geita',        'Simiyu',  'Mbeya/Songwe',        'Iringa',
        'Dodoma',         'Lindi',        'Katavi',        'Kagera',
        'Ruvuma',        'Njombe',    'Urban west',       'Manyara',
         'Pwani',   'South pemba',       'Singida',   'North pemba',
  'South unguja',  'North unguja']
Length: 30, dtype: string


#### **column:** Phone ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone/business/employer',               'Husband/wives',
             'Other man/women',       'Dont use mobile phone',
                      'Parent',                     'Sibling',
                    'Children']
Length: 7, dtype: string


##### After clean applied

<StringArray>
['Own phone/Business/Employer',               'Husband/Wives',
             'Other man/Women',      'Don’t use mobile phone',
                      'Parent',                     'Sibling',
                    'Children']
Length: 7, dtype: string


#### **column:** Independent phone use ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Somewhat', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Somewhat', 'No']
Length: 3, dtype: string


#### **column:** Informal financial usage ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Mobile Money ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Yes', 'No']
Length: 2, dtype: string


#### **column:** Bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['own', 'unbanked', 'spouse', 'other', 'other fam']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Own', 'Unbanked', 'Spouse', 'Other', 'Other fam']
Length: 5, dtype: string


#### **column:** Permission to open bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Money taken from account ----- **dtype**: string

##### Before clean applied

<StringArray>
['money not taken', 'dont have account', 'money taken']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Money not taken', 'Don’t have account', 'Money taken']
Length: 3, dtype: string


#### **column:** Savings goal decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['self alone', 'no savings goal', 'decide together', 'others on behalf']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Self alone', 'No savings goal', 'Decide together', 'Others on behalf']
Length: 4, dtype: string


#### **column:** Savings goal influence ----- **dtype**: string

##### Before clean applied

<StringArray>
[            'always accepted',             'no savings goal',
                'dont consult', 'sometimes accepted/rejected',
             'always rejected']
Length: 5, dtype: string


##### After clean applied

<StringArray>
[            'Always accepted',             'No savings goal',
               'Don’t consult', 'Sometimes accepted/Rejected',
             'Always rejected']
Length: 5, dtype: string


#### **column:** Land ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['dont own', 'own with others', 'own by self']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Don’t own', 'Own with others', 'Own by self']
Length: 3, dtype: string


#### **column:** Land decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[           'dont own land',       'decide with others',
             'decide alone', 'others decide without me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[          'Don’t own land',       'Decide with others',
             'Decide alone', 'Others decide without me']
Length: 4, dtype: string


#### **column:** Children decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[        'decide together',     'Not in relationship',
 'We do not talk about it',            'decide alone',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[        'Decide together',     'Not in relationship',
 'We do not talk about it',            'Decide alone',
          'Spouse decides',           'Others decide']
Length: 6, dtype: string


#### **column:** Earning frequency ----- **dtype**: string

##### Before clean applied

<StringArray>
[          'Daily',     'I dont work',         'Monthly', 'Weekly/biweekly',
 'When I get work',     'Work unpaid']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[          'Daily',    'I don’t work',         'Monthly', 'Weekly/Biweekly',
 'When I get work',     'Work unpaid']
Length: 6, dtype: string


#### **column:** Spending decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['Fully on own', 'Help from someone else', 'Someone else decides for me']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Fully on own', 'Help from someone else', 'Someone else decides']
Length: 3, dtype: string


#### **column:** Permission to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


#### **column:** Main income earner ----- **dtype**: string

##### Before clean applied

<StringArray>
['respondent', 'other fam', 'spouse', 'other', 'dont know', 'earn equally']
Length: 6, dtype: string


##### After clean applied

<StringArray>
['Respondent', 'Other fam', 'Spouse', 'Other', 'Don’t know', 'Earn equally']
Length: 6, dtype: string


#### **column:** Migrant worker identity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Spouse', 'Other family', 'No-one, all live together', 'Me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Spouse', 'Other family', 'No-one, all live together', 'Me']
Length: 4, dtype: string


#### **column:** Possibility to raise 1/20 GNI (Gross National Income) ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Somewhat possible', 'Not at all possible',   'Not very possible',
       'Very possible']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[  'Somewhat possible', 'Not at all possible',   'Not very possible',
       'Very possible']
Length: 4, dtype: string


#### **column:** Income frequency of spouse ----- **dtype**: string

##### Before clean applied

<StringArray>
[   'Weekly/biweekly',          'no spouse',        'Work unpaid',
              'Daily',          'dont know',            'Monthly',
 'When they get work',          'dont work']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[   'Weekly/Biweekly',          'No spouse',        'Work unpaid',
              'Daily',         'Don’t know',            'Monthly',
 'When they get work',         'Don’t work']
Length: 8, dtype: string


#### **column:** Allow spouse to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Not married', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Not married', 'No']
Length: 3, dtype: string


In [38]:
## Compare the column names across the different datasets to help harmonize
## the results before uploading. The output lists each dataset's set of
## columns and, for the columns that match, the values found in each dataset.
compareDatasets(datasets)

{'Poverty line', 'Permission to open bank', 'English literacy', 'Permission to work', 'Age', 'Main income earner', 'Savings goal influence', 'Geographic Area', 'Allow spouse to work', 'Province / Region', 'Land decision maker', 'finalweight', 'Spending decision maker', 'Mean household size', 'Land ownership', 'Earning frequency', 'Income frequency of spouse', 'Marital Status', 'Mobile Money', 'InstanceID', 'Gender', 'Level of education', 'Migrant worker identity', 'Savings goal decision maker', 'Language of interview', 'Own a phone', 'Bank', 'Possibility to raise 1/20 GNI (Gross National Income)', 'Children decision maker', 'Independent phone use', 'Money taken from account', 'Phone ownership', 'Main language', 'Informal financial usage'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 34 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 

None

{'Poverty line', 'Permission to open bank', 'Permission to work', 'Age', 'Main income earner', 'Savings goal influence', 'Read & write in any language', 'Geographic Area', 'Allow spouse to work', 'Province / Region', 'Land decision maker', 'Division', 'finalweight', 'Spending decision maker', 'Mean household size', 'Land ownership', 'Earning frequency', 'Income frequency of spouse', 'Marital Status', 'Mobile Money', 'InstanceID', 'Gender', 'Level of education', 'Migrant worker identity', 'Savings goal decision maker', 'Language of interview', 'Own a phone', 'Bank', 'Possibility to raise 1/20 GNI (Gross National Income)', 'Children decision maker', 'Independent phone use', 'Money taken from account', 'Phone ownership', 'Main language', 'Informal financial usage'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3668 entries, 0 to 3667
Data columns (total 35 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                         

None

{'Poverty line', 'Permission to open bank', 'English literacy', 'Permission to work', 'Age', 'Main income earner', 'Savings goal influence', 'Geographic Area', 'Allow spouse to work', 'Province / Region', 'Land decision maker', 'finalweight', 'Spending decision maker', 'Mean household size', 'Land ownership', 'Earning frequency', 'Income frequency of spouse', 'Marital Status', 'Mobile Money', 'InstanceID', 'Gender', 'Level of education', 'Migrant worker identity', 'Savings goal decision maker', 'Language of interview', 'Own a phone', 'Bank', 'Possibility to raise 1/20 GNI (Gross National Income)', 'Children decision maker', 'Independent phone use', 'Money taken from account', 'Phone ownership', 'Main language', 'Informal financial usage'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3065 entries, 0 to 3064
Data columns (total 34 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 

None

{'Poverty line', 'Permission to open bank', 'Permission to work', 'Age', 'Main income earner', 'Savings goal influence', 'Geographic Area', 'Allow spouse to work', 'Province / Region', 'Land decision maker', 'finalweight', 'Spending decision maker', 'Mean household size', 'Land ownership', 'Earning frequency', 'Income frequency of spouse', 'Marital Status', 'Mobile Money', 'InstanceID', 'Gender', 'Level of education', 'Migrant worker identity', 'Savings goal decision maker', 'Language of interview', 'Own a phone', 'Bank', 'Possibility to raise 1/20 GNI (Gross National Income)', 'Children decision maker', 'Independent phone use', 'Money taken from account', 'Phone ownership', 'Main language', 'Informal financial usage', 'Swahili literacy'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3016 entries, 0 to 3015
Data columns (total 34 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 

None

Good matched columns
- Poverty line
> ken2020: <StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string
> pak2020: <StringArray>
['Below poverty line', 'Above poverty line']
Length: 2, dtype: string
- Permission to open bank
> ken2020: <StringArray>
['No', 'Yes']
Length: 2, dtype: string
> pak2020: <StringArray>
['No', 'Yes']
Length: 2, dtype: string
- Permission to work
> ken2020: <StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string
> pak2020: <StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string
- Age
> ken2020: <StringArray>
['25-34', '35-44', '18-24', '55+', '45-54']
Length: 5, dtype: string
> pak2020: <StringArray>
['55+', '45-54', '35-44', '18-24', '25-34']
Length: 5, dtype: string
- Main income earner
> ken2020: <StringArray>
['Respondent', 'Spouse', 'Other fam', 'Don’t know', 'Other', 'Earn equally']
Length: 6, dtype: string
> pak2020: <StringArray>
['Respondent', 'Other fam', 'Spouse', 'Don’t know', 'Other']
Lengt

In [39]:
## Once the checks above look right, upload every dataset.
## For each dataset: save it, create or patch its country, then upload the data.
for dataset in datasets:
    print('///////////////////////////////////////')
    # The value returned by saveData is the input of both calls below.
    myInfo = saveData(dataset, OutPath, env='staging', section='mobile')

    print('Creating or patching the country')
    countryResponse = upsertCountry(myInfo, auth)
    print(countryResponse)

    print('______________________________')
    print('Uploading the data')
    uploadResult = uploadDataFunction(myInfo, auth)
    print(uploadResult)

///////////////////////////////////////
Population over which it was estimated
_________________  ken2020
Mujeres:  11265010.970281385
Hombres:  10705085.832735473
Total:  21970096.803016882
_________________ 
Creating or patching the country
Country Kenya updated succesfully
<Response [200]>
______________________________
Uploading the data
{'ok': 1}
///////////////////////////////////////
Population over which it was estimated
_________________  pak2020
Mujeres:  32090667.84606279
Hombres:  33668897.154271
Total:  65759565.000333816
_________________ 
Creating or patching the country
Country Pakistan updated succesfully
<Response [200]>
______________________________
Uploading the data
{'ok': 1}
///////////////////////////////////////
Population over which it was estimated
_________________  uga2020
Mujeres:  18576699.48697699
Hombres:  15761007.512974974
Total:  34337706.999951966
_________________ 
Creating or patching the country
Country Uganda updated succesfully
<Response [200]>