# Process to upload data.

## Steps to follow to make an import:
1.- Check the data: format, column labeling, data types, typos inside data categories for each indicator etc.  
2.- Once we are happy with the data we need to produce the matchJson for indicators; this JSON maps the indicator column names to the harmonized ones we have in the DB.


In [1]:
import os
import zipfile
import pandas as pd
import requests
import numpy as np
import itertools
import difflib
import json
import re
import getpass

import pycountry
from IPython.display import display, Markdown
# Show every row and column when a DataFrame is displayed in the notebook.
# The full option names are used: an abbreviated pattern only works while it
# matches exactly one pandas option.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Directory the notebook was started from.
basePath = os.getcwd()

#### management tips
* Not all datasets will have the same columns.  
* Always trim and replace empty values with null ones.  

#### Functions for management

In [15]:
def capitalize(matchobj):
    """
    Return the whole text of a regex match in upper case.
    Meant to be used as the replacement callback of a regex substitution.
    """
    matched_text = matchobj.group()
    return matched_text.upper()

def _cleanStringColumn(column):
    """
    Apply the preliminary text clean to a string column and return the cleaned column.
    """
    # regex=True / regex=False are passed explicitly: recent pandas versions
    # default to regex=False and reject compiled patterns or callable
    # replacements unless regex=True is given.
    return (
        column.str.strip()
        .str.strip(',')
        .str.lower()
        .str.capitalize()
        .str.replace(re.compile(r'( years)$', flags=re.IGNORECASE), '', regex=True)
        .str.replace(re.compile(r'(Urbain)', flags=re.IGNORECASE), 'Urban', regex=True)
        .str.replace(re.compile(r'(Above 60)', flags=re.IGNORECASE), '>60', regex=True)
        .str.replace(re.compile(r'(Under 18)', flags=re.IGNORECASE), '<18', regex=True)
        # Upper-case the letter that follows a slash ("a/b" -> "a/B").
        .str.replace(re.compile(r'/[a-zA-Z]'), capitalize, regex=True)
        .str.replace('Dont know', 'Don’t know', regex=False)
        .str.replace('Informally', 'Informal', regex=False)
    )


def checkDataset(datasetPath, iso, year, kargs=None, clean=True):
    """
    Open a dataset, display an overview of it and optionally apply a preliminary clean.

    The overview shows the first rows, ``describe()`` and, for every column,
    its dtype and (when it has fewer than 50 distinct values) its unique values.

    datasetPath: path to a .csv or .xlsx file (the extension is matched case-insensitively).
    iso: country code; it is lower-cased and joined with ``year`` to label the dataset.
    year: year of the dataset, used only for the label.
    kargs: keyword arguments forwarded to the pandas reader;
           defaults to {'decimal': ',', 'sep': ';'}.
    clean: when True, string columns are trimmed, re-capitalized and a few
           known label variants are normalized (see _cleanStringColumn).

    Returns [dataset, dataset_columns, 'isoYEAR'] where dataset_columns is the
    set of column names.
    Raises ValueError if the file is neither csv nor xlsx.
    """
    if kargs is None:
        kargs = {'decimal': ',', 'sep': ';'}

    extension = os.path.splitext(datasetPath)[1].lower()
    if extension == '.xlsx':
        # read_excel has no field-separator option, so csv-only keys are dropped
        # instead of failing with "unexpected keyword argument".
        excel_kargs = {key: value for key, value in kargs.items()
                       if key not in ('sep', 'delimiter')}
        dataset = pd.read_excel(datasetPath, **excel_kargs).convert_dtypes()
    elif extension == '.csv':
        dataset = pd.read_csv(datasetPath, **kargs, low_memory=False).convert_dtypes()
    else:
        raise ValueError('format not allowed')

    dataset_columns = set(dataset.columns.sort_values(ascending=True).values)
    display(Markdown(f'## {iso.lower()}{year}'))
    display(dataset.head(2))
    display(dataset.describe(include='all').transpose())
    display(Markdown(f'### Columns descriptions'))
    for col in dataset.columns.values:
        display(Markdown(f'#### **column:** {col} ----- **dtype**: {dataset[col].dtype}'))
        if len(dataset[col].unique()) < 50:
            display(Markdown(f'##### Before clean applied'))
            print(dataset[col].unique())
        ## This is for preliminary clean
        if pd.api.types.is_string_dtype(dataset[col]) and clean:
            dataset[col] = _cleanStringColumn(dataset[col])
            display(Markdown(f'##### After clean applied'))
            print(dataset[col].unique())

    return [dataset, dataset_columns, f'{iso.lower()}{year}']

def calculatePopulation(data):
    """
    Print the weighted population of a dataset: women, men and overall total.

    data: [dataset, datasetCols, 'isoYEAR']; the dataset must have the
    'finalweight' and 'Gender' columns. Nothing is returned.
    """
    print('Population over which it was estimated')
    frame = data[0]
    weights = frame['finalweight']
    poblacion = sum(weights)

    # Weighted totals for the rows labelled 'Female' and 'Male' respectively.
    is_female = frame['Gender'] == 'Female'
    total_mujeres = sum(weights[is_female])
    is_male = frame['Gender'] == 'Male'
    total_hombres = sum(weights[is_male])

    separator = '_________________ '
    print(separator, f'{data[2]}')
    for label, amount in (('Mujeres: ', total_mujeres),
                          ('Hombres: ', total_hombres),
                          ('Total: ', poblacion)):
        print(label, amount)
    print(separator)
    

def compareDatasets(dfvaris:list, column_remap=None):
    """
    Rename columns and print a pairwise comparison of the datasets' columns.

    dfvaris: list of lists [[dataset, datasetCols, 'isoYEAR']]. Each dataset is
             renamed IN PLACE and its datasetCols entry is refreshed.
    column_remap: {old name: new name} mapping applied to every dataset;
                  when omitted a default mapping of known label variants is used.

    For every pair of datasets it prints the shared columns whose dtypes differ,
    the unique values of shared columns with fewer than 20 distinct values, and,
    for columns present in only one dataset, the most similar column name of
    the other one (when the similarity ratio is above 0.8).
    Nothing is returned.
    """
    if column_remap is None:
        column_remap = {'stratum': 'Stratum',
                        #'D3': 'M_D3',
                        'Allow spouse to work,,': 'Allow spouse to work',
                        ' Phone ownership': 'Phone ownership',
                        #'Gender': 'gender',
                        'Urban_Rural': 'Urb_Rur'}

    for data in dfvaris:
        ## Rename columns if needed
        data[0].rename(inplace=True, columns=column_remap)
        data[1] = set(data[0].columns.sort_values(ascending=True).values)
        print(data[1])
        display(data[0].info())

    # NOTE(review): both dictionaries are only filled and printed from here,
    # they are never returned.
    valuesCols = {}
    closetsCols = {}
    #################### this code compares pairs of column outputs to homogenize the data.
    for a, b in itertools.combinations(dfvaris, 2):
        al = set(map(str.lower, a[1]))
        bl = set(map(str.lower, b[1]))
        cl = al - bl   # lower-cased columns only present in a
        dl = bl - al   # lower-cased columns only present in b
        symCols = a[1] & b[1]
        print('=========================  ', a[2], ' - ', b[2])
        print('Good matched columns')
        for i in symCols:
            if str(a[0].dtypes[i]) != str(b[0].dtypes[i]):
                print('+++++++++++++++++++++')
                print(a[2],a[0][i].dtype,' different dtype ',b[2],b[0][i].dtype)
                print('+++++++++++++++++++++')

            if len(a[0][i].unique()) < 20 and len(b[0][i].unique()) < 20:
                print('-',i)

                print('> {0}: {1}'.format(a[2],a[0][i].unique()))
                print('> {0}: {1}'.format(b[2],b[0][i].unique()))
                r = difflib.get_close_matches(i, possibilities = valuesCols.keys(), n = 1, cutoff = 0.98)
                if not r:
                    valuesCols[i] = set()
                    closetsCols[i] = set()
                else:
                    print('*********')
                    print(r)
                    print('*********')

                    valuesCols[r[0]].update(a[0][i].unique())
                    valuesCols[r[0]].update(b[0][i].unique())
                    print(r[0])
                    print(i)
                    print(round(difflib.SequenceMatcher(None, r[0], i).ratio(),3))
                    # The close match may be a different key than i, so i is not
                    # guaranteed to be registered yet.
                    closetsCols.setdefault(i, set()).update(r)

        print("Columns that doesn't have a full match from both datasets", al^bl)
        if not dl:
            # Nothing to compare against: b has no unmatched column.
            continue
        for search in cl:
            matches = sorted(dl, key=lambda x: difflib.SequenceMatcher(None, x.lower(), search.lower()).ratio(), reverse=True)
            best_ratio = round(difflib.SequenceMatcher(None, matches[0], search).ratio(),3)
            if best_ratio > 0.8:
                print('--------')
                display(Markdown("**{0}** se compara con {1} el más parecido es {2} con un ratio de: {3}".format(search, matches, matches[0], best_ratio)))


#### Functions to upload to the api

Indicators need to be on the API first; to do so you need to edit these files:
* [National surveys](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/indicators.json)
* [MSME](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/msme-indicators.json)
* [Mobile surveys](https://github.com/Vizzuality/i2i-api/blob/develop/app/src/data/ms-indicators.json)

After editing the above files, make sure the indicators are categorized on the front end here:
* [National surveys](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal)
* [MSME](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal)
* [Mobile surveys](https://github.com/Vizzuality/i2i/blob/develop/app/assets/javascripts/collections/data_portal/ExploratorySurveyIndicatorsCollection.js)

In [3]:
"""
This is used to generate the matching json to upload a file, the column names should match with each object
"""
# Structure of the mapping:
#   "weightColumn": name of the dataset column used as weight
#                   (presumably the survey weight -- confirm against the API).
#   "indicators":   one entry per dataset column name, each holding the
#                   "indicatorId", "childIndicatorId" and "answerId" it maps to.
# Several column names point to the same indicatorId (e.g. "Gender" and "M_D3"
# both map to "gender").
# saveData() keeps only the entries whose column exists in the dataset being saved.
useJson =json.loads("""{
    "weightColumn": "finalweight",
    "indicators":{
        "Urb_Rur": {
            "indicatorId": "geographic_area",
            "childIndicatorId": null,
            "answerId": null
        },
        "Urbanicity": {
            "indicatorId": "geographic_area",
            "childIndicatorId": null,
            "answerId": null
        },
        "Province":{
            "indicatorId": "jurisdiction",
            "childIndicatorId": null,
            "answerId": null
        },
        "M_D3": {
            "indicatorId": "gender",
            "childIndicatorId": null,
            "answerId": null
        },
        "Gender": {
            "indicatorId": "gender",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Age": {
            "indicatorId": "age",
            "childIndicatorId": null,
            "answerId": null
        },
        "Age Group": {
            "indicatorId": "age",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Marital_Status":{
            "indicatorId": "i2i_Marital_Status",
            "childIndicatorId": null,
            "answerId": null
        },
        "Relationship status":{
            "indicatorId": "i2i_Marital_Status",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Education": {
            "indicatorId": "i2i_Education",
            "childIndicatorId": null,
            "answerId": null
        },
        "Education": {
            "indicatorId": "i2i_Education",
            "childIndicatorId": null,
            "answerId": null
        },
        "English literacy": {
            "indicatorId": "english_literacy",
            "childIndicatorId": null,
            "answerId": null
        },
        "Language of interview": {
            "indicatorId": "interview_lan",
            "childIndicatorId": null,
            "answerId": null
        },
        "Main language": {
            "indicatorId": "main_lan",
            "childIndicatorId": null,
            "answerId": null
        },
        "Mean household size": {
            "indicatorId": "household_size",
            "childIndicatorId": null,
            "answerId": null
        },
        "Own a phone": {
            "indicatorId": "own_phone",
            "childIndicatorId": null,
            "answerId": null
        },
        "Phone ownership": {
            "indicatorId": "who_phone",
            "childIndicatorId": null,
            "answerId": null
        },
        "Independent phone use": {
            "indicatorId": "phone_use",
            "childIndicatorId": null,
            "answerId": null
        },
        "Informal financial usage": {
            "indicatorId": "phone_use_financial",
            "childIndicatorId": null,
            "answerId": null
        },
        "i2i_Source_of_Income": {
            "indicatorId": "i2i_Income_Sources",
            "childIndicatorId": null,
            "answerId": null
        },
        "Water_source_type":{
            "indicatorId": "water_source_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Toilet_type":{
            "indicatorId": "toilet_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Cooking_energy":{
            "indicatorId": "cooking_energy_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "Electricity_access":{
            "indicatorId": "electricity_access_type",
            "childIndicatorId": null,
            "answerId": null
        },
        "USD_per_day":{
            "indicatorId": "usd_per_day",
            "childIndicatorId": null,
            "answerId": null
        },
        "Poverty_line":{
            "indicatorId": "poverty_line",
            "childIndicatorId": null,
            "answerId": null
        },
        "$2.50 PPP Poverty line":{
            "indicatorId": "poverty_line",
            "childIndicatorId": null,
            "answerId": null
        },
        "Land ownership":{
            "indicatorId": "land_owner",
            "childIndicatorId": null,
            "answerId": null
        },
        "Land decision maker":{
            "indicatorId": "land_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Children decision maker":{
            "indicatorId": "children_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Earning frequency":{
            "indicatorId": "earning_freq",
            "childIndicatorId": null,
            "answerId": null
        },
        "Spending decision maker":{
            "indicatorId": "spending_decission",
            "childIndicatorId": null,
            "answerId": null
        },
        "Permission to work":{
            "indicatorId": "permission_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Main income earner":{
            "indicatorId": "main_income_earned",
            "childIndicatorId": null,
            "answerId": null
        },
        "Migrant worker identity":{
            "indicatorId": "migrant_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Possibility to raise 1/20 GNI (Gross National Income)":{
            "indicatorId": "raise_gni",
            "childIndicatorId": null,
            "answerId": null
        },
        "Income frequency of spouse":{
            "indicatorId": "freq_inconme_spouse",
            "childIndicatorId": null,
            "answerId": null
        },
        "Allow spouse to work":{
            "indicatorId": "allow_spouse_work",
            "childIndicatorId": null,
            "answerId": null
        },
        "Bank": {
            "indicatorId": "bank",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Permission to open bank": {
            "indicatorId": "bank_permission",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Money taken from account": {
            "indicatorId": "account_money_taken",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Savings goal decision maker": {
            "indicatorId": "saving_goal_deccision",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Savings goal influence": {
            "indicatorId": "saving_goal_influence",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Banked": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Other_formal": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Informal": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "fas_access": {
            "indicatorId": "fas_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Saving_B":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Saving_F":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Saving_Inf":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Saving_AH":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "saving_access":{
            "indicatorId": "savings_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Remittances_B":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Remittances_F":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Remittances_Inf":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Remittances_FF":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "remittances_access":{
            "indicatorId": "remittances_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Insurance_F":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Insurance_Inf":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Insurance_B":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "insurance_access":{
            "indicatorId": "insurance_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Credit_B":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Credit_F":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Credit_Inf":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Credit_FF":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "credit_access":{
            "indicatorId": "credit_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Weight_Ind":{
            "indicatorId": "weight_ind",
            "childIndicatorId": null,
            "answerId": null
        },
        "FAS":{
            "indicatorId": "total_fas_strand",
            "childIndicatorId": null,
            "answerId": 1
        },
        "Saving_Strand":{
            "indicatorId": "total_saving_strand",
            "childIndicatorId": null,
            "answerId": 2
        },
        "Remittances_Strand":{
            "indicatorId": "total_remittances_strand",
            "childIndicatorId": null,
            "answerId": 3
        },
        "Insurance_Strand":{
            "indicatorId": "total_insurance_strand",
            "childIndicatorId": null,
            "answerId": 4
        },
        "Credit_Strand":{
            "indicatorId": "total_credit_strand",
            "childIndicatorId": null,
            "answerId": 5
        },
        "Mobile_money":{
            "indicatorId": "mobile_money",
            "childIndicatorId": null,
            "answerId": null
        },
        "Mobile Money":{
            "indicatorId": "mobile_money",
            "childIndicatorId": null,
            "answerId": null
        }
    }
}""")

In [11]:
def saveData(data, dataOutPath, env='staging', section='mobile'):
    """
    Write the upload files for one dataset and return the info needed to upload it.

    Two files are written in dataOutPath (created if it does not exist):
    '<isoYEAR>.csv' with the dataset columns known to useJson (plus
    'finalweight'), and '<isoYEAR>.json' with the matching indicator mapping.

    data: [dataset, datasetCols, 'isoYEAR'] as returned by checkDataset.
    dataOutPath: directory where the csv and json files are written.
    env: 'staging' or 'production'; selects the API host.
    section: 'ns', 'msme' or 'mobile'; selects the API path.

    Returns a dict with the keys: iso, year, total, fileUrl, jsonUrl, baseUrl,
    url and dataUrl.
    """
    base = {
        'staging':'https://staging.i2ifacility.org',
        'production':'http://i2i.vizzuality.com'
        }
    sect = {
        'ns':  'api/v1',
        'msme':'msme-api/v1',
        'mobile': 'ms-api/v2'
        }
    AceptedColumns = set(useJson['indicators']) | {'finalweight'}
    calculatePopulation(data)
    # Sorted so the csv column order is the same on every run
    # (iteration order of a set of strings is not stable between runs).
    columns = sorted(AceptedColumns & set(data[0].columns))

    info = {
        'iso': data[2][0:3].upper(),
        'year': int(data[2][3:]),
        'total': sum(data[0]['finalweight']),
        'fileUrl': f'{dataOutPath}/{data[2]}.csv',
        'jsonUrl': f'{dataOutPath}/{data[2]}.json',
        'baseUrl': f'{base[env]}/{sect[section]}/country'
        }
    info.update({'url': f"{info['baseUrl']}/{info['iso']}/{info['year']}/dataset",
                 'dataUrl':f"https://s3-us-west-2.amazonaws.com/i2ifacility.org/{info['iso']}/{sect[section]}/{info['year']}.zip"
                })

    # Make sure the output directory exists before writing into it.
    if dataOutPath:
        os.makedirs(dataOutPath, exist_ok=True)

    #save the data
    data[0][columns].to_csv(info['fileUrl'])
    #save the key json, restricted to the columns actually present
    present = set(columns)
    outCol = {key: value for key, value in useJson["indicators"].items() if key in present}
    myJson = {
        "weightColumn": "finalweight",
        "indicators": outCol}
    with open(info['jsonUrl'], 'w') as outfile:
        json.dump(myJson, outfile)

    return info

def upsertCountry(info, auth, op='POST'):
    """
    Create the country on the API, or update it when it cannot be created.

    Receives info as a dict with:
    {
    iso:
    year:
    total:
    fileUrl:
    jsonUrl:
    url:
    baseUrl:
    }
    Only iso, year, total and dataUrl (when present) are sent, together with
    the country name looked up from the iso code.

    auth: authentication object passed to requests.
    op: HTTP method; 'POST' targets baseUrl, any other method targets baseUrl/<iso>.

    When a POST is answered with status 400 the call is retried once as PATCH.
    Any other HTTP error is printed and the failed response is returned.
    Returns the response of the last request made.
    Raises ValueError if iso is not a known alpha_3 country code.
    """
    wanted_keys = ['iso', 'year', 'total', 'dataUrl'] # The keys you want
    body = {k: info[k] for k in wanted_keys if k in info}
    countryName = pycountry.countries.get(alpha_3=info['iso'])
    if not countryName:
        raise ValueError(f"{info['iso']} not a country alpha_3 iso code")

    body.update({"name":countryName.name})

    try:
        url= f"{info['baseUrl']}" if op == 'POST' else f"{info['baseUrl']}/{info['iso']}"
        s = requests.request(op, url, auth=auth, json=body)
        s.raise_for_status()
        print(f'Country {body["name"]} {("created" if op=="POST" else "updated")} succesfully')
    except requests.exceptions.HTTPError as e:
        # Fall back to PATCH only from the initial POST: retrying a PATCH that
        # itself answers 400 would recurse without end.
        if e.response.status_code == 400 and op == 'POST':
            s = upsertCountry(info, auth,  op='PATCH')
        else:
            print(e.response.text)
    return s
    
def uploadDataFunction(info, auth):
    """
    Upload the csv and its matching json of a dataset to the API.

    Receives info as a dict with:
    {
    iso:
    year:
    fileUrl:
    jsonUrl:
    url:
    baseUrl:
    }
    Only jsonUrl, fileUrl (paths of the files to send) and url (endpoint the
    files are posted to) are read.

    auth: authentication object passed to requests.
    Returns the decoded JSON body of the response.
    """
    # Both files are closed as soon as the request finishes, even if it fails.
    with open(info["jsonUrl"], 'rb') as json_file, open(info["fileUrl"], 'rb') as csv_file:
        files1 = {'json': json_file, 'csv': csv_file}
        r = requests.post(info["url"], auth=auth, files=files1)
    return r.json()


### FULL PIPELINE

In [12]:
### Pipeline settings -- review them before every import:
basePath = '/home/jovyan/work/datasets/i2i'
InPath = f'{basePath}/newDatav7'
OutPath = f'{InPath}/out'

# One entry per dataset to import: [path to the file, iso alpha_3 code, year]
DatasetList = [
    [
        f'{InPath}/200303 Kenya CAPI weighted outputs label update.csv',
        'ken',
        '2020',
    ]
    #[f'{InPath}/FS_MSME Swaziland_Shortened_Version_13Sept19 final .xlsx', 'swz', '2020'],
    #[f'{InPath}/FS Myanmar 2018 Final Data Set Shortened 13Sep19 final.xlsx', 'mmr', '2020']
]

## API credentials are asked for interactively so they never end up in the notebook.
## 'vizzuality', '<check Last Pass>'
user = getpass.getpass(prompt='User: ')
password = getpass.getpass(prompt='Password: ')
auth = requests.auth.HTTPBasicAuth(user, password)

User: ········
Password: ········


In [16]:
# If you add new indicators, don't forget to add them here too.
# Maps every indicatorId to the label shown for it; the mapping is written to
# indicators.json in the output folder.
# Each key must appear only once: a repeated key silently overrides the earlier one.
useJson_t = json.loads("""
{
    "geographic_area": "Geographic Area",
    "gender": "Gender",
    "age": "Age",
    "access_to_resources": "Access to Resources",
    "dwelling_type": "Dwelling type: roof/dwelling",
    "i2i_Marital_Status": "Marital Status",
    "i2i_Education": "Level of education",
    "i2i_Income_Sources": "Sources of income",
    "toilet_type": "Sanitation type",
    "cooking_energy_type": "Cooking energy source",
    "electricity_access_type": "Electricity access",
    "usd_per_day": "USD per day",
    "poverty_line": "Poverty line",
    "mobile_money": "Mobile Money",
    "water_source_type": "Water source type",
    "english_literacy": "English literacy",
    "interview_lan": "Language of interview",
    "own_phone": "Own a phone",
    "who_phone": "Phone ownership",
    "phone_use": "Independent phone use",
    "phone_use_financial": "Informal financial usage",
    "main_lan": "Main language",
    "household_size": "Mean household size",
    "bank": "Bank",
    "bank_permission": "Permission to open bank",
    "account_money_taken": "Money taken from account",
    "saving_goal_deccision": "Savings goal decision maker",
    "saving_goal_influence": "Savings goal influence",
    "land_owner": "Land ownership",
    "land_decission": "Land decision maker",
    "children_decission": "Children decision maker",
    "earning_freq": "Earning frequency",
    "spending_decission": "Spending decision maker",
    "permission_work": "Permission to work",
    "main_income_earned": "Main income earner",
    "migrant_work": "Migrant worker identity",
    "raise_gni": "Possibility to raise 1/20 GNI (Gross National Income)",
    "freq_inconme_spouse": "Income frequency of spouse",
    "allow_spouse_work": "Allow spouse to work"
}
 """)
with open(f'{OutPath}/indicators.json', 'w') as outfile:
    json.dump(useJson_t, outfile, sort_keys=True, indent=4)

In [13]:
### Run the initial check (and the preliminary clean) on every configured dataset.
### New checks and cleaning options go, for now, in the checkDataset function.
datasets = []
for dataset in DatasetList:
    datasets += [checkDataset(*dataset, clean=True)]

## ken2020

Unnamed: 0,InstanceID,finalweight,Gender,Relationship status,Urbanicity,Age Group,Education,English literacy,Own a phone,$2.50 PPP Poverty line,Language of interview,Main language,Mean household size,Region,Province,Phone ownership,Independent phone use,Informal financial usage,Mobile Money,Bank,Permission to open bank,Money taken from account,Savings goal decision maker,Savings goal influence,Land ownership,Land decision maker,Children decision maker,Earning frequency,Spending decision maker,Permission to work,Main income earner,Migrant worker identity,Possibility to raise 1/20 GNI (Gross National Income),Income frequency of spouse,"Allow spouse to work,,"
0,770458,10776.661459,Male,Married/relationship,Urban,25-34,Secondary education,Fair,own phone,Above poverty line,English,Swahili,Below mean size,Nairobi,Nairobi,own phone/business/employer,Yes,no,own,unbanked,no,money not taken,spouse/family/others on behalf,sometimes accepted/rejected,dont own,,decide together,Daily,Help from someone else,Already working,respondent,"No-one, all live together",Not very possible,dont work,Yes
1,770459,9983.318465,Female,Married/relationship,Urban,25-34,Primary education,Good,own phone,Above poverty line,Swahili,Swahili,Below mean size,Nairobi,Nairobi,own phone/business/employer,Yes,no,own,unbanked,no,money not taken,self alone,sometimes accepted/rejected,own by self,Family/friends without me,decide together,Weekly/biweekly,Help from someone else,Already working,spouse,"No-one, all live together",Not at all possible,Monthly,Spouse already works for money


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InstanceID,3000,,,,775220.0,2804.73,770458.0,773092.0,774944.0,777020.0,788781.0
finalweight,3000,,,,7323.37,6821.96,119.348,3513.72,6031.08,8937.02,91545.8
Gender,3000,2.0,Female,1731.0,,,,,,,
Relationship status,3000,2.0,Married/relationship,2101.0,,,,,,,
Urbanicity,3000,2.0,Rural,1920.0,,,,,,,
Age Group,3000,5.0,25-34,1021.0,,,,,,,
Education,3000,5.0,Secondary education,1182.0,,,,,,,
English literacy,3000,5.0,Good,971.0,,,,,,,
Own a phone,3000,2.0,own phone,2670.0,,,,,,,
$2.50 PPP Poverty line,3000,2.0,Above poverty line,1502.0,,,,,,,


### Columns descriptions

#### **column:** InstanceID ----- **dtype**: Int64

#### **column:** finalweight ----- **dtype**: float64

#### **column:** Gender ----- **dtype**: string

##### Before clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Male', 'Female']
Length: 2, dtype: string


#### **column:** Relationship status ----- **dtype**: string

##### Before clean applied

<StringArray>
['Married/relationship', 'Not married/relationship']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Married/Relationship', 'Not married/Relationship']
Length: 2, dtype: string


#### **column:** Urbanicity ----- **dtype**: string

##### Before clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Urban', 'Rural']
Length: 2, dtype: string


#### **column:** Age Group ----- **dtype**: string

##### Before clean applied

<StringArray>
['25-34', '35-44', '18-24', '55+', '45-54']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['25-34', '35-44', '18-24', '55+', '45-54']
Length: 5, dtype: string


#### **column:** Education ----- **dtype**: string

##### Before clean applied

<StringArray>
['Secondary education',   'Primary education',    'Higher education',
 'No formal education',               'Other']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Secondary education',   'Primary education',    'Higher education',
 'No formal education',               'Other']
Length: 5, dtype: string


#### **column:** English literacy ----- **dtype**: string

##### Before clean applied

<StringArray>
['Fair', 'Good', 'Excellent', 'Not at all', 'Poorly']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Fair', 'Good', 'Excellent', 'Not at all', 'Poorly']
Length: 5, dtype: string


#### **column:** Own a phone ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone', 'dont own phone']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Own phone', 'Dont own phone']
Length: 2, dtype: string


#### **column:** $2.50 PPP Poverty line ----- **dtype**: string

##### Before clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['Above poverty line', 'Below poverty line']
Length: 2, dtype: string


#### **column:** Language of interview ----- **dtype**: string

##### Before clean applied

<StringArray>
[ 'English',  'Swahili',      'Luo',    'Kisii',   'Kikuyu',   'Somali',
     'Meru',    'Kamba',    'Luhya', 'Kalenjin']
Length: 10, dtype: string


##### After clean applied

<StringArray>
[ 'English',  'Swahili',      'Luo',    'Kisii',   'Kikuyu',   'Somali',
     'Meru',    'Kamba',    'Luhya', 'Kalenjin']
Length: 10, dtype: string


#### **column:** Main language ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Swahili',     'Luhya',       'Luo',  'Kalenjin',   'English', 'Congolese',
    'Kikuyu',      'Teso',     'Kisii',    'Somali',     'Kamba',      'Meru',
  'Kiduruma', 'Mijikenda',      'Gari',    'Chonyi',   'Turkana',   'Gariama',
    'Kidigo',    'Borana',   'Kiganda',    'Maasai',    'Kiembu',     'Taita',
  'Kimbeere',  'Kibajuni',   'Kikuria']
Length: 27, dtype: string


##### After clean applied

<StringArray>
[  'Swahili',     'Luhya',       'Luo',  'Kalenjin',   'English', 'Congolese',
    'Kikuyu',      'Teso',     'Kisii',    'Somali',     'Kamba',      'Meru',
  'Kiduruma', 'Mijikenda',      'Gari',    'Chonyi',   'Turkana',   'Gariama',
    'Kidigo',    'Borana',   'Kiganda',    'Maasai',    'Kiembu',     'Taita',
  'Kimbeere',  'Kibajuni',   'Kikuria']
Length: 27, dtype: string


#### **column:** Mean household size ----- **dtype**: string

##### Before clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Below mean size', 'Above mean size', 'Mean']
Length: 3, dtype: string


#### **column:** Region ----- **dtype**: string

##### Before clean applied

<StringArray>
[      'Nairobi',        'Nyanza',       'Central',   'Rift Valley',
         'Coast', 'North Eastern',       'Western',       'Eastern']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[      'Nairobi',        'Nyanza',       'Central',   'Rift valley',
         'Coast', 'North eastern',       'Western',       'Eastern']
Length: 8, dtype: string


#### **column:** Province ----- **dtype**: string

##### Before clean applied

<StringArray>
[    'Nairobi',      'Nyanza',     'Central', 'Rift Valley',       'Coast',
           ' ',     'Western',     'Eastern',  'North East']
Length: 9, dtype: string


##### After clean applied

<StringArray>
[    'Nairobi',      'Nyanza',     'Central', 'Rift valley',       'Coast',
            '',     'Western',     'Eastern',  'North east']
Length: 9, dtype: string


#### **column:**  Phone ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['own phone/business/employer',                     'Sibling',
       'Dont use mobile phone',               'Husband/wives',
             'Other man/women',                      'Parent',
                    'Children']
Length: 7, dtype: string


##### After clean applied

<StringArray>
['Own phone/Business/Employer',                     'Sibling',
       'Dont use mobile phone',               'Husband/Wives',
             'Other man/Women',                      'Parent',
                    'Children']
Length: 7, dtype: string


#### **column:** Independent phone use ----- **dtype**: string

##### Before clean applied

<StringArray>
['Yes', 'Dont use mobile phone', 'No', 'Somewhat']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Dont use mobile phone', 'No', 'Somewhat']
Length: 4, dtype: string


#### **column:** Informal financial usage ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Mobile Money ----- **dtype**: string

##### Before clean applied

<StringArray>
['own', 'dont use MM', 'share/use others', 'dont use mobile phone']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Own', 'Dont use mm', 'Share/Use others', 'Dont use mobile phone']
Length: 4, dtype: string


#### **column:** Bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['unbanked', 'own', 'spouse', 'other', 'other fam']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Unbanked', 'Own', 'Spouse', 'Other', 'Other fam']
Length: 5, dtype: string


#### **column:** Permission to open bank ----- **dtype**: string

##### Before clean applied

<StringArray>
['no', 'yes']
Length: 2, dtype: string


##### After clean applied

<StringArray>
['No', 'Yes']
Length: 2, dtype: string


#### **column:** Money taken from account ----- **dtype**: string

##### Before clean applied

<StringArray>
['money not taken', 'money taken', 'dont have account']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Money not taken', 'Money taken', 'Dont have account']
Length: 3, dtype: string


#### **column:** Savings goal decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['spouse/family/others on behalf',                     'self alone',
                'no savings goal',                'decide together']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['Spouse/Family/Others on behalf',                     'Self alone',
                'No savings goal',                'Decide together']
Length: 4, dtype: string


#### **column:** Savings goal influence ----- **dtype**: string

##### Before clean applied

<StringArray>
['sometimes accepted/rejected',                'dont consult',
             'no savings goal',             'always accepted',
             'always rejected']
Length: 5, dtype: string


##### After clean applied

<StringArray>
['Sometimes accepted/Rejected',                'Dont consult',
             'No savings goal',             'Always accepted',
             'Always rejected']
Length: 5, dtype: string


#### **column:** Land ownership ----- **dtype**: string

##### Before clean applied

<StringArray>
['dont own', 'own by self', 'own with others']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Dont own', 'Own by self', 'Own with others']
Length: 3, dtype: string


#### **column:** Land decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[' ', 'Family/friends without me', 'Me only', 'Decide together with others']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['', 'Family/Friends without me', 'Me only', 'Decide together with others']
Length: 4, dtype: string


#### **column:** Children decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
[        'decide together',     'Not in relationship',
 'We do not talk about it',          'Spouse decides',
            'decide alone']
Length: 5, dtype: string


##### After clean applied

<StringArray>
[        'Decide together',     'Not in relationship',
 'We do not talk about it',          'Spouse decides',
            'Decide alone']
Length: 5, dtype: string


#### **column:** Earning frequency ----- **dtype**: string

##### Before clean applied

<StringArray>
[          'Daily', 'Weekly/biweekly',     'I dont work',         'Monthly',
 'When I get work',     'Work unpaid']
Length: 6, dtype: string


##### After clean applied

<StringArray>
[          'Daily', 'Weekly/Biweekly',     'I dont work',         'Monthly',
 'When i get work',     'Work unpaid']
Length: 6, dtype: string


#### **column:** Spending decision maker ----- **dtype**: string

##### Before clean applied

<StringArray>
['Help from someone else', 'Fully on own', 'Someone else decides for me']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Help from someone else', 'Fully on own', 'Someone else decides for me']
Length: 3, dtype: string


#### **column:** Permission to work ----- **dtype**: string

##### Before clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


##### After clean applied

<StringArray>
['Already working', 'Yes', 'No']
Length: 3, dtype: string


#### **column:** Main income earner ----- **dtype**: string

##### Before clean applied

<StringArray>
[       'respondent',            'spouse',         'other fam',
         'dont know',             'other', 'refused to answer',
      'earn equally']
Length: 7, dtype: string


##### After clean applied

<StringArray>
[       'Respondent',            'Spouse',         'Other fam',
        'Don’t know',             'Other', 'Refused to answer',
      'Earn equally']
Length: 7, dtype: string


#### **column:** Migrant worker identity ----- **dtype**: string

##### Before clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


##### After clean applied

<StringArray>
['No-one, all live together', 'Other family', 'Spouse', 'Me']
Length: 4, dtype: string


#### **column:** Possibility to raise 1/20 GNI (Gross National Income) ----- **dtype**: string

##### Before clean applied

<StringArray>
[  'Not very possible', 'Not at all possible',   'Somewhat possible',
       'Very possible']
Length: 4, dtype: string


##### After clean applied

<StringArray>
[  'Not very possible', 'Not at all possible',   'Somewhat possible',
       'Very possible']
Length: 4, dtype: string


#### **column:** Income frequency of spouse ----- **dtype**: string

##### Before clean applied

<StringArray>
[         'dont work',            'Monthly',          'no spouse',
              'Daily',    'Weekly/biweekly',          'dont know',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


##### After clean applied

<StringArray>
[         'Dont work',            'Monthly',          'No spouse',
              'Daily',    'Weekly/Biweekly',         'Don’t know',
 'When they get work',        'Work unpaid']
Length: 8, dtype: string


#### **column:** Allow spouse to work,, ----- **dtype**: string

##### Before clean applied

<StringArray>
[                            'Yes',  'Spouse already works for money',
                     'Not married', 'Spouse already works for money,',
                    'Not married,',                            'Yes,',
                              'No',                             'No,']
Length: 8, dtype: string


##### After clean applied

<StringArray>
['Yes', 'Spouse already works for money', 'Not married', 'No']
Length: 4, dtype: string


In [14]:
## Compare the columns of the different datasets to help harmonize the results.
## Review the reported column names before moving on to the upload step.
compareDatasets(datasets)

{'English literacy', 'Mobile Money', 'Permission to work', 'Language of interview', 'Earning frequency', 'Urbanicity', 'Own a phone', 'Education', 'Main income earner', 'Bank', 'Children decision maker', 'Income frequency of spouse', 'Region', 'Mean household size', 'Permission to open bank', 'Money taken from account', 'Allow spouse to work', 'Land decision maker', 'Province', 'finalweight', '$2.50 PPP Poverty line', 'InstanceID', 'Savings goal decision maker', 'Independent phone use', 'Land ownership', 'Gender', 'Possibility to raise 1/20 GNI (Gross National Income)', 'Main language', 'Relationship status', 'Savings goal influence', 'Age Group', 'Spending decision maker', 'Informal financial usage', 'Migrant worker identity', 'Phone ownership'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 35 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                         

None

In [13]:
## Upload step: run this only once the results of the checks above look right.
## For every dataset: save it, create/patch its country, then upload the data.
for dataset in datasets:
    print('///////////////////////////////////////')
    # The value returned by saveData is passed to both API calls below.
    myInfo = saveData(dataset, OutPath, env='staging', section='mobile')

    print('Creating or patching the country')
    country_response = upsertCountry(myInfo, auth)
    print(country_response)

    print('______________________________')
    print('Uploading the data')
    upload_result = uploadDataFunction(myInfo, auth)
    print(upload_result)

///////////////////////////////////////
Population over which it was estimated
_________________  ken2020
Mujeres:  11265010.97035495
Hombres:  10705085.832696838
Total:  21970096.803051755
_________________ 
Creating or patching the country
Country Kenya created succesfully
<Response [200]>
______________________________
Uploading the data
{'ok': 1}
