# Domestic Load Research Programme Social Survey Exploration

This notebook requires access to a data directory containing the DLR survey data saved as Feather files. The data files must be saved in `/data/tables/`.

In [None]:
import features.feature_socios as s
import colorlover as cl
import plotly.graph_objs as go
import plotly.offline as po
po.init_notebook_mode(connected=True)

import os
import pandas as pd
import numpy as np

## List of Questionnaires

In [None]:
# Load the questionnaire table and display only the questionnaires whose
# QuestionaireID is in the list below.
qnrs = s.loadTable('questionaires')
selected_qnr_ids = [3, 4, 6, 7, 1000000, 1000001, 1000002]
qnrs.loc[qnrs['QuestionaireID'].isin(selected_qnr_ids)]

In [None]:
# Load the groups table, print a preview and the distinct survey names, then
# count the Location entries per year for every survey except Namibia.
groups = s.loadTable('groups')
print(groups.head())
print(groups['Survey'].unique())
non_namibian_groups = groups.loc[groups['Survey'] != 'Namibia']
non_namibian_groups.groupby(['Year'])['Location'].count().T

In [None]:
# Load the ID table through the feature_socios module and preview its first rows.
ids = s.loadID()
ids.head()

## Search Questions

In [None]:
# Search the survey questions for the term 'watersource'; the result is shown
# as the cell output.
s.searchQuestions('watersource')

## Search Answers

In [None]:
# Search the survey answers for the term 'watersource' and show the last rows
# of the result.
answers = s.searchAnswers('watersource')
answers.tail()

## Extract Survey Responses

In [None]:
# Build the survey response table used by all plots below.
# NOTE(review): presumably 'features3' names a feature specification and
# 1994/2014 bound the survey years - confirm against s.genS.
sdist = s.genS('features3',1994,2014)
sdist.head()

In [None]:
# Show the last rows of the survey response table.
sdist.tail()

## Plot Survey Responses

In [None]:
# Histogram of respondents' monthly income in bins of width 1000. The binned
# range stops at half of the largest income value in the data.
income_bins = {'start': 0,
               'end': sdist.monthly_income.max()/2,
               'size': 1000}
income_marker = {'color': cl.scales['3']['qual']['Pastel1'][0],
                 'line': {'color': '#000000', 'width': 0.5}}
income = [go.Histogram(x=sdist.monthly_income,
                       xbins=income_bins,
                       autobinx=False,
                       marker=income_marker)]

layout = go.Layout(
    title='Distribution of Monthly Income of Survey Respondents',
    xaxis={'title': 'Household Income in ZAR (R1000 bin size, inflation adjusted to Dec 2016)'},
    yaxis={'title': 'Household Count', 'showline': True},
    margin={'t': 70},
    height=350,
    width=650,
)
fig = go.Figure(data=income, layout=layout)
po.iplot(fig)

In [None]:
# Histogram of dwelling floor area in bins of width 25. The binned range stops
# at half of the largest floor area value in the data.
floor_area = [go.Histogram(x=sdist.floor_area, 
                            xbins=dict(
                                start=0,
                                end=sdist.floor_area.max()/2,
                                size= 25),
                            autobinx=False,
                            marker=dict(color=cl.scales['3']['qual']['Pastel1'][1], 
                                        line=dict(color='#000000', width=0.5)))]
# The axis title previously contained a mis-decoded character sequence in place
# of the exponent marker; it is written in plain ASCII here.
layout = go.Layout(title='Distribution of Dwelling Floor Area of Survey Respondents',
                  xaxis=dict(title='Dwelling Floor Area (25m^2 bin size)'),
                  yaxis=dict(title='Household Count', showline=True),
                  margin=dict(t=70),
                  height=350, width=650)
fig = go.Figure(data=floor_area, layout=layout)
po.iplot(fig)

In [None]:
# Histogram of years since electrification in bins of width 5. The binned
# range stops at half of the largest value in the data.
years_electrified = [go.Histogram(x=sdist.years_electrified, 
                            xbins=dict(
                                start=0,
                                end=sdist.years_electrified.max()/2,
                                size= 5),
                            autobinx=False,
                            marker=dict(color=cl.scales['3']['qual']['Pastel1'][2], 
                                        line=dict(color='#000000', width=0.5)))]
# The axis label states the bin width actually used above (5), not 1.
layout = go.Layout(title="Survey Respondents' Years Since Electrification ",
                  xaxis=dict(title='Years Electrified (5 year bin size)'),
                  yaxis=dict(title='Household Count', showline=True),
                  margin=dict(t=70),
                  height=350, width=650)
fig = go.Figure(data=years_electrified, layout=layout)
po.iplot(fig)

In [None]:
# Count respondents (ProfileID entries) per wall material and per roof material.
respondents = sdist.reset_index()
wall_material = respondents.groupby('wall_material')['ProfileID'].count()
roof_material = respondents.groupby('roof_material')['ProfileID'].count()
colors = cl.scales['12']['qual']['Set3']

# Left donut: wall materials, coloured with the Set3 palette.
wall_pie = dict(
    type="pie",
    name="wall material",
    values=wall_material.values,
    labels=wall_material.index,
    textinfo='label+percent',
    textposition="inside",
    textfont={'size': 20, 'color': '#000000'},
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 0.5}},
    domain={"x": [0, .48]},
    hole=.3,
    pull=0.035,
    rotation=17,
)

# Right donut: roof materials; no explicit colour list is set for this trace.
roof_pie = dict(
    type="pie",
    name="roof material",
    values=roof_material.values,
    labels=roof_material.index,
    textinfo='label+percent',
    textposition="inside",
    textfont={'size': 20, 'color': '#000000'},
    marker={'line': {'color': '#000000', 'width': 0.5}},
    domain={"x": [.52, 1]},
    hole=.3,
    pull=0.035,
    rotation=-103,
)

# Text placed in the hole of each donut.
centre_labels = [
    {"font": {"size": 24}, "showarrow": False, "text": label, "x": x_pos, "y": 0.5}
    for label, x_pos in (("wall", 0.21), ("roof", 0.79))
]

fig = {
    "data": [wall_pie, roof_pie],
    "layout": dict(
        title="Dwelling Materials of Survey Respondents",
        titlefont={'size': 24},
        legend={'font': {'size': 18}, 'orientation': "h"},
        width=900,
        height=600,
        annotations=centre_labels,
    ),
}
po.iplot(fig)

In [None]:
# Count respondents (ProfileID entries) per water_access value, ordered from the
# least to the most common value.
water_counts = sdist.reset_index().groupby('water_access')['ProfileID'].count()
water = water_counts.sort_values()
# NOTE(review): the labels are assigned by ascending count rank, not by the
# underlying water_access value, and the assignment requires exactly four
# distinct values - confirm that the rank order matches these categories.
water.index = ['river/dam/borehole', 'block/street taps', 'tap in yard', 'tap in house']

water_pie = dict(
    type="pie",
    name="water access",
    values=water.values,
    labels=water.index,
    textinfo='label+percent',
    textposition="auto",
    textfont={'size': 20, 'color': '#000000'},
    marker={'colors': cl.scales['4']['div']['BrBG'],
            'line': {'color': '#000000', 'width': 0.5}},
    hole=.15,
    pull=0.035,
    rotation=110,
)

fig = {
    "data": [water_pie],
    "layout": {
        'title': "Water Access of Survey Respondents",
        'titlefont': {'size': 24},
        'legend': {'font': {'size': 18}, 'orientation': "h"},
        'margin': {'b': 200},
        'height': 600,
        'width': 700,
    },
}
po.iplot(fig)

In [None]:
# Recompute the three count series so this cell can run on its own.
respondents = sdist.reset_index()
wall_material = respondents.groupby('wall_material')['ProfileID'].count()
roof_material = respondents.groupby('roof_material')['ProfileID'].count()
water = respondents.groupby('water_access')['ProfileID'].count().sort_values()
# NOTE(review): labels are assigned by ascending count rank and require exactly
# four distinct water_access values - confirm the rank order matches.
water.index = ['river/dam/borehole', 'block/street taps', 'tap in yard', 'tap in house']

colors = cl.scales['12']['qual']['Set3']

# Middle donut: wall materials (Set3 palette).
wall_pie = dict(
    type="pie",
    name="wall material",
    legendgroup="materials",
    values=wall_material.values,
    labels=wall_material.index,
    textinfo='label+percent',
    textposition="inside",
    textfont={'size': 14, 'color': '#000000'},
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 0.5}},
    domain={"x": [0.35, .66]},
    hole=.25,
    pull=0.04,
    rotation=17,
)

# Right donut: roof materials; no explicit colour list is set for this trace.
roof_pie = dict(
    type="pie",
    name="roof material",
    legendgroup="materials",
    values=roof_material.values,
    labels=roof_material.index,
    textinfo='label+percent',
    textposition="inside",
    textfont={'size': 14, 'color': '#000000'},
    marker={'line': {'color': '#000000', 'width': 0.5}},
    domain={"x": [.68, 1]},
    hole=.25,
    pull=0.035,
    rotation=-103,
)

# Left donut: water access (BrBG palette), in its own legend group.
water_pie = dict(
    type="pie",
    name="water access",
    legendgroup="water",
    values=water.values,
    labels=water.index,
    textinfo='label+percent',
    textposition="auto",
    textfont={'size': 14, 'color': '#000000'},
    marker={'colors': cl.scales['4']['div']['BrBG'],
            'line': {'color': '#000000', 'width': 0.5}},
    domain={"x": [0, .33]},
    hole=.25,
    pull=0.035,
    rotation=150,
)

# Text placed in the hole of each donut.
centre_labels = [
    {"font": {"size": 22}, "showarrow": False, "text": label, "x": x_pos, "y": 0.5}
    for label, x_pos in (("wall", 0.5), ("roof", 0.87), ("water", 0.125))
]
# Headings positioned in paper coordinates next to the two legend groups.
legend_headings = [
    {"x": x_pos, "y": y_pos, "font": {"size": 16}, "xref": 'paper',
     "yref": 'paper', "text": heading, "showarrow": False}
    for heading, x_pos, y_pos in (('Dwelling Materials', 1.2, 1.05),
                                  ('Water Access', 1.16, 0.23))
]

fig = {
    "data": [wall_pie, roof_pie, water_pie],
    "layout": dict(
        title="Water Access and Dwelling Materials of Survey Respondents",
        titlefont={'size': 22},
        legend={'font': {'size': 14}, 'traceorder': "grouped",
                'tracegroupgap': 20, 'y': 1.5},
        margin={'t': 10},
        width=1000,
        height=600,
        annotations=centre_labels + legend_headings,
    ),
}
po.iplot(fig)

In [None]:
def loadID() -> pd.DataFrame:
    """
    Match the ProfileIDs of observational electricity data with the AnswerIDs
    of the corresponding survey responses and attach geographic metadata.

    Namibian households are removed, as are rows without a GroupID or Year.
    Profiles without a matching AnswerID get AnswerID 0. The following
    geographic information is added for each location:
        - Latitude
        - Longitude
        - Province
        - Municipality
        - District

    Returns:
        DataFrame with one row per profile/answer link, with Year, GroupID,
        AnswerID and ProfileID cast to int, and the columns Lat, Long,
        Province, Municipality and District merged in by location name.

    Raises:
        FileNotFoundError: if the geographic metadata file is still missing
            after attempting to regenerate it.
    """
    this_dir = 'data'
    groups = s.loadTable('groups')
    links = s.loadTable('links')
    profiles = s.loadTable('profiles')

    # Link rows that carry both a GroupID and a ProfileID (presumably the
    # group <-> profile links - verify against the links table schema).
    p_id = links[(links.GroupID != 0) & (links['ProfileID'] != 0)].drop(
        labels=['ConsumerID', 'lock', 'AnswerID'], axis=1)
    profile_meta = profiles.merge(
        p_id, how='left', left_on='ProfileId', right_on='ProfileID'
    ).drop(labels=['ProfileId', 'lock'], axis=1)

    # Link rows with GroupID 0 (presumably the answer <-> profile links).
    ap = links[links.GroupID == 0].drop(labels=['ConsumerID', 'lock', 'GroupID'], axis=1)

    x = profile_meta.merge(ap, how='outer', on='ProfileID')
    join = x.merge(groups, on='GroupID', how='left')

    # Remove Namibian households and incomplete rows, then normalise the id
    # columns. Built as one chain of new frames so that nothing is mutated in
    # place on a filtered view (in-place fillna on a column selected from a
    # slice is not guaranteed to write back into the frame).
    all_ids = (
        join[join['Survey'] != 'Namibia']
        .dropna(subset=['GroupID', 'Year'])
        .assign(AnswerID=lambda df: df['AnswerID'].fillna(0))
        .astype({'Year': int, 'GroupID': int, 'AnswerID': int, 'ProfileID': int})
    )

    geo_path = os.path.join(this_dir, 'obs_datasets', 'geo_meta', 'site_geo.csv')
    try:
        geo_meta = pd.read_csv(geo_path)
    except FileNotFoundError:
        # Only a missing file triggers regeneration; other read errors propagate.
        # NOTE(review): geoMeta is neither defined nor imported in the visible
        # part of this notebook - confirm it is in scope before relying on this.
        # NOTE(review): the previous fallback re-read from
        # data/data/geometa/site_geo.csv; assumed to be a stale path, so the
        # same file as above is re-read - confirm where geoMeta() writes.
        geoMeta()
        geo_meta = pd.read_csv(geo_path)

    geo_columns = ['GPSName', 'Lat', 'Long', 'Province', 'Municipality', 'District']
    output = all_ids.merge(geo_meta[geo_columns], left_on='LocName',
                           right_on='GPSName', how='left')
    # GPSName was only needed as the join key.
    output = output.drop(labels='GPSName', axis=1)

    return output

In [None]:
# Rebuild the ID table with the loadID function defined in this notebook
# (replacing the earlier result of s.loadID()) and preview its first rows.
ids = loadID()
ids.head()

In [None]:
# Sorted list of the years in which the 'NRS LR' survey has entries for the
# City of Cape Town municipality.
is_nrs_cape_town = (ids.Survey=='NRS LR') & (ids.Municipality=='City of Cape Town')
np.sort(ids.loc[is_nrs_cape_town, 'Year'].unique())

In [None]:
# Number of AnswerID entries per survey and province (rows) and year (columns);
# combinations without data are shown as empty cells.
answer_counts = ids.groupby(['Survey','Province','Year'])['AnswerID'].count()
answer_counts.unstack().fillna('')

In [None]:
# Number of AnswerID entries per survey and province (rows) and year (columns).
# NOTE(review): this cell repeats the table produced by the previous cell.
ids.groupby(['Survey','Province','Year'])['AnswerID'].count().unstack().fillna('')

In [None]:
# Bar chart of the number of AnswerID entries per year for the 'NRS LR' survey.
nrs_lr_ids = ids.loc[ids.Survey=='NRS LR']
nrs_lr_ids.groupby('Year')['AnswerID'].count().plot.bar()

In [None]:
# Bar chart of the number of AnswerID entries per year, with one bar series per
# survey (years on the x-axis after the transpose).
# NOTE(review): `by` is not a documented parameter of DataFrame.plot.bar and
# 'Survey' is the column axis name here, not a column - presumably it has no
# effect; confirm and drop it if so.
ids.groupby(['Survey','Year'])['AnswerID'].count().unstack().T.plot.bar(by='Survey')