In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import csv
import copy
from datetime import datetime, timedelta
from zipfile import ZipFile
from typing import Tuple, Union
import numpy as np
import base64
import itertools
import re

In [None]:
df_survey = pd.read_csv('census_survey.csv')
df_survey.head()

In [None]:
df_demographics = pd.read_csv('demographics.csv')
df_demographics.head()

In [None]:
# use pandas to remove NaN phone_numbers from df_demographics
df_demographics = df_demographics.dropna(subset=['phone_number'])
df_demographics.head()

In [None]:
# use pandas to merge df_survey and df_demographics on phone_number
df_merged = pd.merge(df_survey, df_demographics, on='phone_number')
df_merged.head()

In [None]:
# rename comfort_with_ussd column values to be shorter
df_merged['comfort_with_ussd'] = df_merged['comfort_with_ussd'].replace({
    "Very Comfortable--I can use USSD without help from anyone": "1. Very Comfortable",
    "Somewhat comfortable--I may ask for help sometimes": "2. Somewhat Comfortable",
    "Not very comfortable--I need help over 50% of the time": "3. Not Comfortable",
    "Uncomfortable--I always ask someone else to navigate the menus on my behalf": "4. Very Uncomfortable"
})

In [None]:
df_whitelist = pd.read_csv('../ekichabi-ussd/data/whitelist_pretty.csv').drop(columns=['Unnamed: 0']).rename(columns={"dir_phn1": 'phone_number'})
df_whitelist.head()

In [None]:
# use pandas to merge df_merged and df_whitelist on phone_number
df_merged = pd.merge(df_merged, df_whitelist, on='phone_number')
df_merged.head()

In [None]:
df_merged.info()

In [None]:
# describe numerical and categorical columns except for phone_number, hhID, phn1, phn2
df_merged[[col for col in df_merged.columns if col not in ["phone_number", "hhID", "phn1", "phn2"]]].describe(include='all')

In [None]:
# Keep only the rows whose `source` is "WHITELIST_CSV".
# The explicit .copy() makes df_merged an independent frame: later cells add columns to it
# (e.g. 'age_group'), and assigning into a boolean-filtered slice raises SettingWithCopyWarning.
df_merged = df_merged[df_merged["source"] == "WHITELIST_CSV"].copy()

In [None]:
df_logs = pd.read_csv('logs.csv').drop(columns=['Unnamed: 0', 'Unnamed: 0.1']).dropna(subset=['phone_number'])
df_logs['phone_number'] = df_logs['phone_number'].astype('int64')
df_logs.info()

In [None]:
df_whitelist["phone_number"]

In [None]:
df_logs["phone_number"]

In [None]:
numbers = set(df_whitelist["phone_number"])

In [None]:
len([1 for number in df_logs["phone_number"] if number in numbers])

In [None]:
# Collapse the log to one row per phone_number: each log row counts as one session and the
# usage columns are summed per phone.
# NOTE(review): this cell rebinds df_logs, so running it twice resets every `sessions` to 1.
# NOTE(review): a later cell slices `duration` as a string; if it is a string column here,
# 'sum' concatenates the values instead of adding them - confirm the dtype.
summed_columns = ['sessions', 'duration', 'businesses_visited', 'category_browse', 'searches']
df_logs = (
    df_logs.assign(sessions=1)
    .groupby('phone_number')
    .agg({column: 'sum' for column in summed_columns})
    .reset_index()
)
df_logs.info()

In [None]:
df_logs['sessions'].value_counts()

In [None]:
# use pandas to merge df_merged and df_logs on phone_number
df_hh_users = pd.merge(df_merged, df_logs, on='phone_number')
df_hh_users.head()

In [None]:
df_hh_users.info()

# Analysis

In [None]:
# print value counts for each column except phone_number, phn1, phn2, hhID
for col in df_merged.columns:
    if col not in ['phone_number', 'phn1', 'phn2', 'hhID']:
        print('=====' + col + '=====')
        print(df_merged[col].value_counts())
        print()

In [None]:
# plot the distribution of age
fig, ax = plt.subplots(figsize=(10, 5))
df_merged['age'].hist(bins=20, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [None]:
# plot distribution of gender using hoh_gender_lb and a pie chart
fig, ax = plt.subplots(figsize=(10, 5))
df_merged['hoh_gender_lb'].value_counts().plot(kind='pie', ax=ax)

In [None]:
# create age_groups and plot distribution of age_groups in a bar chart, sort age axis ascending
df_merged['age_group'] = pd.cut(df_merged['age'], bins=[0, 18, 25, 35, 45, 55, 65, 100])
fig, ax = plt.subplots(figsize=(10, 5))
df_merged['age_group'].value_counts().sort_index().plot(kind='bar', ax=ax)


In [None]:
# stacked bar chart of age distribution by gender
temp = df_merged.groupby(['age_group', 'hoh_gender_lb'])['age_group'].count().unstack('hoh_gender_lb').fillna(0)
display(temp)
# Column order follows first appearance in the data. NaN must be dropped first: groupby
# discards NaN keys, so a NaN label from unique() is not a column of temp and would raise KeyError.
gender_columns = df_merged['hoh_gender_lb'].dropna().unique()
temp[gender_columns].sort_index().plot(kind='bar', stacked=True)
plt.title("Gender vs Age Group")

In [None]:
def remove_nan(arr):
    """Return the elements of ``arr`` that are not null (NaN/None), in their original order.

    ``arr`` must support boolean-mask indexing (e.g. a NumPy array or pandas Series).
    """
    keep = pd.notnull(arr)
    return arr[keep]

In [None]:
# stacked bar chart of age distribution by comfort_with_ussd
temp = df_merged.groupby(['age_group', 'comfort_with_ussd'])['age_group'].count().unstack('comfort_with_ussd').fillna(0)
display(temp)
temp[remove_nan(df_merged['comfort_with_ussd'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs Age Group")

In [None]:
# density plot to compare distribution of age with comfort_with_ussd
fig, ax = plt.subplots(figsize=(10, 5))
for comfort in remove_nan(df_merged['comfort_with_ussd'].unique()):
    df_merged[df_merged['comfort_with_ussd'] == comfort]['age'].plot(kind='density', ax=ax, label=comfort)
ax.legend(loc="upper left", bbox_to_anchor=(0.6,1), ncol=1)
ax.set_title('Distribution of Age by Comfort with USSD')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()

In [None]:
# stacked bar chart of comfort_with_ussd by gender
temp = df_merged.groupby(['comfort_with_ussd', 'gender'])['comfort_with_ussd'].count().unstack('gender').fillna(0)
display(temp)
temp[remove_nan(df_merged['gender'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs Gender")

In [None]:
# stacked barchart of comfort_with_ussd by district_label
temp = df_merged.groupby(['comfort_with_ussd', 'district_label'])['comfort_with_ussd'].count().unstack('district_label').fillna(0)
display(temp)
temp[remove_nan(df_merged['district_label'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs District")

In [None]:
# stacked barchart of district_label by comfort_with_ussd
temp = df_merged.groupby(['district_label', 'comfort_with_ussd'])['district_label'].count().unstack('comfort_with_ussd').fillna(0)
display(temp)
temp[remove_nan(df_merged['comfort_with_ussd'].unique())].apply(lambda x: x/x.sum(), axis=1).plot(kind='bar', stacked=True)
plt.title("District vs Comfort with USSD")

In [None]:
# Mean comfort score per district, weighting the four comfort levels as 1, 2, 3, 4.
# Each tuple holds the respondent counts for levels 1-4, in that order.
# NOTE(review): the counts are hard-coded (presumably transcribed from a displayed table);
# they will go stale if the underlying data changes.
district_counts = {
    "Bukoba": (73, 35, 9, 37),
    "Karagwe": (73, 55, 38, 12),
    "Kyerwa": (74, 59, 18, 19),
    "Missenyi": (39, 24, 5, 15),
    "Muleba": (68, 32, 12, 10),
}
for district, counts in district_counts.items():
    weighted_total = sum(score * count for score, count in enumerate(counts, start=1))
    print(district + " Average Comfort: " + str(weighted_total / sum(counts)))

In [None]:
# standard deviations (np.std default, i.e. population std) of the 1-4 comfort scores per district
# Each tuple holds the hard-coded respondent counts for comfort levels 1-4, in that order.
for district, counts in {
    "Bukoba": (73, 35, 9, 37),
    "Karagwe": (73, 55, 38, 12),
    "Kyerwa": (74, 59, 18, 19),
    "Missenyi": (39, 24, 5, 15),
    "Muleba": (68, 32, 12, 10),
}.items():
    # Expand the counts into one score per respondent: [1, 1, ..., 2, 2, ..., 4].
    scores = np.repeat([1, 2, 3, 4], counts)
    print(district + " Standard Deviation: " + str(np.std(scores)))

In [None]:
from scipy import stats
def anova(df, col1, col2):
    """Print the p-value of a one-way ANOVA of ``col2`` scores across the groups of ``col1``.

    The rows of ``df`` are counted per (col1, col2) pair. Each distinct ``col1`` value becomes
    one sample; within it, the k-th ``col2`` level (in the sorted column order produced by
    ``unstack``) is scored ``k`` and repeated once per counted row. Samples with no rows are
    dropped before calling ``scipy.stats.f_oneway``.

    All ``col2`` levels are scored. The previous implementation hard-coded four levels, which
    raised IndexError for fewer than four and silently ignored every level after the fourth.

    Parameters:
        df: DataFrame containing both columns.
        col1: name of the grouping column (one ANOVA sample per distinct value).
        col2: name of the column whose levels are turned into 1..k scores.

    Returns:
        None. The result is printed.
    """
    # observed=False is pinned explicitly so that categorical keys keep their unobserved levels
    # (as zero counts), whichever pandas default is in effect.
    counts = df.groupby([col1, col2], observed=False)[col1].count().unstack(col2).fillna(0)
    scores = np.arange(1, counts.shape[1] + 1)
    samples = [np.repeat(scores, row.astype(int)) for row in counts.values]
    samples = [sample for sample in samples if len(sample) > 0]
    _, p_val = stats.f_oneway(*samples)
    print("ANOVA results for " + col1 + " and " + col2)
    print("One-way ANOVA P =", p_val)

In [None]:
# significance test that average comfort with USSD is the same across districts
# H0: average comfort with USSD is the same across districts
# H1: average comfort with USSD is not the same across districts
anova(df_merged, 'district_label', 'comfort_with_ussd')

In [None]:
# significance test that average comfort with USSD is the same across age groups
# H0: average comfort with USSD is the same across age groups
# H1: average comfort with USSD is not the same across age groups
anova(df_merged, 'age_group', 'comfort_with_ussd')

In [None]:
# significance test that average comfort with USSD is the same across genders
# H0: average comfort with USSD is the same across genders
# H1: average comfort with USSD is not the same across genders
anova(df_merged, 'hoh_gender_lb', 'comfort_with_ussd')

In [None]:
# significance test that average comfort with USSD is the same across whitelist dates
# H0: average comfort with USSD is the same across whitelist dates
# H1: average comfort with USSD is not the same across whitelist dates
anova(df_merged, 'date', 'comfort_with_ussd')

In [None]:
df_hh_users.head()

In [None]:
df_hh = df_merged.copy()
df_hh['sessions'] = 0
df_hh['duration'] = 0
df_hh['businesses_visited'] = 0
df_hh['category_browse'] = 0
df_hh['searches'] = 0
df_hh.head()

In [None]:
# overwrite df_hh with values in df_hh_users if the phone_number is in df_hh_users
# One lookup table keyed by phone_number replaces the former iterrows() loop, which re-scanned
# the whole of df_hh five times for every row of df_hh_users.
usage_columns = ['sessions', 'duration', 'businesses_visited', 'category_browse', 'searches']
# keep='last' mirrors the loop, where a later row with the same phone_number overwrote an earlier one.
usage_by_phone = (
    df_hh_users.drop_duplicates(subset='phone_number', keep='last')
    .set_index('phone_number')[usage_columns]
)
has_usage = df_hh['phone_number'].isin(usage_by_phone.index)
for column in usage_columns:
    if not pd.api.types.is_numeric_dtype(usage_by_phone[column]):
        # Non-numeric values are about to be written next to the existing placeholder values:
        # cast to object explicitly instead of relying on pandas' deprecated implicit upcast.
        df_hh[column] = df_hh[column].astype(object)
    df_hh.loc[has_usage, column] = df_hh.loc[has_usage, 'phone_number'].map(usage_by_phone[column])
df_hh.head()

In [None]:
anova(df_hh, 'sessions', 'comfort_with_ussd')

In [None]:
anova(df_hh, 'duration', 'comfort_with_ussd')

In [None]:
anova(df_hh, 'businesses_visited', 'comfort_with_ussd')

In [None]:
# Average duration per session, in seconds. Non-string durations count as 0 seconds.
# Only characters 7-21 of the string are parsed, as '%H:%M:%S.%f'.
# NOTE(review): the slice assumes a fixed-width prefix before the time part, and anything after
# character 22 is ignored - confirm the format of `duration` for phones with several sessions.
duration_seconds = df_hh['duration'].apply(
    lambda v: (
        # strptime with time-only fields yields a datetime on 1900-01-01, so subtracting that
        # date gives the elapsed time; unlike .timestamp() this does not depend on the platform.
        (datetime.strptime(v[7:22], '%H:%M:%S.%f') - datetime(1900, 1, 1)).total_seconds()
        if isinstance(v, str)
        else 0
    )
)
# 0 seconds / 0 sessions is NaN; mark those rows with -1. Assign the result back rather than
# calling fillna(inplace=True) on the column selection, which may not update df_hh.
df_hh['average_duration'] = (duration_seconds / df_hh['sessions']).fillna(-1)
anova(df_hh, 'average_duration', 'comfort_with_ussd')

In [None]:
# stacked barchart of number of businesses visited by comfort_with_ussd
temp = df_hh.groupby(['businesses_visited', 'comfort_with_ussd'])['businesses_visited'].count().unstack('comfort_with_ussd').fillna(0)
display(temp)
temp[remove_nan(df_hh['comfort_with_ussd'].unique())].apply(lambda x: x/x.sum(), axis=1).plot(kind='bar', stacked=True)
plt.title("Number of Businesses Visited vs Comfort with USSD")

In [None]:
# stacked barchart of age_group by businesses_visited
temp = df_hh.groupby(['age_group', 'businesses_visited'])['age_group'].count().unstack('businesses_visited').fillna(0)
display(temp)
temp[remove_nan(df_hh['businesses_visited'].unique())].apply(lambda x: x/x.sum(), axis=1).plot(kind='bar', stacked=True)
plt.title("Age Group vs Number of Businesses Visited")

In [None]:
# stacked barchart of businesses_visited by district_label
temp = df_hh.groupby(['businesses_visited', 'district_label'])['businesses_visited'].count().unstack('district_label').fillna(0)
display(temp)
temp[remove_nan(df_hh['district_label'].unique())].apply(lambda x: x/x.sum(), axis=1).plot(kind='bar', stacked=True)
plt.title("Number of Businesses Visited vs District")

In [None]:
# stacked barchart of businesses_visited by hoh_gender_lb
temp = df_hh.groupby(['businesses_visited', 'hoh_gender_lb'])['businesses_visited'].count().unstack('hoh_gender_lb').fillna(0)
display(temp)
temp[remove_nan(df_hh['hoh_gender_lb'].unique())].apply(lambda x: x/x.sum(), axis=1).plot(kind='bar', stacked=True)
plt.title("Number of Businesses Visited vs Gender")

In [None]:
anova(df_hh, 'businesses_visited', 'district_label')

In [None]:
anova(df_hh, 'businesses_visited', 'age_group')

In [None]:
anova(df_hh, 'businesses_visited', 'comfort_with_ussd')

In [None]:
anova(df_hh, 'hoh_gender_lb', 'businesses_visited')

In [None]:
anova(df_hh, 'businesses_visited', 'duration')

In [None]:
anova(df_hh, 'businesses_visited', 'sessions')

In [None]:
anova(df_hh, 'businesses_visited', 'date')

In [None]:
df_hh[df_hh["hoh_gender_lb"] == "Male-headed"].describe(include='all')

In [None]:
df_hh[df_hh["hoh_gender_lb"] == "Female-headed"].describe(include='all')

In [None]:
df_hh[(df_hh["hoh_gender_lb"] == "Male-headed") & (df_hh["businesses_visited"] > 9)]["businesses_visited"].std()

In [None]:
df_hh[(df_hh["hoh_gender_lb"] == "Male-headed") & (df_hh["businesses_visited"] < 9)]["businesses_visited"].std()

In [None]:
df_hh.value_counts('comfort_with_ussd')

In [None]:
df_234 = df_hh[df_hh["comfort_with_ussd"] != "1. Very Comfortable"]

In [None]:
df_234.info()

In [None]:
anova(df_234, 'comfort_with_ussd', 'searches')

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
for comfort in remove_nan(df_hh['comfort_with_ussd'].unique()):
    df_hh[df_hh['comfort_with_ussd'] == comfort]['age'].plot(kind='density', ax=ax, label=comfort)
ax.legend()
ax.set_title('Distribution of Age by Comfort with USSD for Males and Females')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()

In [None]:
# density plot to compare distribution of age with comfort_with_ussd for males
df_males = df_hh[df_hh['hoh_gender_lb'] == "Male-headed"]
fig, ax = plt.subplots(figsize=(10, 5))
for comfort in remove_nan(df_males['comfort_with_ussd'].unique()):
    df_males[df_males['comfort_with_ussd'] == comfort]['age'].plot(kind='density', ax=ax, label=comfort)
ax.legend()
ax.set_title('Distribution of Age by Comfort with USSD for Males')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()

In [None]:
# density plot to compare distribution of age with comfort_with_ussd for females
df_females = df_hh[df_hh['hoh_gender_lb'] == "Female-headed"]
fig, ax = plt.subplots(figsize=(10, 5))
# Iterate over the comfort levels present in df_females itself (this previously read them from
# df_males, so the levels plotted could differ from the ones that exist in this subset).
for comfort in remove_nan(df_females['comfort_with_ussd'].unique()):
    df_females[df_females['comfort_with_ussd'] == comfort]['age'].plot(kind='density', ax=ax, label=comfort)
ax.legend()
ax.set_title('Distribution of Age by Comfort with USSD for Females')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()

In [None]:
anova(df_females, 'comfort_with_ussd', 'age_group')

In [None]:
anova(df_males, 'comfort_with_ussd', 'age_group')

In [None]:
anova(df_females, 'businesses_visited', 'age_group')

In [None]:
anova(df_males, 'businesses_visited', 'age_group')

In [None]:
anova(df_hh, 'businesses_visited', 'age_group')

In [None]:
df_hh['age_group'].value_counts()

In [None]:
# df_0_18 = df_hh[df_hh['age_group'] == pd.Interval(0, 18, closed='right')] # no one is in this age group
df_18_25 = df_hh[df_hh['age_group'] == pd.Interval(18, 25, closed='right')]
df_25_35 = df_hh[df_hh['age_group'] == pd.Interval(25, 35, closed='right')]
df_35_45 = df_hh[df_hh['age_group'] == pd.Interval(35, 45, closed='right')]
df_45_55 = df_hh[df_hh['age_group'] == pd.Interval(45, 55, closed='right')]
df_55_65 = df_hh[df_hh['age_group'] == pd.Interval(55, 65, closed='right')]
df_65_100 = df_hh[df_hh['age_group'] == pd.Interval(65, 100, closed='right')]

In [None]:
temp = df_hh.groupby(['comfort_with_ussd', 'gender'])['comfort_with_ussd'].count().unstack('gender').fillna(0)
display(temp)
temp[remove_nan(df_hh['gender'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs Gender for 0-100 Age Group")

In [None]:
temp = df_hh[df_hh['age'] < 40].groupby(['comfort_with_ussd', 'gender'])['comfort_with_ussd'].count().unstack('gender').fillna(0)
display(temp)
temp[remove_nan(df_hh[df_hh['age'] < 40]['gender'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs Gender for 0-40 Age Group")

In [None]:
temp = df_hh[df_hh['age'] >= 40].groupby(['comfort_with_ussd', 'gender'])['comfort_with_ussd'].count().unstack('gender').fillna(0)
display(temp)
temp[remove_nan(df_hh[df_hh['age'] >= 40]['gender'].unique())].plot(kind='bar', stacked=True)
plt.title("Comfort with USSD vs Gender for 40-100 Age Group")

In [None]:
anova(df_hh, 'gender', 'comfort_with_ussd')

In [None]:
anova(df_hh[df_hh['age'] >= 40], 'gender', 'comfort_with_ussd')

In [None]:
anova(df_hh[df_hh['age'] < 40], 'gender', 'comfort_with_ussd')

In [None]:
anova(df_hh[df_hh['age'] >= 40], 'gender', 'businesses_visited')

In [None]:
anova(df_hh[df_hh['age'] < 40], 'gender', 'businesses_visited')

In [None]:
anova(df_hh, 'gender', 'businesses_visited')

In [None]:
df_nonanon = pd.read_csv('hh_nonanon.csv').drop('Unnamed: 0', axis=1)
df_nonanon.head()

In [None]:
list(df_nonanon.columns)

In [None]:
# Keep the rows whose `digital` value is truthy.
# NOTE(review): bool(NaN) and bool of any non-empty string are True - confirm `digital` is a clean boolean/0-1 column.
# .copy() makes the result an independent frame; later cells add columns to df_nonanon, and
# assigning into a boolean-filtered slice raises SettingWithCopyWarning.
df_nonanon = df_nonanon[df_nonanon["digital"].apply(lambda x: bool(x))].copy()

In [None]:
df_nonanon["digital"]

In [None]:
df_nonanon.info()

In [None]:
def classify_phone(row):
    """Label a row by which phone flags equal 1.

    Reads ``row["p_basic"]`` and ``row["p_smart"]`` and returns "Both", "Feature" (basic only),
    "Smart" (smart only) or the string "None" (neither).
    """
    has_basic = bool(row["p_basic"] == 1)
    has_smart = bool(row["p_smart"] == 1)
    labels = {
        (True, True): "Both",
        (True, False): "Feature",
        (False, True): "Smart",
        (False, False): "None",
    }
    return labels[(has_basic, has_smart)]

In [None]:
df_nonanon["phone_type"] = df_nonanon.apply(classify_phone, axis=1)

In [None]:
df_nonanon["phone_type"].value_counts()

In [None]:
df_nonanon['age_group'] = pd.cut(df_nonanon['age'], bins=[0, 18, 25, 35, 45, 55, 65, 100])

In [None]:
df_nonanon[["phone_type", "gender", "age_group", "f_103_ussd", "f_102_wakala", "f_104_ussd"]].value_counts()

In [None]:
df_nonanon[["phone_type", "f_104_ussd"]].value_counts().plot(kind="barh")

In [None]:
# rename comfort_with_ussd column values to be shorter
df_nonanon['f_104_ussd'] = df_nonanon['f_104_ussd'].replace({
    "Very Comfortable – I can use USSD without help from anyone": "1. Very Comfortable",
    "Somewhat comfortable – I may ask for help sometimes": "2. Somewhat Comfortable",
    "Not very comfortable – I need help over 50% of the time": "3. Not Comfortable",
    "Uncomfortable – I always ask someone else to navigate the menus on my behalf": "4. Very Uncomfortable"
})

In [None]:
df_pivot = pd.pivot_table(
    df_nonanon, 
    values="age",
    index="f_104_ussd",
    columns="phone_type", 
    aggfunc=len,
)
ax = df_pivot.plot.barh()
ax.set_ylabel("Comfort with USSD")
ax.set_xlabel("Number of Respondents")
ax.set_title("Comfort with USSD by Phone Type")
ax.legend(title="Phone Type", loc="upper right")
ax.invert_yaxis()

In [None]:
df_nonanon["f_103_ussd"].value_counts()

In [None]:
# if "f_103_ussd" is no, set "f_104_ussd" to "0. Doesn't Use USSD"
df_nonanon.loc[df_nonanon['f_103_ussd'] == 'No', 'f_104_ussd'] = '0. Doesn\'t Use USSD'

In [None]:
df_nonanon["f_104_ussd"].value_counts()

In [None]:
def lighten_color(color, amount=0.5):
    """
    Lightens the given color by multiplying (1-luminosity) by the given amount.
    Input can be matplotlib color string, hex string, or RGB tuple.

    Parameters:
        color: a color accepted by ``matplotlib.colors.to_rgb``.
        amount: factor applied to (1 - lightness); the new lightness is
            ``1 - amount * (1 - lightness)``.

    Returns:
        The adjusted color as an (r, g, b) tuple from ``colorsys.hls_to_rgb``.

    Examples:
    >> lighten_color('g', 0.3)
    >> lighten_color('#F034A3', 0.6)
    >> lighten_color((.3,.55,.1), 0.5)
    """
    import matplotlib.colors as mc
    import colorsys
    try:
        # Translate a named color to its hex string when the name is in mc.cnames.
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # KeyError: not a key of mc.cnames (e.g. a hex string or an RGB tuple).
        # TypeError: unhashable input such as a list. Either way, pass the value through to
        # to_rgb unchanged. Other exceptions are no longer swallowed.
        c = color
    c = colorsys.rgb_to_hls(*mc.to_rgb(c))
    return colorsys.hls_to_rgb(c[0], 1 - amount * (1 - c[1]), c[2])

In [None]:
# replace Wakala Usage values with <1 visit/month, ~1 visit/month, ~1 visit/week, >1 visit/week
df_nonanon['f_102_wakala'] = df_nonanon['f_102_wakala'].replace({
    "Less than once a month": "<1 visit/month",
    "Once a month": "~1 visit/month",
    "Once each week": "~1 visit/week",
    "Multiple times in a week": ">1 visit/week"
})

In [None]:
# cols = ["district_label_x", "phone_type", "gender", "age_group", "f_103_ussd", "f_102_wakala", "f_96_electric_grid", "total_indiv"]
# dispnames = ["District", "Phone Type", "Gender", "Age Group", "USSD Usage", "Wakala Usage", "Electric Grid", "Household Members"]
cols = ["district_label_x", "f_96_electric_grid", "gender", "age_group", "f_103_ussd", "f_102_wakala"]
dispnames = ["District", "Electric Grid", "Gender", "Age", "USSD Usage", "Wakala Usage"]
plt.rcParams.update({'font.size': 30}) # bottom label and subplot title size
fig, axs = plt.subplots(nrows=len(cols), ncols=1, figsize=(30, 20), height_ratios=[df_nonanon[col].nunique() for col in cols])
for ax, col, dispname in zip(axs.flatten(), cols, dispnames):
    df_pivot = pd.pivot_table(
        df_nonanon, 
        values="age",
        index=col,
        columns="f_104_ussd", 
        aggfunc=len,
    )[["0. Doesn't Use USSD", "4. Very Uncomfortable", "3. Not Comfortable", "2. Somewhat Comfortable", "1. Very Comfortable"]]
    df_pivot.plot.barh(ax=ax, stacked=True, legend=False, width=1.0, color=list(map(lambda c: lighten_color(c, 0.5), ['gray', 'red', 'orange', 'gold', 'forestgreen'])))
    ax.set_ylabel(dispname)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.invert_yaxis()
    for label in (ax.get_xticklabels() + ax.get_yticklabels()): label.set_fontsize(28) # subplot y-axis label sizes
axs[0].legend(title="Comfort With USSD", loc="upper center", bbox_to_anchor=(0.5, 1.8), ncol=5, title_fontsize=44, fontsize=26)
axs[-1].set_xlabel("Number of Household Survey Respondents")
fig.tight_layout(pad=1.0)

In [None]:
# pie chart of f_104_ussd for Males
ax = df_nonanon[df_nonanon["gender"] == "Male"]["f_104_ussd"].value_counts().plot(kind='pie', autopct='%1.1f%%', figsize=(10,10))
ax.axis('off')

In [None]:
ax = df_nonanon[df_nonanon["gender"] == "Female"]["f_104_ussd"].value_counts().plot(kind='pie', autopct='%1.1f%%', figsize=(10,10))
ax.axis('off')

In [None]:
total = df_nonanon["f_104_ussd"].value_counts().sum()
ax = df_nonanon["f_104_ussd"].value_counts().plot(kind='pie', autopct=lambda x: '{:1.1f}%\n({:.0f})'.format(x, total * x / 100), figsize=(10,10))
ax.axis('off')

In [None]:
df_nonanon[df_nonanon["gender"] == "Male"]["age"].agg(["mean", "median", "std"])

In [None]:
df_nonanon[df_nonanon["gender"] == "Female"]["age"].agg(["mean", "median", "std"])

In [None]:
# Phone types of the df_nonanon rows whose phone_number appears in df_hh_users.
# The lookup set is built once here; it was previously rebuilt inside the lambda for every row.
hh_user_numbers = set(df_hh_users["phone_number"])
df_nonanon[df_nonanon["phone_number"].apply(lambda number: int(number) in hh_user_numbers)]["phone_type"].value_counts()

In [None]:
9 / 18