In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import csv
import copy
from datetime import datetime, timedelta
from zipfile import ZipFile
from typing import Tuple, Union
import numpy as np
import base64
import itertools
import re

# Load Data

In [None]:
# Survey responses; later cells read its `phone_number` and `comfort_with_ussd` columns.
df_survey = pd.read_csv('census_survey.csv')
df_survey.head()

In [None]:
# Android UTILS

def standard_format(phone_num):
    """Normalise a phone number string to its trailing nine digits.

    All non-digit characters (plus sign, spaces, dashes, ...) are stripped and
    only the last nine digits are kept, which drops a country code or a
    leading zero. The caller must pass a string; no validation is done.
    """
    digits_only = re.sub("[^0-9]", "", phone_num)
    without_prefix = digits_only[-9:]
    return without_prefix

def getBit(num, ix):
    """Return bit number `ix` of `num` (0 is the least significant bit) as 0 or 1."""
    mask = 1 << ix
    return 1 if num & mask else 0


def decode_Base64(binstr):
    """Lazily yield the bits of a Base64-encoded payload.

    The payload is Base64-decoded into bytes; every byte is then emitted bit
    by bit, lowest-order bit first.
    """
    raw_bytes = base64.b64decode(binstr)
    for byte in raw_bytes:
        yield from (getBit(byte, position) for position in range(8))


def getActionType(binary):
    """Consume the next three bits of the bit iterator and name the action they encode.

    The first bit read is the leftmost digit of the three-digit codes below.
    Exactly three bits are consumed whatever the outcome.
    """
    first = bool(next(binary))
    second = bool(next(binary))
    third = bool(next(binary))
    action_by_code = {
        (True, True, True): "FILTER",                  # 111
        (True, True, False): "SEARCH",                 # 110
        (True, False, True): "CONTACT",                # 101
        (True, False, False): "OPEN BUSINESS SCREEN",  # 100
        (False, True, True): "CALL",                   # 011
        (False, True, False): "UNFAVORITE",            # 010
        (False, False, True): "FAVORITE",              # 001
        (False, False, False): "UNRECOGNIZED ACTION",  # 000
    }
    return action_by_code[(first, second, third)]


def getDate(binary):
    """Read a 12-bit day offset (lowest-order bit first) and return the matching date.

    The offset is counted in days from 2022-01-01.
    """
    reference = datetime.strptime("2022/1/1", "%Y/%m/%d")
    day_offset = 0
    for shift in range(12):
        if next(binary):
            day_offset |= 1 << shift
    return reference + timedelta(days=day_offset)


def getPK(binary):
    """Read a 16-bit unsigned integer (lowest-order bit first) from the bit iterator."""
    primary_key = 0
    for shift in range(16):
        if next(binary):
            primary_key |= 1 << shift
    return primary_key

def decode_Base64_actions(binstr):
    """Decode a Base64 payload (e.g. b'text') into a list of human-readable action strings."""
    bit_stream = decode_Base64(binstr)
    return decode_binary_actions(bit_stream)

def decode_binary_actions(binary):
    '''Takes a binary (boolean) generator and decodes it into humanly readable action strings.

    Returns a list with one string per decoded action. Decoding stops, and the
    strings built so far are returned, as soon as the bit source is exhausted
    (StopIteration), even if that happens in the middle of an action.
    '''
    currentActionType = False
    actionStrings = []
    # A generator object is always truthy, so for generator input this loop
    # only ends through the StopIteration handler at the bottom.
    while binary:
        try:
            if not currentActionType:  # we are starting a new action
                currentActionType = getActionType(binary)
                actionStrings += [currentActionType + " | "]
                if currentActionType == 'UNRECOGNIZED ACTION':
                    actionStrings[-1] += "[000]"
            if currentActionType in ["FAVORITE", "UNFAVORITE", "CALL", "OPEN BUSINESS SCREEN", "CONTACT"]:
                # Business-related action: 12-bit date, 16-bit business pk, then 9 skipped bits.
                date = getDate(binary)
                actionStrings[-1] += date.strftime("%Y/%m/%d") + " | "
                actionStrings[-1] += "Business pk: " + str(getPK(binary))
                # 3 + 12 + 16 + 9 = 40 bits; presumably the 9 bits are padding to a
                # whole number of bytes -- TODO confirm against the encoder.
                for _ in range(9):
                    binary.__next__()
                currentActionType = False
            elif currentActionType in ["SEARCH", "FILTER"]:
                # Search/filter action: 12-bit date, three flag bits, then the query text.
                date = getDate(binary)
                actionStrings[-1] += date.strftime("%Y/%m/%d") + " | "
                actionStrings[-1] += "SUCCESSFUL | " if binary.__next__() else "UNSUCCESSFUL | "
                actionStrings[-1] += "TRUNCATED | " if binary.__next__() else "UNTRUNCATED | "
                actionStrings[-1] += "CLEANED | " if binary.__next__() else "RAW | "
                searchstr = ""
                isOffByThree = False
                while True:
                    # Each character is 5 bits, lowest-order bit first. Starting at -1
                    # means an all-zero group yields -1 (the terminator candidate) and
                    # a group with value 1 yields 0, i.e. chr(97) == 'a'.
                    value = -1
                    for i in range(5):
                        value += pow(2, i) * binary.__next__()
                    if value == -1:
                        # Peek at the next three bits to decide whether this really is
                        # the end of the string. The bits are pushed back onto the
                        # stream by rebinding `binary` to an itertools.chain.
                        bit1 = binary.__next__()
                        bit2 = binary.__next__()
                        bit3 = binary.__next__()
                        # Three zero bits end the string, except for the two hard-coded
                        # FILTER strings, which are forced down the branches below.
                        if not bit1 and not bit2 and not bit3 and not (currentActionType == "FILTER" and searchstr in ['mafuta ya', 'kuosha']):
                            binary = itertools.chain([0, 0, 0], binary)
                            break
                        elif len(searchstr) >= 21:
                            # Strings of 21+ characters stop here and switch to the
                            # off-by-three recovery path further down.
                            isOffByThree = True
                            binary = itertools.chain([bit1, bit2, bit3], binary)
                            break
                        else: # catch misencoded strings that contain special characters
                            binary = itertools.chain([bit1, bit2, bit3], binary)
                            searchstr += " "
                    else:
                        searchstr += chr(value + 97)
                actionStrings[-1] += searchstr
                # Bit count used for byte alignment: 3 (action type) + 5 per decoded
                # character + 8 + 3 + 12. NOTE(review): the 8 looks like the 5-bit
                # terminator plus the 3 peeked bits, the 3 like the flag bits and the
                # 12 like the date -- confirm against the encoder.
                bits = 3 + len(searchstr) * 5 + 8 + 3 + 12
                # Consume the three bits that were pushed back above.
                for _ in range(3):
                    binary.__next__()
                # Skip forward so that `bits` reaches the next multiple of 8.
                if bits % 8 != 0:
                    for _ in range(8 - (bits % 8)):
                        binary.__next__()
                currentActionType = False
                if isOffByThree: # recovery mode from off by 3 error
                    # Open an UNRECOGNIZED ACTION entry showing the next three bits.
                    # currentActionType stays set, so the next loop iteration skips
                    # getActionType and continues in the final `else` branch below.
                    currentActionType = "UNRECOGNIZED ACTION"
                    actionStrings += [currentActionType + " | ["]
                    for _ in range(3):
                        actionStrings[-1] += str(binary.__next__())
                    actionStrings[-1] += "]"
            else:
                # Unrecognised action: dump raw bits into the entry. Five bits first,
                # then eight at a time until the entry text ends with ten '0' characters.
                for _ in range(5):
                    actionStrings[-1] += str(binary.__next__())
                while not actionStrings[-1].endswith("0000000000"):
                    for _ in range(8):
                        actionStrings[-1] += str(binary.__next__())
                currentActionType = False
        except StopIteration:
            # The bit source ran out; return whatever was decoded so far.
            break
    return actionStrings

In [None]:
# from ekichabi.whitelist.GetWhiteList import populate_whitelist_for_admins
def populate_whitelist_for_admins(set):
    """Add the internal-testing demo number, in standard format, to the given set (mutated in place)."""
    # demo number for internal testing
    demo_number = standard_format('255000000000')
    set.add(demo_number)

In [None]:
# load whitelist
# Maps each whitelist date (parsed from the last CSV column) to the set of
# standardised phone numbers listed for that date.
whitelist = {}
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('../ekichabi-server/data/whitelist_pretty.csv', 'r', newline='') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # ignore first row - which contains field names (tolerates an empty file)
    next(csvreader, None)

    # extracting phone number(s) out of each row
    for row in csvreader:
        date = datetime.strptime(row[-1], '%Y-%d-%b')
        numbers_for_date = whitelist.setdefault(date, set())
        if str(row[1]):
            numbers_for_date.add(standard_format(str(row[1])))  # number1

print(whitelist.keys())
# set().union(...) also works when no rows were read; set.union(*[]) raises TypeError.
totalwhitelist = set().union(*whitelist.values())
print(len(totalwhitelist))

In [None]:
# load enumerators
# Standardised phone numbers of enumerators, taken from the third CSV column.
enumerators = set()
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('../ekichabi-server/data/Enumerator.csv', 'r', newline='') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # ignore header (tolerates an empty file)
    next(csvreader, None)

    # extracting phone number out of each row
    for row in csvreader:
        enumerators.add(standard_format(str(row[2])))  # number1
print(enumerators)

In [None]:
# load testers and admins
# Tester/admin numbers; populate_whitelist_for_admins fills the set in place.
testers = set()
populate_whitelist_for_admins(testers)
print(testers)

In [None]:
# load wakala intervention group
# Standardised phone numbers of the wakala intervention group (second CSV column).
wakalagroup = set()
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('../ekichabi-server/data/Wakala.csv', 'r', newline='') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting phone number out of each row
    # NOTE(review): unlike the other loaders no header row is skipped here --
    # presumably Wakala.csv has none; confirm against the file.
    for row in csvreader:
        wakalagroup.add(standard_format(str(row[1])))  # number1
print(wakalagroup)

In [None]:
# load business group
# Cut-off date used by wasWhitelisted for business numbers
# ('%Y-%d-%b' reads '2022-7-Dec' as 7 December 2022).
cutoffdate = datetime.strptime('2022-7-Dec', '%Y-%d-%b')
# Standardised phone numbers of census businesses (CSV columns 17 and 18).
businessgroup = set()
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('../ekichabi-server/data/census_data_trimmed.csv', 'r', newline='') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # ignore first row - which contains field names (tolerates an empty file)
    next(csvreader, None)

    # extracting phone number out of each row
    for row in csvreader:
        businessgroup.add(standard_format(str(row[17])))  # number1
        businessgroup.add(standard_format(str(row[18])))  # number2
# Blank cells standardise to ""; discard() does not raise when there were none.
businessgroup.discard("")
len(businessgroup)

In [None]:
def wasWhitelisted(number: str, date: datetime) -> bool:
    """Report whether `number` counted as whitelisted at time `date`.

    The number is standardised first. Testers and enumerators are never
    whitelisted. Business numbers are whitelisted up to and including
    `cutoffdate`. Otherwise the number must appear in a whitelist entry
    dated on or before `date`.
    """
    number = standard_format(number)
    if number in testers or number in enumerators:
        return False
    if date <= cutoffdate and number in businessgroup:
        return True
    # Check every entry instead of stopping at the first later date: the dict
    # keeps CSV insertion order, which is only chronological if the file is sorted.
    return any(
        listed_on <= date and number in numbers
        for listed_on, numbers in whitelist.items()
    )

In [None]:
def categorizeGroup(number, time):
    """Classify a phone number as 'non-whitelisted', 'wakalagroup', 'business' or 'household'.

    `time` is the moment at which whitelist status is evaluated. The checks
    run in the order listed above and the first match wins.
    """
    if not wasWhitelisted(number, time):
        return 'non-whitelisted'
    # wasWhitelisted standardises the number internally; do the same here so the
    # membership tests below agree with it when a caller passes a raw number.
    number = standard_format(number)
    if number in wakalagroup:
        return 'wakalagroup'
    if number in businessgroup:
        return 'business'
    return 'household'

In [None]:
# Number of whitelisted phone numbers that fall into the 'household' group at the cut-off date.
sum(1 for n in totalwhitelist if categorizeGroup(standard_format(str(n)), cutoffdate) == 'household')

In [None]:
# load log data
def parseLine(line: bytes, start_time: Union[datetime, None] = None) -> Tuple[datetime, str, Union[str, bool]]:
    """Parse one tab-separated log line into (time, action, details).

    Args:
        line: Raw log line. Column 0 is a '[HH:MM:SS.ffffff]' offset, column 1
            the action name and column 3, when present, the action details.
        start_time: Session start that the offset is added to. When omitted
            the notebook-level global `startTime` is used, as before.

    Returns:
        (absolute timestamp, action, details); details is False when the line
        has fewer than four columns.

    Raises:
        ValueError: if column 0 does not match the expected time format.
        IndexError: if the line has no action column.
    """
    if start_time is None:
        # Legacy behaviour: rely on the global set by the log-loading loop.
        start_time = startTime
    columns = line.decode().strip().split('\t')
    t = datetime.strptime(columns[0], '[%H:%M:%S.%f]')
    offset = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second, microseconds=t.microsecond)
    action = columns[1]
    details = columns[3] if len(columns) > 3 else False
    return start_time + offset, action, details

def initRow(number, startTime):
    """Build a fresh per-session record for `number` with all counters at zero.

    The group is resolved through categorizeGroup at the session start time.
    Key order is kept stable because it defines the column order downstream.
    """
    row = {
        'phone_number': number,
        'session_start': startTime,
        'session_end': pd.NaT,
        'duration': pd.NaT,
        'inputs': "",
        'group': categorizeGroup(number, startTime),
        'businesses_visited': 0,
        'businesses': [],
        'searches': 0,
    }
    zeroed_counters = ['filter_location', 'helper_instruction', 'category_browse', 'location_browse', 'exit', 'home', 'back']
    row.update(dict.fromkeys(zeroed_counters, 0))
    return row

# Session table read from a pre-built CSV. The commented-out block below parses
# the raw log archive into the same row layout (see initRow); presumably it is
# how logs.csv was produced -- TODO confirm.
df = pd.read_csv('logs.csv')

# df = pd.DataFrame(columns=initRow('000000000', 0).keys())
# with ZipFile('logs/ussd_2_20_2024.zip', 'r') as archive:
#     for name in archive.namelist(): # name = [phone_number]_[Year]-[month]-[day]-[Hour].[Minute].[Second]
#         parts = name.split('_')
#         number = standard_format(parts[0])
#         print(name)
#         print(parts[1].replace('.log', ''))
#         startTime = datetime.strptime(parts[1].replace('.log', ''), '%Y-%m-%d-%H.%M.%S')
#         # if not wasWhitelisted(number, startTime):
#         #     continue
#         if number in testers or number in enumerators:
#             continue
#         row = initRow(number, startTime)
#         isHome = True
#         path = []
#         with archive.open(name) as file:
#             file.readline() # ignore session info
#             line = file.readline() # ignore column headers
#             # line = file.readline() # ignore first home screen

#             def handleReadLine():
#                 global line
#                 global row
#                 global isHome
#                 global path
#                 line = file.readline()
#                 if not line:
#                     return "", "", ""
#                 time, action, details = parseLine(line)
#                 row['session_end'] = time
#                 row['duration'] = time - startTime
#                 if action == 'INPUT RECEIVED':
#                     row['inputs'] += ("," if len(row['inputs']) != 0 else "") + details.strip("'")
#                 elif action == 'RENDERED SCREEN' and details == 'SearchSelectorScreen':
#                     row['searches'] += 1
#                 elif action == 'RENDERED SCREEN' and details == 'IfFilterBYLocationScreen':
#                     row['filter_location'] += 1
#                 elif action == 'RENDERED SCREEN' and details == 'HelpScreen':
#                     row['helper_instruction'] += 1
#                 elif isHome and action == 'RENDERED SCREEN' and details == 'MenuHierarchyScreen - selecting a category':
#                     row['category_browse'] += 1
#                 elif isHome and action == 'RENDERED SCREEN' and details == 'MenuHierarchyScreen - selecting a district':
#                     row['category_browse'] += 1
#                 elif action == 'RENDERED SCREEN' and details and details.startswith('BusinessDetailsScreen'):
#                     row['businesses_visited'] += 1
#                     duration = pd.NaT
#                     next_line = file.peek()
#                     if next_line:
#                         next_time, _, _ = parseLine(next_line)
#                         duration = next_time - time
#                     business = {'id': details.split(' - for ')[1], 'duration': duration,'path': copy.copy(path)}
#                     row['businesses'].append(business)
#                 if action == 'RENDERED SCREEN':
#                     path.append(details)
#                     isHome = False
#                 return time, action, details

#             while line:
#                 time, action, details = handleReadLine()
#                 if not line: break
#                 if action == 'BACK PRESSED': # BACK PRESSED + RENDERED SCREEN (back button -- 99)
#                     row['back'] += 1
#                     _, _, details = handleReadLine()
#                     isHome = details == 'HomeScreen'
#                     if isHome:
#                         path = []
#                 elif action == 'HOME PRESSED': # HOME PRESSED + RENDERED SCREEN (home button -- 100)
#                     row['home'] += 1
#                     handleReadLine()
#                     isHome = True
#                     path = []
#                 elif action == 'EXIT PRESSED': # EXIT PRESSED + RENDERED SCREEN (session end button -- 109)
#                     row['exit'] += 1
#                     handleReadLine()
#         df = pd.concat([df, pd.DataFrame(row.values(), index=row.keys()).T], ignore_index=True)
#         # break

#     print(len(archive.namelist()))
# Show full cell contents instead of truncating long column values.
pd.set_option('display.max_colwidth', None)
df.head()

In [None]:
# Snapshot of the session table. to_csv writes the index by default, so reading
# this file back yields an extra 'Unnamed: 0' column.
df.to_csv('logs2_20_2024.csv')

In [None]:
# Column dtypes and non-null counts of the session table.
df.info()

In [None]:
# Sessions per group from 2024-01-01 onwards. `session_start` comes from read_csv
# without date parsing, so convert it before comparing (a no-op if already datetime).
sessions_since_2024 = pd.to_datetime(df['session_start']) >= pd.Timestamp('2024-01-01')
df[sessions_since_2024]['group'].value_counts()

# Data Viz Utils

In [None]:
def hist(df, cat = 'wakala_frequency', labels = {1: '1 time or less', 2: '2-5 times', 3:'5+ times'}):
    """Draw a bar chart of the percentage share of each value in column `cat`.

    Args:
        df: DataFrame holding the column to summarise.
        cat: Name of the categorical column.
        labels: Mapping from raw values to display labels (read only, never
            mutated). Values without an entry are shown as str(value) instead
            of raising KeyError.
    """
    val = 'percentage'
    counts = df[cat].value_counts()  # computed once; NaN values are excluded
    temp = pd.DataFrame({cat: [labels.get(v, str(v)) for v in counts.index], val: counts})
    temp[val] = temp[val] / temp[val].sum() * 100
    bar_labels = temp[val].round(1).astype('str') + '%'
    ax = temp.plot.bar(x=cat, y=val, rot=30)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    for container in ax.containers:
        ax.bar_label(container, labels=bar_labels)

# Analysis

In [None]:
# Number of survey rows per comfort_with_ussd answer.
df_survey.value_counts('comfort_with_ussd')

In [None]:
# Percentage of survey answers per comfort level; the long answer texts are
# mapped to short numbered labels for the x axis.
hist(df_survey, cat='comfort_with_ussd', labels = {
    'Very Comfortable--I can use USSD without help from anyone': '1. Very Comfortable', 
    'Somewhat comfortable--I may ask for help sometimes': '2. Somewhat Comfortable', 
    'Not very comfortable--I need help over 50% of the time': '3. Not comfortable', 
    'Uncomfortable--I always ask someone else to navigate the menus on my behalf': '4. Uncomfortable'
    })

In [None]:
# Tag every survey row with its study group, evaluated at the cut-off date.
# NOTE(review): str() of a float phone number would append '.0' before
# standardising -- confirm the phone_number column is read as int or str.
df_survey['group'] = df_survey.apply(lambda n: categorizeGroup(standard_format(str(n['phone_number'])), cutoffdate), axis = 1)

In [None]:
# Cross-tab: comfort level (rows) by group (columns); missing combinations shown as 0.
df_survey.groupby(['comfort_with_ussd', 'group'])['comfort_with_ussd'].count().unstack('group').fillna(0)

In [None]:
# Same cross-tab as a stacked bar chart; columns are ordered by first
# appearance of each group in the survey.
df2 = df_survey.groupby(['comfort_with_ussd', 'group'])['comfort_with_ussd'].count().unstack('group').fillna(0)
df2[df_survey['group'].unique()].plot(kind='bar', stacked=True)

In [None]:
# Number of survey rows with a non-null phone number.
df_survey['phone_number'].count()

In [None]:
# Number of distinct phone numbers in the survey.
df_survey['phone_number'].nunique()

In [None]:
# Sessions from whitelisted numbers only. The explicit copy lets later cells add
# columns without writing to a slice of `df` (SettingWithCopyWarning).
df_whitelisted = df[df['group'] != 'non-whitelisted'].copy()

In [None]:
def categorize1vs1234(number):
    """Return the survey comfort level (1-4) for a phone number, or -1 if unavailable.

    The number and the survey phone numbers are both standardised before
    matching; the first matching survey row wins. -1 is returned when the
    number cannot be converted, has no survey match, or the matched answer
    is not one of the four known texts.
    """
    labels = {
        'Very Comfortable--I can use USSD without help from anyone': 1,
        'Somewhat comfortable--I may ask for help sometimes': 2,
        'Not very comfortable--I need help over 50% of the time': 3,
        'Uncomfortable--I always ask someone else to navigate the menus on my behalf': 4
    }
    try:
        number = standard_format(str(int(number)))
        # NOTE: the survey column is re-standardised on every call.
        survey_numbers = df_survey['phone_number'].apply(lambda n: str(standard_format(str(n))))
        match = df_survey[survey_numbers == number]['comfort_with_ussd']
        return labels[list(match)[0]]
    except Exception:
        # Deliberate best effort, but no longer swallows KeyboardInterrupt/SystemExit
        # the way a bare `except:` does.
        return -1

In [None]:
# Attach the survey comfort level (1-4, or -1 when unavailable) to every whitelisted session.
df_whitelisted['1vs234'] = df_whitelisted.apply(lambda n: categorize1vs1234(n['phone_number']), axis = 1)

In [None]:
# Sessions per comfort level (-1 = no usable survey answer).
df_whitelisted['1vs234'].value_counts()

In [None]:
# Percentage of whitelisted sessions per comfort level.
hist(df_whitelisted, '1vs234', {-1: 'Not in Survey', 1: '1. Very Comfortable', 2: '2. Somewhat Comfortable', 3: '3. Not comfortable', 4: '4. Uncomfortable'})

In [None]:
# Group breakdown of the sessions that could not be matched to a survey answer.
df_whitelisted[df_whitelisted['1vs234'] == -1]['group'].value_counts()

In [None]:
# Stacked bars: sessions per comfort level, split by group. `df2` is reused
# here and replaces the survey cross-tab built in an earlier cell.
df2 = df_whitelisted.groupby(['1vs234', 'group'])['1vs234'].count().unstack('group').fillna(0)
df2[df_whitelisted['group'].unique()].rename({-1: 'Not in Survey', 1: '1. Very Comfortable', 2: '2. Somewhat Comfortable', 3: '3. Not comfortable', 4: '4. Uncomfortable'}).plot(kind='bar', stacked=True)

In [None]:
# Household respondents who answered the comfort question with anything other
# than "Very Comfortable" (comfort levels 2-4).
very_comfortable = 'Very Comfortable--I can use USSD without help from anyone'
is_household = df_survey['group'] == 'household'
answered = df_survey['comfort_with_ussd'].notna()
below_very_comfortable = df_survey['comfort_with_ussd'] != very_comfortable
control234 = df_survey[below_very_comfortable & is_household & answered]

In [None]:
# Sessions of the level 2-4 household respondents. Copy so the later
# `df_234['group'] = ...` assignment does not write to a slice of df_whitelisted.
# NOTE(review): isin() compares raw phone_number values, whereas
# categorize1vs1234 standardises both sides first -- confirm both columns share a format.
df_234 = df_whitelisted[df_whitelisted['phone_number'].isin(control234['phone_number'])].copy()
df_234.info()

In [None]:
# Distinct phone numbers among the level 2-4 sessions.
df_234['phone_number'].nunique()

In [None]:
# Overwrite the study group with the comfort bucket label used in the comparison plots.
df_234['group'] = '234'

In [None]:
# Household respondents who answered "Very Comfortable" (comfort level 1).
very_comfortable = 'Very Comfortable--I can use USSD without help from anyone'
is_household = df_survey['group'] == 'household'
answered = df_survey['comfort_with_ussd'].notna()
is_very_comfortable = df_survey['comfort_with_ussd'] == very_comfortable
control1 = df_survey[is_very_comfortable & is_household & answered]

In [None]:
# Sessions of the level 1 household respondents. Copy so the later
# `df_1['group'] = ...` assignment does not write to a slice of df_whitelisted.
# NOTE(review): isin() compares raw phone_number values, whereas
# categorize1vs1234 standardises both sides first -- confirm both columns share a format.
df_1 = df_whitelisted[df_whitelisted['phone_number'].isin(control1['phone_number'])].copy()
df_1.info()

In [None]:
# Distinct phone numbers among the level 1 sessions.
df_1['phone_number'].nunique()

In [None]:
# Overwrite the study group with the comfort bucket label used in the comparison plots.
df_1['group'] = '1'

In [None]:
# Combined sessions of both comfort buckets. The 'Unnamed: *' columns are index
# columns left over from CSV round trips; errors='ignore' keeps the cell working
# when the input file does not contain them.
df_comb = pd.concat([df_1, df_234]).drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
df_comb

In [None]:
def label_function(val, total=None):
    """Format `val` as 'value<newline>percent%' for a pie-chart legend.

    Args:
        val: The value to show, rounded to a whole number.
        total: Denominator for the percentage. Defaults to the number of rows
            in the global `df`, which was the previous hard-wired behaviour.
    """
    if total is None:
        total = len(df)
    return f'{val:.0f}\n{val / total * 100:.0f}%'

In [None]:
# Pie chart of distinct phone numbers per comfort bucket.
groups = df_comb.groupby('group')['phone_number'].nunique()
patches, texts = plt.pie(groups, textprops={'fontsize': 20}, colors=['tomato', 'gold', 'skyblue'])
# Percentages are shares of the plotted total; dividing unique phone numbers by
# the number of session rows in `df` mixes two different units.
total_numbers = groups.sum()
labels = [f'{group}-{val:.0f}\n{val / total_numbers * 100:.0f}%' for val, group in zip(groups, ['1. Comfortable', '234. Uncormfortable'])]
plt.legend(patches, labels, loc='center left', bbox_to_anchor=(-0.1, 1.), fontsize=8)
plt.title("Distribution of numbers")

In [None]:
# Pie chart of session counts per comfort bucket. Legend percentages come from
# label_function, i.e. they are relative to all rows of `df`, not to the pie total.
groups = df_comb.groupby('group').size()
patches, texts = plt.pie(groups, textprops={'fontsize': 20}, colors=['tomato', 'gold', 'skyblue'])
labels = [group + "-" + label_function(val) for val, group in zip(groups, ['1. Comfortable', '234. Uncormfortable'])]
plt.legend(patches, labels, loc='center left', bbox_to_anchor=(-0.1, 1.), fontsize=8)
plt.title("Distribution of sessions")

In [None]:
# Pie chart of sessions per distinct phone number, per comfort bucket.
# NOTE(review): label_function rounds this ratio to a whole number and divides
# it by len(df), so the legend percentage is probably not meaningful here.
groups = df_comb.groupby('group').size() / df_comb.groupby('group')['phone_number'].nunique()
patches, texts = plt.pie(groups, textprops={'fontsize': 20}, colors=['tomato', 'gold', 'skyblue'])
labels = [group + "-" + label_function(val) for val, group in zip(groups, ['1. Comfortable', '234. Uncormfortable'])]
plt.legend(patches, labels, loc='center left', bbox_to_anchor=(-0.1, 1.), fontsize=8)
plt.title("Distribution of Sessions/Number")

In [None]:
# Survey answers per comfort level.
df_survey['comfort_with_ussd'].value_counts()

In [None]:
# Pie chart of sessions per comfort bucket divided by a fixed head count per bucket.
# NOTE(review): 319 and 371 are hard-coded -- presumably respondent counts for
# bucket '1' and bucket '234' read off the survey counts; confirm, and derive
# them from df_survey so they cannot go stale. The title duplicates the previous chart's.
groups = df_comb.groupby('group').size() / np.array([319, 371])
patches, texts = plt.pie(groups, textprops={'fontsize': 20}, colors=['tomato', 'gold', 'skyblue'])
labels = [group + "-" + label_function(val) for val, group in zip(groups, ['1. Comfortable', '234. Uncormfortable'])]
plt.legend(patches, labels, loc='center left', bbox_to_anchor=(-0.1, 1.), fontsize=8)
plt.title("Distribution of Sessions/Number")

In [None]:
# Per comfort bucket, the share of sessions at each `searches` count (one stacked bar per bucket).
sessions_per_group = df_comb.groupby('group').size()
sessions_by_searches = df_comb.groupby(['group', 'searches']).size()
(sessions_by_searches / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `businesses_visited` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_visits = df_comb.groupby(['group', 'businesses_visited']).size()
(sessions_by_visits / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `category_browse` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_category_browse = df_comb.groupby(['group', 'category_browse']).size()
(sessions_by_category_browse / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `location_browse` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_location_browse = df_comb.groupby(['group', 'location_browse']).size()
(sessions_by_location_browse / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `back` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_back = df_comb.groupby(['group', 'back']).size()
(sessions_by_back / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `exit` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_exit = df_comb.groupby(['group', 'exit']).size()
(sessions_by_exit / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `home` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_home = df_comb.groupby(['group', 'home']).size()
(sessions_by_home / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Per comfort bucket, the share of sessions at each `filter_location` count.
sessions_per_group = df_comb.groupby('group').size()
sessions_by_filter_location = df_comb.groupby(['group', 'filter_location']).size()
(sessions_by_filter_location / sessions_per_group).unstack().plot(kind='bar', stacked=True)

In [None]:
# Density of session duration (seconds) per comfort bucket.
df_comb['session_start'] = pd.to_datetime(df_comb['session_start'])
df_comb['session_end'] = pd.to_datetime(df_comb['session_end'])
# total_seconds() gives plain float seconds on every pandas version. On pandas >= 2,
# astype('timedelta64[s]') keeps a timedelta dtype, which the KDE plot cannot use.
df_comb['duration'] = (df_comb['session_end'] - df_comb['session_start']).dt.total_seconds()
fig, ax = plt.subplots(figsize=(8,5))
for label, df1 in df_comb.groupby('group'):
    df1['duration'].plot(kind="kde", ax=ax, label=label)
ax.set_xlabel("Session Duration (seconds)")
ax.set_xlim(0, 800)
ax.legend(loc="upper left")

In [None]:
# Density of session start dates per comfort bucket, in days since 2022-11-01.
reference_date = datetime(2022, 11, 1, 0, 0)
one_day = np.timedelta64(1, 'D')
df_comb['time_after'] = (df_comb['session_start'] - reference_date) / one_day
fig, ax = plt.subplots(figsize=(8,5))
for bucket, bucket_sessions in df_comb.groupby('group'):
    bucket_sessions['time_after'].plot(kind="kde", ax=ax, label=bucket)
ax.set_xlabel("Session start date after November 1st")
ax.set_xlim(0, 290)
ax.legend(loc="upper left")