In [1]:
import pandas as pd
import numpy as np

In [2]:
import sqlite3
conn = sqlite3.connect('dlight.db')
cursor=conn.cursor()

In [3]:
#load data
service_tickets = pd.read_csv('../data/bi_case_calls_combined.csv')
contracts = pd.read_csv('../data/bi_case_contracts_combined.csv')
payments = pd.read_csv('../data/bi_case_payments_combined.csv')
calls = pd.read_csv('../data/bi_case_calls_combined.csv')

In [4]:
#convert contracts to table
contracts.to_sql('contracts', conn, index=False, if_exists='replace')


486623

In [5]:
# clean contract columns
contracts.columns = (
        contracts.columns
          .str.strip()
          .str.lower()
          .str.replace(' ', '_')
          .str.replace(r'[^\w]', '', regex=True)
    )

In [6]:
#look at datatypes
contracts.dtypes

contractid             int64
sales_month           object
region                object
product               object
sales_person_id      float64
contract_type         object
price_usd            float64
payment_frequency     object
perc_deposit         float64
tenor_length         float64
daily_amount_usd     float64
customer_gender       object
household_size        object
occupation            object
dtype: object

In [7]:
contracts.isna().sum()

contractid                0
sales_month               0
region                    0
product                2166
sales_person_id        9720
contract_type             0
price_usd              4561
payment_frequency     25341
perc_deposit          25352
tenor_length          25341
daily_amount_usd      29080
customer_gender           0
household_size       476253
occupation            59683
dtype: int64

In [8]:
contracts['household_size'].notna().sum()

10370

In [9]:
# standardize contract id
contracts.rename({'contractid' : 'contract_id'}, inplace= True)

In [10]:
# convert sales_month to date
pd.to_datetime(contracts['sales_month'],format="%Y-%m-%d")

0        2024-10-31
1        2025-04-30
2        2024-03-31
3        2024-05-31
4        2025-06-30
            ...    
486618   2025-09-30
486619   2025-09-30
486620   2025-09-30
486621   2025-09-30
486622   2025-09-30
Name: sales_month, Length: 486623, dtype: datetime64[ns]

In [11]:
# convert house hold size to int
#contracts['household_size'] = contracts['household_size'].astype('Int64')

In [12]:
# take a close look at the household column
"""It appears that household size has mixed data and contains names and contacts"""

household = """SELECT household_size
                FROM contracts
                WHERE household_size IS NOT NULL"""
pd.read_sql(household, conn)

Unnamed: 0,household_size
0,4
1,3
2,4
3,6
4,3
...,...
10365,256746372749 timwesige Jackson
10366,3
10367,256764718509 Nigesa marry
10368,256705097664 Nabbanja Annet


In [13]:
#strip white spaces
col = 'household_size'
contracts[col] = contracts[col].astype(str).str.strip()


In [14]:
"""after physcially accessing the contracts csv file,it appears household sizes range from
1 to 9 members, so i'll isolate the actual household sizes to their own separate rows """

contracts['actual_household_size'] = contracts[col].where(contracts[col].str.match(r'^[1-9]$')).astype('Int64')

In [15]:
# have a closer look at the column
contracts['actual_household_size'].unique()

<IntegerArray>
[<NA>, 4, 3, 6, 1, 7, 8, 5, 2, 9]
Length: 10, dtype: Int64

In [16]:
"""create a boolean filter to separate names from phone numbers """
# Filters actual household sizes from mixed datatypes that contains names and contacts
mask_household_size = contracts['household_size'].str.match(r'^\d{1,2}$')

# filters for rows with actual household sizes and false for mixed datatypes(names+contacts)
mask_phone_name = ~mask_household_size

In [17]:
#create a phone number column
"""The regex \d{9,15} ensures that only numbers with 9-15 digits are considered.
 This is to avoid single digits like household sizes from being matched as phone numbers """

contracts.loc[mask_phone_name, 'phone_number'] = contracts.loc[mask_phone_name, 'household_size'].str.extract(r'^(\d{9,15})')[0]

In [18]:
# have a closer look at the phone number column
contracts['phone_number'].unique()

array([nan, '256776921622', '256751507695', ..., '256764718509',
       '256705097664', '256760335303'], dtype=object)

In [19]:
# Extract names
contracts.loc[mask_phone_name, 'name'] = contracts.loc[mask_phone_name, 'household_size'].str.extract(r'^\d{9,15}\s+(.+)$')[0]

In [20]:
contracts['name'].unique()


array([nan, 'Erias Matovu', 'UPAKRWOTH  DANIEL', ..., 'Nigesa marry',
       'Nabbanja Annet', 'Mukasa  Bonifasi Abubaker'], dtype=object)

In [21]:
# standardize names
contracts['name'] = contracts['name'].str.title()

In [22]:
"""Values that  meet exceptions for household size, phone number and names"""
counts_updates = (
    contracts['actual_household_size'].notna() |
    contracts['phone_number'].notna() |
    contracts['name'].notna()
)

counts_updates.sum()

9989

In [23]:
"""Since householdsize had been converted to string type, null values will be replaced with nan. So ensure
we properly fill in the null values"""

contracts['household_size'] = contracts['household_size'].replace(["", "nan", None], np.nan)


In [24]:
unclassified = contracts[
    contracts['household_size'].notna() & ~counts_updates][['household_size']]

unclassified.sample(40)


Unnamed: 0,household_size
102314,Nakyanzi Rose 256751370487
33390,TUMWEHEIRE SYLIVIA 256707342184
342030,Nangobi Besi\t 256780693951
194897,Kabeta Fatuma 256705730756
285704,02
341148,03
16155,11
30186,10
52068,776-582-132 Leiya Edea
100165,256 Christine Batahurira


It appears that some households have two-digit members. Additionally, several phone numbers and names were not correctly extracted because some numbers contain special characters, and in certain cases, the entries start with names instead of numbers.

capture two digit households


In [25]:
# Replace NaN with False so missing values are not treated as valid two-digit households
two_digit_house = contracts['household_size'].str.match(r'^\d{1,2}$').astype('boolean').fillna(False)

#extract two digit households
contracts.loc[two_digit_house, 'actual_household_size'] = contracts.loc[two_digit_house, 'household_size'].astype('Int64')

In [26]:
contracts['actual_household_size'].unique()

<IntegerArray>
[<NA>, 4, 3, 6, 1, 7, 8, 5, 2, 9, 13, 10, 14, 11, 12, 20, 0, 25, 15, 16, 17,
 19]
Length: 22, dtype: Int64

In [27]:
"""create a boolean filter to separate names from phone numbers """
# filters actual household sizes from mixed datatypes that contains names and contacts
mask_household_size = contracts['household_size'].str.match(r'^\d{1,2}$').astype('bool').fillna(False)
# filter  rows that are not actual household sizes (names + contacts)
mask_phone_name = ~mask_household_size

In [28]:
# extract phone numbers either in the beginning or end of the string. Also phone number can start with +
contracts.loc[mask_phone_name, 'phone_number'] = (contracts.loc[mask_phone_name, 'household_size'].str.extract(r'(\+?[\d]{9,15})')[0]  
)
# clean up phone numbers by removing special characters and spaces
contracts['phone_number'] = contracts['phone_number'].str.replace(r'[\+\-\(\)\s]', '', regex=True)


In [29]:
contracts['phone_number'].unique()

array([nan, '256776921622', '256751507695', ..., '256764718509',
       '256705097664', '256760335303'], dtype=object)

In [30]:
# extract names
contracts.loc[mask_phone_name, 'name'] = contracts.loc[mask_phone_name, 'household_size'].str.replace(r'(\+?256[\d\-]{6,12}|\d{9,15})', '', regex=True).str.strip()


In [31]:
#standarize names
contracts['name'] = contracts['name'].str.title()
contracts['name'].unique()

array([nan, 'Erias Matovu', ',  Isubbi Samson', ..., 'Nigesa Marry',
       'Nabbanja Annet', 'Mukasa  Bonifasi Abubaker'], dtype=object)

Cannot standardize phone numbers to add country codes because its unclear whether all the contacts are from Ugandan nationals

In [32]:
# Check updates values
contracts[['actual_household_size', 'phone_number', 'name']].notna().sum()


actual_household_size    8308
phone_number             1968
name                     2056
dtype: int64

In [33]:
# Take a closer look at the null values
contracts.isna().sum()

contractid                    0
sales_month                   0
region                        0
product                    2166
sales_person_id            9720
contract_type                 0
price_usd                  4561
payment_frequency         25341
perc_deposit              25352
tenor_length              25341
daily_amount_usd          29080
customer_gender               0
household_size           476259
occupation                59683
actual_household_size    478315
phone_number             484655
name                     484567
dtype: int64

In [34]:
#convert contracts to sql
contracts.to_sql('contracts', conn, index=False, if_exists='replace')


486623

In [35]:
# test connection
test = """SELECT * FROM contracts

            WHERE phone_number IS NOT NULL
            AND name IS NOT NULL"""
pd.read_sql(test, conn)


Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name
0,3579445,2025-05-31,Southern,Small Solar,207965.0,FINANCED,150.0,DAILY,0.0426,450.0,0.319,Male,256776921622 Erias Matovu,Business,,256776921622,Erias Matovu
1,3156433,2024-11-30,Southern,Small Solar,693867.0,FINANCED,150.0,DAILY,0.0371,450.0,0.321,Female,"256751507695, Isubbi Samson",Business,,256751507695,", Isubbi Samson"
2,3103570,2024-10-31,Southern,Large Solar - Generation 1,652804.0,FINANCED,300.0,DAILY,0.0569,540.0,0.524,Female,256745672789 UPAKRWOTH DANIEL,Farmer,,256745672789,Upakrwoth Daniel
3,3813825,2025-09-30,Southern,Small Solar,506788.0,FINANCED,150.0,DAILY,0.0495,480.0,0.297,Female,256706348163 wanade Qassim,Teacher,,256706348163,Wanade Qassim
4,3211103,2024-12-31,Southern,Small Solar,708525.0,FINANCED,150.0,DAILY,0.0371,450.0,0.321,Male,256745565318 Opiyo Geoffrey James,Business,,256745565318,Opiyo Geoffrey James
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963,3754782,2025-08-31,Southern,Large Solar - Generation 2,927588.0,FINANCED,280.0,DAILY,0.0495,480.0,0.554,Male,256781860689 Nzyabake Venelanda Rubasaka,Business,,256781860689,Nzyabake Venelanda Rubasaka
1964,3766446,2025-08-31,Southern,Small Solar,809876.0,FINANCED,150.0,DAILY,0.0495,480.0,0.297,Male,256746372749 timwesige Jackson,Farmer,,256746372749,Timwesige Jackson
1965,3789151,2025-09-30,Southern,Small Solar,472915.0,FINANCED,150.0,DAILY,0.0495,480.0,0.297,Female,256764718509 Nigesa marry,Farmer,,256764718509,Nigesa Marry
1966,3790585,2025-09-30,Southern,Small Solar,272008.0,CASH,150.0,,,,,Male,256705097664 Nabbanja Annet,Business,,256705097664,Nabbanja Annet


## Clean Null Values

In [36]:
q1 = """SELECT contract_type, price_usd,perc_deposit, tenor_length, daily_amount_usd
        FROM contracts
        WHERE contract_type = 'FINANCED'
        LIMIT 5 """
pd.read_sql(q1,conn)

Unnamed: 0,contract_type,price_usd,perc_deposit,tenor_length,daily_amount_usd
0,FINANCED,150.0,0.051,434.0,0.328
1,FINANCED,150.0,0.0313,434.0,0.335
2,FINANCED,150.0,0.0554,434.0,0.326
3,FINANCED,200.0,0.1827,189.0,0.865
4,FINANCED,,0.0296,364.0,


Contracts classified as CASH (one-time payment) do not have financing terms


In [37]:
q2 = """SELECT contract_type, payment_frequency, price_usd,perc_deposit, tenor_length, daily_amount_usd
        FROM contracts
        WHERE contract_type = 'CASH'
        LIMIT 5 """
pd.read_sql(q2,conn)      

Unnamed: 0,contract_type,payment_frequency,price_usd,perc_deposit,tenor_length,daily_amount_usd
0,CASH,,150.0,,,
1,CASH,,150.0,,,
2,CASH,,300.0,,,
3,CASH,,150.0,,,
4,CASH,,150.0,,,


In [38]:
"""We have 11 instances where tenor_length is not null"""

q4 = """SELECT contract_type,
       SUM(CASE WHEN payment_frequency IS NULL THEN 1 ELSE 0 END) AS pf_nulls,
       SUM(CASE WHEN perc_deposit IS NULL THEN 1 ELSE 0 END) AS deposit_nulls,
       SUM(CASE WHEN tenor_length IS NULL THEN 1 ELSE 0 END) AS tenor_nulls,
       SUM(CASE WHEN daily_amount_usd IS NULL THEN 1 ELSE 0 END) AS daily_nulls,
       COUNT(*) AS total_contracts
FROM contracts
GROUP BY contract_type"""

q4=pd.read_sql(q4,conn)
q4.head()


Unnamed: 0,contract_type,pf_nulls,deposit_nulls,tenor_nulls,daily_nulls,total_contracts
0,CASH,25341,25352,25341,25352,25352
1,FINANCED,0,0,0,3728,461271


In [39]:
q5 = """SELECT COUNT(*) AS invalid_cash_contracts
        FROM contracts
        WHERE contract_type = 'CASH'
        AND(
        payment_frequency IS NOT NULL
        OR perc_deposit IS NOT NULL
        OR daily_amount_usd IS NOT NULL
        OR tenor_length IS NOT NULL)
        """
pd.read_sql(q5 , conn)


Unnamed: 0,invalid_cash_contracts
0,11


In [40]:
# In these 11 cash contracts, payment_frequency is written as "daily" and tenor_length is 0
q6 = """SELECT * FROM contracts
        WHERE contract_type = 'CASH'
        AND(
        payment_frequency IS NOT NULL
        OR tenor_length IS NOT NULL)"""

pd.read_sql(q6, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name
0,2708359,2024-01-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,,Business,,,
1,2686003,2024-01-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Female,,,,,
2,2704071,2024-01-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Female,,Business,,,
3,2709900,2024-01-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,,Other,,,
4,2686001,2024-01-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,,Other,,,
5,2929007,2024-06-30,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,,Business,,,
6,2908726,2024-06-30,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,4.0,Labourer,4.0,,
7,3453167,2025-03-31,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Male,,,,,
8,2761329,2024-02-29,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Female,,,,,
9,2905940,2024-06-30,Northern,PAYGO_PHONE,,CASH,200.0,DAILY,,0.0,,Female,3.0,Business,3.0,,


In the documentation, 
- perc_deposit for CASH contracts is 1 and for FINANCED contracts is the deposit required (in USD) as percentage of total value.

- tenor_length is number of days (even if they are not required to pay daily this value is in DAYS) customer must pay to fully pay off the contract

***Therefore, ill fill the null tenor length values with 0 to represent 0 days as cash contracts do not have tenor length and fill null perc deposit with 1. Ill replace the 'Daily' payment frequency with NULL as Cash payments are a one-time payment*** 


In [41]:
update_contracts = """
UPDATE contracts
SET 
    perc_deposit = 1,               -- full payment upfront
    tenor_length = 0,
    payment_frequency = NULL

WHERE contract_type = 'CASH'
"""
# execute the update
cursor.execute(update_contracts)

# commit the changes
conn.commit()

In [None]:
# inspect 
nulls = """SELECT * FROM contracts
        WHERE contract_type = 'CASH'
        AND(
        payment_frequency IS NOT NULL
        OR tenor_length IS NOT NULL)"""

       
pd.read_sql(nulls , conn)


Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name
0,3591244,2025-05-31,Southern,Small Solar,418845.0,CASH,150.0,,1.0,0.0,,Female,6,Labourer,6.0,,
1,2833147,2024-04-30,Southern,Small Solar,577862.0,CASH,150.0,,1.0,0.0,,Female,,Business,,,
2,3321226,2025-01-31,Southern,Large Solar - Generation 1,534838.0,CASH,300.0,,1.0,0.0,,Female,3,Other,3.0,,
3,2849697,2024-04-30,Western,Small Solar,226990.0,CASH,150.0,,1.0,0.0,,Male,,Business,,,
4,2803810,2024-03-31,Western,Small Solar,353729.0,CASH,150.0,,1.0,0.0,,Male,5,Government Employee,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25347,3815513,2025-09-30,Western,Small Solar,961082.0,CASH,150.0,,1.0,0.0,,Male,,Business,,,
25348,3815643,2025-09-30,Western,Large Solar - Generation 2,978783.0,CASH,280.0,,1.0,0.0,,Female,,,,,
25349,3815672,2025-09-30,Western,Large Solar - Generation 2,291803.0,CASH,280.0,,1.0,0.0,,Female,,,,,
25350,3818975,2025-09-30,Western,Small Solar,148824.0,CASH,150.0,,1.0,0.0,,Female,,Business,,,


Take a closer look at financed columns

In [43]:
financed_nulls = """SELECT
    SUM(CASE WHEN payment_frequency IS NULL THEN 1 ELSE 0 END) AS pf_nulls,
    SUM(CASE WHEN perc_deposit IS NULL THEN 1 ELSE 0 END) AS deposit_nulls,
    SUM(CASE WHEN tenor_length IS NULL THEN 1 ELSE 0 END) AS tenor_nulls,
    SUM(CASE WHEN daily_amount_usd IS NULL THEN 1 ELSE 0 END) AS daily_nulls
FROM contracts
WHERE contract_type = 'FINANCED'
"""
pd.read_sql(financed_nulls,conn)

Unnamed: 0,pf_nulls,deposit_nulls,tenor_nulls,daily_nulls
0,0,0,0,3728


In [44]:
f_nulls = """ SELECT * FROM contracts
              WHERE daily_amount_usd IS NULL
              AND contract_type = 'FINANCED' """

pd.read_sql(f_nulls, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name
0,3674681,2025-06-30,Western,Small Solar,813534.0,FINANCED,,WEEKLY,0.0296,364.0,,Male,,Business,,,
1,3443355,2025-03-31,Southern,Large Solar - Generation 2,413796.0,FINANCED,,DAILY,0.0442,540.0,,Male,,Business,,,
2,3108860,2024-10-31,Western,,782850.0,FINANCED,,WEEKLY,0.0374,665.0,,Male,,Farmer,,,
3,2888739,2024-05-31,Northern,,242074.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,
4,3039294,2024-08-31,Northern,,307572.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3723,3814403,2025-09-30,Northern,,947527.0,FINANCED,,DAILY,0.0560,640.0,,Female,,,,,
3724,3756025,2025-08-31,Southern,Large Solar - Generation 2,958705.0,FINANCED,,DAILY,0.0495,480.0,,Male,,Farmer,,,
3725,3761205,2025-08-31,Western,Small Solar,125352.0,FINANCED,,WEEKLY,0.0253,434.0,,Female,,Other,,,
3726,3800719,2025-09-30,Western,Small Solar,208812.0,FINANCED,,WEEKLY,0.0313,434.0,,Female,,Farmer,,,


Assumption for filling missing daily_amount_usd in FINANCED contracts:
- Using the available fields: payment_frequency, perc_deposit, and tenor_length, we can make an assumption to estimate price and daily payment.
- Estimate method (assumption):
    1. Estimate price_usd using the average price of the same product where price_usd is missing.
    2. Compute financed amount = price_usd * (1 - perc_deposit)
    3. Compute daily_amount_usd = financed_amount / tenor_length
- This is purely an assumption for data completeness and may not reflect the actual contract value.


In [45]:
# Add a flag column to this track assumptions
assumptions_flag = """
ALTER TABLE contracts 
ADD COLUMN assumption_flag VARCHAR(20)"""

cursor.execute(assumptions_flag)
conn.commit()

In [46]:
"""create an assumption flag to show all the values that have been imputed"""

assumptions = """UPDATE contracts
                SET assumption_flag = CASE
                      WHEN price_usd IS NULL OR daily_amount_usd IS NULL 
                      THEN 'ASSUMPTION'
                      ELSE 'ORIGINAL'
                      END """
# execute the update
cursor.execute(assumptions)

# commit the changes
conn.commit()

In [47]:
updates_nulls_prices = """ 
UPDATE CONTRACTS
SET price_usd = CASE
                    WHEN product = 'Small Solar' THEN 150
                    WHEN product = 'PAYGO_PHONE' THEN 200
                    WHEN product = 'Large Solar - Generation 1' THEN 300
                    WHEN product = 'Large Solar - Generation 2' THEN 280
                    WHEN product = 'PAYGO_PORTABLE' THEN 100
                    END
WHERE contract_type = 'FINANCED'
AND price_usd IS NULL """

# execute the update
cursor.execute(updates_nulls_prices)

# commit the changes
conn.commit()

In [48]:
# Compute missing daily_amount_usd based on price, perc_deposit, and tenor_length

update_daily_amt = """
UPDATE contracts
SET daily_amount_usd = (price_usd * (1 - perc_deposit)) / tenor_length
WHERE contract_type = 'FINANCED'
  AND daily_amount_usd IS NULL
  AND price_usd IS NOT NULL
  AND perc_deposit IS NOT NULL
  AND tenor_length IS NOT NULL"""
cursor.execute(update_daily_amt)
conn.commit()

In [49]:
"""check number of null values left"""
nulls = """SELECT * 
FROM contracts
WHERE contract_type = 'FINANCED'
AND daily_amount_usd IS NULL """

pd.read_sql(nulls, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag
0,3108860,2024-10-31,Western,,782850.0,FINANCED,,WEEKLY,0.0374,665.0,,Male,,Farmer,,,,ASSUMPTION
1,2888739,2024-05-31,Northern,,242074.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION
2,3039294,2024-08-31,Northern,,307572.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION
3,3622217,2025-05-31,Northern,,844770.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Teacher,,,,ASSUMPTION
4,2852364,2024-04-30,Northern,,277342.0,FINANCED,,DAILY,0.0442,540.0,,Female,,Farmer,,,,ASSUMPTION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,3756624,2025-08-31,Western,,979914.0,FINANCED,,WEEKLY,0.0324,504.0,,Male,,Business,,,,ASSUMPTION
1452,3654080,2025-06-30,Northern,,874439.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Farmer,,,,ASSUMPTION
1453,3674495,2025-06-30,Western,,917325.0,FINANCED,,WEEKLY,0.0328,371.0,,Male,,Business,,,,ASSUMPTION
1454,3687403,2025-07-31,Northern,,366638.0,FINANCED,,DAILY,0.0560,640.0,,Female,,Farmer,,,,ASSUMPTION


There are 1456 null values left. Some price null values also have product field as null. Without product type we cannot assign a price using assumptions.


In [50]:
null_products = """
SELECT *
FROM contracts
WHERE contract_type = 'FINANCED'
AND product IS NULL """

pd.read_sql(null_products, conn)



Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag
0,3108860,2024-10-31,Western,,782850.0,FINANCED,,WEEKLY,0.0374,665.0,,Male,,Farmer,,,,ASSUMPTION
1,2888739,2024-05-31,Northern,,242074.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION
2,3039294,2024-08-31,Northern,,307572.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION
3,3622217,2025-05-31,Northern,,844770.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Teacher,,,,ASSUMPTION
4,2852364,2024-04-30,Northern,,277342.0,FINANCED,,DAILY,0.0442,540.0,,Female,,Farmer,,,,ASSUMPTION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,3756624,2025-08-31,Western,,979914.0,FINANCED,,WEEKLY,0.0324,504.0,,Male,,Business,,,,ASSUMPTION
1452,3654080,2025-06-30,Northern,,874439.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Farmer,,,,ASSUMPTION
1453,3674495,2025-06-30,Western,,917325.0,FINANCED,,WEEKLY,0.0328,371.0,,Male,,Business,,,,ASSUMPTION
1454,3687403,2025-07-31,Northern,,366638.0,FINANCED,,DAILY,0.0560,640.0,,Female,,Farmer,,,,ASSUMPTION


The 1456 null values also coincide with number of rows will no product info.

- Flag all records without products by creating a new table and copy incomplete rows into it


In [53]:
#create table to move all records with null product values for further analysis
table_creation= """CREATE TABLE null_financed_products AS
SELECT *
FROM contracts
WHERE contract_type = 'FINANCED'
AND product IS NULL"""

cursor.execute(table_creation)
conn.commit

<function Connection.commit()>

In [54]:
# create an incomplete flag column that flags all columns with null product values
flag_complete = """ALTER TABLE contracts
                    ADD incomplete_flag VARCHAR(20)"""
cursor.execute(flag_complete)
conn.commit

<function Connection.commit()>

In [55]:
#set default
update = """UPDATE contracts
                    SET incomplete_flag = 'COMPLETE'
                    WHERE incomplete_flag IS NULL"""
cursor.execute(update)
conn.commit()

In [55]:
q7 = """SELECT * FROM contracts"""
pd.read_sql(q7 , conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3098268,2024-10-31,Western,Small Solar,154077.0,FINANCED,150.0,WEEKLY,0.0510,434.0,0.32800,Male,,Teacher,,,,ORIGINAL,COMPLETE
1,3554125,2025-04-30,Western,Small Solar,878345.0,FINANCED,150.0,WEEKLY,0.0313,434.0,0.33500,Male,,Teacher,,,,ORIGINAL,COMPLETE
2,2792627,2024-03-31,Western,Small Solar,648775.0,FINANCED,150.0,WEEKLY,0.0554,434.0,0.32600,Male,4,Government Employee,4.0,,,ORIGINAL,COMPLETE
3,2869807,2024-05-31,Western,PAYGO_PHONE,328066.0,FINANCED,200.0,WEEKLY,0.1827,189.0,0.86500,Male,,Business,,,,ORIGINAL,COMPLETE
4,3674681,2025-06-30,Western,Small Solar,813534.0,FINANCED,150.0,WEEKLY,0.0296,364.0,0.39989,Male,,Business,,,,ASSUMPTION,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486618,3817296,2025-09-30,Southern,Small Solar,670507.0,FINANCED,150.0,DAILY,0.0495,480.0,0.29700,Male,,Labourer,,,,ORIGINAL,COMPLETE
486619,3817534,2025-09-30,Southern,Small Solar,884340.0,FINANCED,150.0,DAILY,0.0495,480.0,0.29700,Female,,Business,,,,ORIGINAL,COMPLETE
486620,3817434,2025-09-30,Northern,PAYGO_PORTABLE,246897.0,FINANCED,100.0,TRIDAILY,0.0401,399.0,0.24100,Male,,Farmer,,,,ORIGINAL,COMPLETE
486621,3817380,2025-09-30,Northern,Large Solar - Generation 2,500619.0,FINANCED,280.0,DAILY,0.0565,640.0,0.41300,Female,,Business,,,,ORIGINAL,COMPLETE


check for more null values

In [57]:
q8 = """select * from contracts
        WHERE contract_type = 'CASH'
        AND product IS NULL
"""
pd.read_sql(q8 , conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,2711666,2024-01-31,Northern,,509405.0,CASH,,,1.0,0.0,,Female,2,Business,2.0,,,ASSUMPTION,COMPLETE
1,2702597,2024-01-31,Northern,,369385.0,CASH,,,1.0,0.0,,Male,3,Farmer,3.0,,,ASSUMPTION,COMPLETE
2,2697995,2024-01-31,Northern,,442594.0,CASH,,,1.0,0.0,,Female,8,Business,8.0,,,ASSUMPTION,COMPLETE
3,2714710,2024-01-31,Northern,,393223.0,CASH,,,1.0,0.0,,Male,2,Business,2.0,,,ASSUMPTION,COMPLETE
4,3093451,2024-09-30,Western,,182684.0,CASH,,,1.0,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,3165594,2024-11-30,Southern,,839926.0,CASH,,,1.0,0.0,,Female,,Farmer,,,,ASSUMPTION,COMPLETE
706,3788006,2025-09-30,Southern,,309405.0,CASH,,,1.0,0.0,,Female,5,Business,5.0,,,ASSUMPTION,COMPLETE
707,3694870,2025-07-31,Southern,,,CASH,,,1.0,0.0,,Male,5,Farmer,5.0,,,ASSUMPTION,COMPLETE
708,3711431,2025-07-31,Eastern,,120402.0,CASH,,,1.0,0.0,,Male,,,,,,ASSUMPTION,COMPLETE


In [58]:
"""Some cash contracts have the product type but dont have the product price (price_usd). check for instances with
product but no price_usd """

price_null = """SELECT * FROM contracts WHERE product IS NOT NULL AND price_usd IS NULL;
"""
pd.read_sql(price_null , conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3517826,2025-04-30,Southern,Small Solar,299803.0,CASH,,,1.0,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
1,3516113,2025-04-30,Western,Large Solar - Generation 1,899264.0,CASH,,,1.0,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2,2681810,2024-01-31,Northern,Large Solar - Generation 1,,CASH,,,1.0,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
3,2850545,2024-04-30,Northern,PAYGO_PORTABLE,906267.0,CASH,,,1.0,0.0,,Male,,Other,,,,ASSUMPTION,COMPLETE
4,3776342,2025-08-31,Western,Small Solar,567163.0,CASH,,,1.0,0.0,,Male,4,Business,4.0,,,ASSUMPTION,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2937716,2024-06-30,Northern,PAYGO_PORTABLE,365572.0,CASH,,,1.0,0.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
119,3618473,2025-05-31,Southern,Large Solar - Generation 2,475515.0,CASH,,,1.0,0.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
120,3456969,2025-03-31,Northern,Small Solar,297565.0,CASH,,,1.0,0.0,,Female,,,,,,ASSUMPTION,COMPLETE
121,3511106,2025-04-30,Western,Small Solar,973414.0,CASH,,,1.0,0.0,,Male,,Other,,,,ASSUMPTION,COMPLETE


In [59]:
"""There are 123 cash rows that have product but no product price. Use assumptions to fill them"""
update_price = """
    UPDATE contracts
    SET price_usd = CASE 
                    WHEN product = 'Small Solar' THEN 150
                    WHEN product = 'PAYGO_PHONE' THEN 200
                    WHEN product = 'Large Solar - Generation 1' THEN 300
                    WHEN product = 'Large Solar - Generation 2' THEN 280
                    WHEN product = 'PAYGO_PORTABLE' THEN 100
                    ELSE price_usd  
                END
WHERE contract_type = 'CASH'
  AND price_usd IS NULL;
"""
cursor.execute(update_price)
conn.commit()

In [60]:
price_null = """SELECT  * FROM contracts WHERE price_usd IS NULL
"""
pd.read_sql(price_null, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3108860,2024-10-31,Western,,782850.0,FINANCED,,WEEKLY,0.0374,665.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
1,2888739,2024-05-31,Northern,,242074.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2,2711666,2024-01-31,Northern,,509405.0,CASH,,,1.0000,0.0,,Female,2,Business,2.0,,,ASSUMPTION,COMPLETE
3,3039294,2024-08-31,Northern,,307572.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
4,2702597,2024-01-31,Northern,,369385.0,CASH,,,1.0000,0.0,,Male,3,Farmer,3.0,,,ASSUMPTION,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2161,3654080,2025-06-30,Northern,,874439.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
2162,3674495,2025-06-30,Western,,917325.0,FINANCED,,WEEKLY,0.0328,371.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2163,3787510,2025-09-30,Western,,675932.0,CASH,,,1.0000,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2164,3687403,2025-07-31,Northern,,366638.0,FINANCED,,DAILY,0.0560,640.0,,Female,,Farmer,,,,ASSUMPTION,COMPLETE


In [61]:
product_null = """SELECT * FROM contracts WHERE product IS NULL"""
pd.read_sql(product_null, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3108860,2024-10-31,Western,,782850.0,FINANCED,,WEEKLY,0.0374,665.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
1,2888739,2024-05-31,Northern,,242074.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2,2711666,2024-01-31,Northern,,509405.0,CASH,,,1.0000,0.0,,Female,2,Business,2.0,,,ASSUMPTION,COMPLETE
3,3039294,2024-08-31,Northern,,307572.0,FINANCED,,DAILY,0.0576,640.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
4,2702597,2024-01-31,Northern,,369385.0,CASH,,,1.0000,0.0,,Male,3,Farmer,3.0,,,ASSUMPTION,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2161,3654080,2025-06-30,Northern,,874439.0,FINANCED,,DAILY,0.0560,640.0,,Male,,Farmer,,,,ASSUMPTION,COMPLETE
2162,3674495,2025-06-30,Western,,917325.0,FINANCED,,WEEKLY,0.0328,371.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2163,3787510,2025-09-30,Western,,675932.0,CASH,,,1.0000,0.0,,Male,,Business,,,,ASSUMPTION,COMPLETE
2164,3687403,2025-07-31,Northern,,366638.0,FINANCED,,DAILY,0.0560,640.0,,Female,,Farmer,,,,ASSUMPTION,COMPLETE


There are 2166 fields left. This are fields that dont have any product values hence no procuct price value. I will flag all columns with null values

The incomplete_flag counts the number of missing important fields
- Always checks `product` and `price_usd` for all contracts
- Checks `tenor_length`, `daily_amount_usd`, `perc_deposit` only for FINANCED contracts as this fields are only relevant to this contract type

***The result is the total number of missing fields relevant to the contract type***


In [62]:
fill_incomplete_flag = """UPDATE contracts
                        SET incomplete_flag = COALESCE(
                        CAST(
                        (CASE WHEN product IS NULL THEN 1 ELSE 0 END) +
                        (CASE WHEN sales_person_id IS NULL THEN 1 ELSE 0 END)+
                        (CASE WHEN price_usd IS NULL THEN 1 ELSE 0 END) +
                        (CASE WHEN contract_type = 'FINANCED' AND tenor_length IS NULL THEN 1 ELSE 0 END) +
                        (CASE WHEN contract_type = 'FINANCED' AND daily_amount_usd IS NULL THEN 1 ELSE 0 END) +
                        (CASE WHEN contract_type = 'FINANCED' AND perc_deposit IS NULL THEN 1 ELSE 0 END)
                        AS TEXT),
                            '0'
)"""  
cursor.execute(fill_incomplete_flag)
conn.commit()



In [63]:
q9 = """SELECT * FROM contracts"""
pd.read_sql(q9, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3098268,2024-10-31,Western,Small Solar,154077.0,FINANCED,150.0,WEEKLY,0.0510,434.0,0.32800,Male,,Teacher,,,,ORIGINAL,0
1,3554125,2025-04-30,Western,Small Solar,878345.0,FINANCED,150.0,WEEKLY,0.0313,434.0,0.33500,Male,,Teacher,,,,ORIGINAL,0
2,2792627,2024-03-31,Western,Small Solar,648775.0,FINANCED,150.0,WEEKLY,0.0554,434.0,0.32600,Male,4,Government Employee,4.0,,,ORIGINAL,0
3,2869807,2024-05-31,Western,PAYGO_PHONE,328066.0,FINANCED,200.0,WEEKLY,0.1827,189.0,0.86500,Male,,Business,,,,ORIGINAL,0
4,3674681,2025-06-30,Western,Small Solar,813534.0,FINANCED,150.0,WEEKLY,0.0296,364.0,0.39989,Male,,Business,,,,ASSUMPTION,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486618,3817296,2025-09-30,Southern,Small Solar,670507.0,FINANCED,150.0,DAILY,0.0495,480.0,0.29700,Male,,Labourer,,,,ORIGINAL,0
486619,3817534,2025-09-30,Southern,Small Solar,884340.0,FINANCED,150.0,DAILY,0.0495,480.0,0.29700,Female,,Business,,,,ORIGINAL,0
486620,3817434,2025-09-30,Northern,PAYGO_PORTABLE,246897.0,FINANCED,100.0,TRIDAILY,0.0401,399.0,0.24100,Male,,Farmer,,,,ORIGINAL,0
486621,3817380,2025-09-30,Northern,Large Solar - Generation 2,500619.0,FINANCED,280.0,DAILY,0.0565,640.0,0.41300,Female,,Business,,,,ORIGINAL,0


In [65]:
# Move incomplete rows to a new table. They have already been flagged as incomplete

null_price_products = """CREATE TABLE null_price_products AS
SELECT *
FROM contracts
WHERE product IS NULL"""

cursor.execute(null_price_products)
conn.commit()


check null values in sales_person_id


In [66]:
"""Leave the null values as is as there are no csv files with sales person id to help us inpute the null values
Add missing sales person ID, to the incomplete flag"""

null_sales_person = """select * from contracts where sales_person_id is null 
"""
pd.read_sql(null_sales_person, conn)

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,2808119,2024-03-31,Southern,Small Solar,,FINANCED,150.0,DAILY,0.0515,450.0,0.309,Female,,Business,,,,ORIGINAL,1
1,2802742,2024-03-31,Southern,Large Solar - Generation 1,,CASH,300.0,,1.0000,0.0,,Male,,Farmer,,,,ASSUMPTION,1
2,3399302,2025-02-28,Southern,Large Solar - Generation 1,,CASH,300.0,,1.0000,0.0,,Female,,Farmer,,,,ASSUMPTION,1
3,2895662,2024-05-31,Southern,Large Solar - Generation 1,,FINANCED,300.0,DAILY,0.0836,666.0,0.509,Male,,Business,,,,ORIGINAL,1
4,2999435,2024-07-31,Northern,PAYGO_PHONE,,FINANCED,200.0,DAILY,0.0735,400.0,0.463,Male,,Business,,,,ORIGINAL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9715,3776685,2025-08-31,Northern,PAYGO_PORTABLE,,FINANCED,100.0,DAILY,0.0534,399.0,0.237,Female,,Other,,,,ORIGINAL,1
9716,3792425,2025-09-30,Southern,Large Solar - Generation 1,,CASH,300.0,,1.0000,0.0,,Male,,,,,,ASSUMPTION,1
9717,3805913,2025-09-30,Northern,PAYGO_PHONE,,FINANCED,200.0,DAILY,0.0959,400.0,0.452,Male,,Business,,,,ORIGINAL,1
9718,3808994,2025-09-30,Northern,PAYGO_PORTABLE,,FINANCED,100.0,DAILY,0.0534,399.0,0.237,Female,,,,,,ORIGINAL,1


In [68]:
#save cleaned table as csv
df = pd.read_sql("SELECT * FROM contracts", conn)

df.to_csv("../cleaned_data/cleaned_contracts.csv", index=False)


In [None]:
df = pd.read_csv('../cleaned_data/cleaned_contracts.csv')
df.head()

Unnamed: 0,contractid,sales_month,region,product,sales_person_id,contract_type,price_usd,payment_frequency,perc_deposit,tenor_length,daily_amount_usd,customer_gender,household_size,occupation,actual_household_size,phone_number,name,assumption_flag,incomplete_flag
0,3098268,2024-10-31,Western,Small Solar,154077.0,FINANCED,150.0,WEEKLY,0.051,434.0,0.328,Male,,Teacher,,,,ORIGINAL,0
1,3554125,2025-04-30,Western,Small Solar,878345.0,FINANCED,150.0,WEEKLY,0.0313,434.0,0.335,Male,,Teacher,,,,ORIGINAL,0
2,2792627,2024-03-31,Western,Small Solar,648775.0,FINANCED,150.0,WEEKLY,0.0554,434.0,0.326,Male,4.0,Government Employee,4.0,,,ORIGINAL,0
3,2869807,2024-05-31,Western,PAYGO_PHONE,328066.0,FINANCED,200.0,WEEKLY,0.1827,189.0,0.865,Male,,Business,,,,ORIGINAL,0
4,3674681,2025-06-30,Western,Small Solar,813534.0,FINANCED,150.0,WEEKLY,0.0296,364.0,0.39989,Male,,Business,,,,ASSUMPTION,0


In [70]:
df.isna().sum()

contractid                    0
sales_month                   0
region                        0
product                    2166
sales_person_id            9720
contract_type                 0
price_usd                  2166
payment_frequency         25352
perc_deposit                  0
tenor_length                  0
daily_amount_usd          26808
customer_gender               0
household_size           476259
occupation                59683
actual_household_size    478315
phone_number             484655
name                     484612
assumption_flag               0
incomplete_flag               0
dtype: int64