# Data Exploration

In [2]:
import pandas as pd

# Load an Excel file
df = pd.read_excel("KRA_Sample_Dataset.xlsx", sheet_name="Sheet1")

# Display first few rows
print(df.head())


  taxpayer_id    taxpayer_name    region         sector taxpayer_type  \
0  KE75682867       Nina Quinn   Nairobi         Retail    Individual   
1  KE66755036     Roger Thomas    Nakuru         Retail    Individual   
2  KE66882282        Gina Yang  Machakos         Health    Individual   
3  KE31081788    James Wheeler    Nakuru  Manufacturing     Corporate   
4  KE23315092  Michael Goodwin     Thika    Agriculture    Individual   

   tax_paid  VAT_paid  income_tax_paid  corporate_tax_paid registration_date  \
0     81153     52578            54428              108036        2021-04-30   
1    240128     80025            35870              116525        2020-02-27   
2     80388     33658           105547               85679        2018-06-20   
3     36401      7852            51067              125829        2021-03-22   
4    207903     95986           191657              128216        2018-04-15   

  compliance_status  
0         Compliant  
1         Compliant  
2         Comp

In [4]:
# Display the last few rows
print(df.tail())

    taxpayer_id     taxpayer_name    region        sector taxpayer_type  \
995  KE60676914     Caitlyn Burns     Nyeri     Transport    Individual   
996  KE36704425  Stephen Gonzalez  Kakamega        Retail    Individual   
997  KE71298111   Laurie Valencia     Nyeri   Hospitality    Individual   
998  KE55014515      Anita Farmer  Kakamega  Construction    Individual   
999  KE64869006  Jonathan Carlson      Meru  Construction    Individual   

     tax_paid  VAT_paid  income_tax_paid  corporate_tax_paid  \
995     26253     95108           119269               45436   
996    462382      3204           199701               11569   
997    184635     93769            82764              144101   
998    162462     88265           153956               22490   
999    355808     32131           158255               45875   

    registration_date compliance_status  
995        2017-02-16     Non-Compliant  
996        2023-01-05         Compliant  
997        2018-09-30         Complian

In [3]:
# Geting basic information
print (df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   taxpayer_id         1000 non-null   object        
 1   taxpayer_name       1000 non-null   object        
 2   region              1000 non-null   object        
 3   sector              1000 non-null   object        
 4   taxpayer_type       1000 non-null   object        
 5   tax_paid            1000 non-null   int64         
 6   VAT_paid            1000 non-null   int64         
 7   income_tax_paid     1000 non-null   int64         
 8   corporate_tax_paid  1000 non-null   int64         
 9   registration_date   1000 non-null   datetime64[ns]
 10  compliance_status   1000 non-null   object        
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 86.1+ KB
None


In [5]:
# Statistics summary
print(df.describe())

            tax_paid      VAT_paid  income_tax_paid  corporate_tax_paid  \
count    1000.000000   1000.000000      1000.000000         1000.000000   
mean   238575.992000  49667.325000    101107.975000        75225.576000   
min      5781.000000   1138.000000      2174.000000          287.000000   
25%    114655.000000  23775.250000     49744.250000        39152.750000   
50%    228613.000000  50829.500000    101776.000000        75553.000000   
75%    360742.250000  75041.000000    151738.250000       111578.250000   
max    498948.000000  99826.000000    199971.000000       149888.000000   
std    143964.174834  28633.449716     57481.637667        43017.255093   

                registration_date  
count                        1000  
mean   2020-04-06 08:51:21.600000  
min           2015-02-12 00:00:00  
25%           2017-08-15 06:00:00  
50%           2020-07-16 00:00:00  
75%           2022-11-01 00:00:00  
max           2025-02-11 00:00:00  
std                           NaN  


In [6]:
# Rows and columns count
print(df.shape)

(1000, 11)


In [7]:
# List column names
print(df.columns)

Index(['taxpayer_id', 'taxpayer_name', 'region', 'sector', 'taxpayer_type',
       'tax_paid', 'VAT_paid', 'income_tax_paid', 'corporate_tax_paid',
       'registration_date', 'compliance_status'],
      dtype='object')


In [12]:
# Data types of each column
print (df.dtypes)

taxpayer_id                   object
taxpayer_name                 object
region                        object
sector                        object
taxpayer_type                 object
tax_paid                       int64
VAT_paid                       int64
income_tax_paid                int64
corporate_tax_paid             int64
registration_date     datetime64[ns]
compliance_status             object
dtype: object


In [45]:
# Count of unique values per categorical column
categorical_columns = ["region", "sector", "taxpayer_type", "compliance_status"]
for col in categorical_columns:
    print(f"\nUnique values in {col}:")
    print(df[col].value_counts())


Unique values in region:
region
Mombasa     110
Nairobi     105
Meru        103
Kakamega    101
Thika       100
Nyeri       100
Eldoret     100
Machakos     97
Nakuru       93
Kisumu       91
Name: count, dtype: int64

Unique values in sector:
sector
Education        123
Manufacturing    110
Hospitality      106
Retail           102
Construction     100
Transport         97
IT & Tech         95
Agriculture       90
Finance           89
Health            88
Name: count, dtype: int64

Unique values in taxpayer_type:
taxpayer_type
Individual    604
SME           290
Corporate     106
Name: count, dtype: int64

Unique values in compliance_status:
compliance_status
Compliant        795
Non-Compliant    205
Name: count, dtype: int64


# Selecting and filtering Data

In [15]:
# Select columns
print (df['tax_paid'])
df2 = (df['tax_paid'])

0       81153
1      240128
2       80388
3       36401
4      207903
        ...  
995     26253
996    462382
997    184635
998    162462
999    355808
Name: tax_paid, Length: 1000, dtype: int64


In [20]:
# Select multiple columns
df3 = df[['tax_paid','region']]
print(df3)

     tax_paid    region
0       81153   Nairobi
1      240128    Nakuru
2       80388  Machakos
3       36401    Nakuru
4      207903     Thika
..        ...       ...
995     26253     Nyeri
996    462382  Kakamega
997    184635     Nyeri
998    162462  Kakamega
999    355808      Meru

[1000 rows x 2 columns]


In [21]:
# Select row by index label 
print(df.loc[200])

taxpayer_id                    KE21506305
taxpayer_name                David Powell
region                           Machakos
sector                          IT & Tech
taxpayer_type                  Individual
tax_paid                           233590
VAT_paid                            29365
income_tax_paid                    197575
corporate_tax_paid                  36683
registration_date     2017-05-04 00:00:00
compliance_status               Compliant
Name: 200, dtype: object


In [23]:
# Select row by integer position
print(df.iloc[200])

taxpayer_id                    KE21506305
taxpayer_name                David Powell
region                           Machakos
sector                          IT & Tech
taxpayer_type                  Individual
tax_paid                           233590
VAT_paid                            29365
income_tax_paid                    197575
corporate_tax_paid                  36683
registration_date     2017-05-04 00:00:00
compliance_status               Compliant
Name: 200, dtype: object


# Filtering and Sorting

In [26]:
# taxpayers who paid more than KES 200,000
high_tax_payers = df[df["tax_paid"] > 200000] 
high_tax_payers.head()

Unnamed: 0,taxpayer_id,taxpayer_name,region,sector,taxpayer_type,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date,compliance_status
1,KE66755036,Roger Thomas,Nakuru,Retail,Individual,240128,80025,35870,116525,2020-02-27,Compliant
4,KE23315092,Michael Goodwin,Thika,Agriculture,Individual,207903,95986,191657,128216,2018-04-15,Compliant
6,KE36735830,Joseph Cortez,Nairobi,Agriculture,SME,237601,72947,154679,65901,2023-09-23,Compliant
9,KE98358551,John Crawford,Nairobi,Agriculture,Corporate,256459,52707,144980,90012,2021-11-03,Compliant
12,KE23953367,Jason Tanner,Mombasa,Manufacturing,Individual,454042,76939,63215,102476,2023-03-06,Compliant


In [27]:
high_tax_payers.tail()

Unnamed: 0,taxpayer_id,taxpayer_name,region,sector,taxpayer_type,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date,compliance_status
987,KE62840397,Jacob Hernandez,Nairobi,Construction,Individual,280709,28606,153396,11350,2024-06-23,Non-Compliant
990,KE54132455,Matthew Sellers,Machakos,Education,Individual,358323,80303,149616,97557,2023-08-22,Compliant
992,KE20016389,Rebecca Rodriguez,Thika,Finance,Individual,448945,63522,31063,17576,2019-01-10,Compliant
996,KE36704425,Stephen Gonzalez,Kakamega,Retail,Individual,462382,3204,199701,11569,2023-01-05,Compliant
999,KE64869006,Jonathan Carlson,Meru,Construction,Individual,355808,32131,158255,45875,2022-09-16,Compliant


In [29]:
high_tax_payers.shape

(555, 11)

In [31]:
# Sorting by tax paid
sorted_tax_payers = df.sort_values(by="tax_paid", ascending=False)  
sorted_tax_payers.head()

Unnamed: 0,taxpayer_id,taxpayer_name,region,sector,taxpayer_type,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date,compliance_status
104,KE73203558,David Hernandez,Meru,Health,SME,498948,26556,130389,116545,2022-06-29,Compliant
762,KE20320855,Michael Moore,Kisumu,Manufacturing,Individual,498253,24782,46942,84149,2025-01-25,Compliant
318,KE89688895,Annette Jackson,Kisumu,Construction,Individual,498158,30033,54874,13233,2024-06-24,Compliant
49,KE83506850,Dustin Bradley,Eldoret,IT & Tech,Individual,497289,6906,115000,70014,2020-03-23,Compliant
916,KE90480036,Theresa Hughes,Machakos,Education,Individual,497092,81098,37931,2035,2016-01-28,Compliant


In [32]:
sorted_tax_payers.tail()

Unnamed: 0,taxpayer_id,taxpayer_name,region,sector,taxpayer_type,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date,compliance_status
596,KE11286643,Robin Reed,Kakamega,Education,Individual,8754,47840,183561,78537,2016-09-24,Compliant
411,KE41530803,Jesus Logan,Eldoret,Construction,Individual,8631,20086,127355,105583,2019-07-22,Compliant
768,KE75377052,Monica Randolph,Mombasa,Construction,Individual,8471,36596,10328,114456,2021-03-14,Non-Compliant
405,KE10449395,Charles Stone,Nairobi,Transport,SME,8191,11651,6986,25901,2016-05-20,Compliant
976,KE18351245,Alex Perez,Meru,Health,Individual,5781,34487,119603,35932,2023-10-22,Compliant


# String Operations on Data

In [33]:
# Convert names to uppercase
df["taxpayer_name"] = df["taxpayer_name"].str.upper()  

In [None]:
# Extract numeric part of ID
df_kra["taxpayer_pin"] = df_kra["taxpayer_id"].str.extract(r'KE(\d+)')  

# Changing Data Types

In [None]:
 # Convert to datetime

df["registration_date"] = pd.to_datetime(df["registration_date"]) 

# Handling Missing Values

In [34]:
# Check missing values
missing_values = df.isnull().sum()
missing_values

taxpayer_id           0
taxpayer_name         0
region                0
sector                0
taxpayer_type         0
tax_paid              0
VAT_paid              0
income_tax_paid       0
corporate_tax_paid    0
registration_date     0
compliance_status     0
dtype: int64

In [35]:
# Fill missing tax values with median
df["tax_paid"].fillna(df["tax_paid"].median(), inplace=True)  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["tax_paid"].fillna(df["tax_paid"].median(), inplace=True)


# Handling Duplicates

In [37]:
# Count duplicates
duplicate_count = df.duplicated().sum()  
duplicate_count

0

In [38]:
# Remove duplicates by taxpayer_id
df.drop_duplicates(subset=["taxpayer_id"], keep="first", inplace=True)  

# Aggregating Data

In [55]:
# Total tax per region
tax_per_region = df.groupby("region")["tax_paid"].sum() 
tax_per_region_formatted = tax_per_region.apply(lambda x: f"{x:.2f}")
formatted_output = '\n'.join([f"{region} = {amount}" for region, amount in tax_per_region_formatted.items()])
print("\nTax paid per region:")
print(formatted_output)




Tax paid per region:
Eldoret = 21503741.00
Kakamega = 27815106.00
Kisumu = 23358871.00
Machakos = 23340658.00
Meru = 24895384.00
Mombasa = 26315121.00
Nairobi = 24329114.00
Nakuru = 19975796.00
Nyeri = 23556210.00
Thika = 23485991.00


In [41]:
# Average VAT per sector
avg_vat_per_sector = df.groupby("sector")["VAT_paid"].mean() 
avg_vat_per_sector

sector
Agriculture      56299.588889
Construction     53058.530000
Education        50696.715447
Finance          45212.235955
Health           47485.625000
Hospitality      50703.632075
IT & Tech        52549.778947
Manufacturing    43295.981818
Retail           46996.833333
Transport        50857.113402
Name: VAT_paid, dtype: float64

In [40]:

# Count taxpayers per region
taxpayer_count = df.groupby("region")["taxpayer_id"].count() 
taxpayer_count

region
Eldoret     100
Kakamega    101
Kisumu       91
Machakos     97
Meru        103
Mombasa     110
Nairobi     105
Nakuru       93
Nyeri       100
Thika       100
Name: taxpayer_id, dtype: int64

# Merging Datasets

In [43]:
 # Sample dataset
df_tax_compliance = df[["taxpayer_id", "compliance_status"]].sample(frac=1).reset_index(drop=True) 
df_tax_compliance

Unnamed: 0,taxpayer_id,compliance_status
0,KE53351876,Compliant
1,KE56216125,Non-Compliant
2,KE64474455,Compliant
3,KE69994988,Compliant
4,KE88725274,Non-Compliant
...,...,...
995,KE67631343,Compliant
996,KE12976242,Compliant
997,KE94302960,Compliant
998,KE45912253,Non-Compliant


In [44]:
# Merge datasets
df_merged = pd.merge(df, df_tax_compliance, on="taxpayer_id", how="inner", suffixes=("_orig", "_compliance"))  
df_merged

Unnamed: 0,taxpayer_id,taxpayer_name,region,sector,taxpayer_type,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date,compliance_status_orig,compliance_status_compliance
0,KE75682867,NINA QUINN,Nairobi,Retail,Individual,81153,52578,54428,108036,2021-04-30,Compliant,Compliant
1,KE66755036,ROGER THOMAS,Nakuru,Retail,Individual,240128,80025,35870,116525,2020-02-27,Compliant,Compliant
2,KE66882282,GINA YANG,Machakos,Health,Individual,80388,33658,105547,85679,2018-06-20,Compliant,Compliant
3,KE31081788,JAMES WHEELER,Nakuru,Manufacturing,Corporate,36401,7852,51067,125829,2021-03-22,Compliant,Compliant
4,KE23315092,MICHAEL GOODWIN,Thika,Agriculture,Individual,207903,95986,191657,128216,2018-04-15,Compliant,Compliant
...,...,...,...,...,...,...,...,...,...,...,...,...
995,KE60676914,CAITLYN BURNS,Nyeri,Transport,Individual,26253,95108,119269,45436,2017-02-16,Non-Compliant,Non-Compliant
996,KE36704425,STEPHEN GONZALEZ,Kakamega,Retail,Individual,462382,3204,199701,11569,2023-01-05,Compliant,Compliant
997,KE71298111,LAURIE VALENCIA,Nyeri,Hospitality,Individual,184635,93769,82764,144101,2018-09-30,Compliant,Compliant
998,KE55014515,ANITA FARMER,Kakamega,Construction,Individual,162462,88265,153956,22490,2024-05-29,Compliant,Compliant


# Descriptive Statistics

In [42]:
summary_statistics = df.describe()
summary_statistics

Unnamed: 0,tax_paid,VAT_paid,income_tax_paid,corporate_tax_paid,registration_date
count,1000.0,1000.0,1000.0,1000.0,1000
mean,238575.992,49667.325,101107.975,75225.576,2020-04-06 08:51:21.600000
min,5781.0,1138.0,2174.0,287.0,2015-02-12 00:00:00
25%,114655.0,23775.25,49744.25,39152.75,2017-08-15 06:00:00
50%,228613.0,50829.5,101776.0,75553.0,2020-07-16 00:00:00
75%,360742.25,75041.0,151738.25,111578.25,2022-11-01 00:00:00
max,498948.0,99826.0,199971.0,149888.0,2025-02-11 00:00:00
std,143964.174834,28633.449716,57481.637667,43017.255093,


In [47]:
# Correlation matrix of numerical features
continuous_columns = df.select_dtypes(include=['number']).columns.tolist()
print("\nCorrelation matrix of continuous variables:")
print(df[continuous_columns].corr())


Correlation matrix of continuous variables:
                    tax_paid  VAT_paid  income_tax_paid  corporate_tax_paid
tax_paid            1.000000  0.017675        -0.023196            0.024190
VAT_paid            0.017675  1.000000        -0.007595            0.002230
income_tax_paid    -0.023196 -0.007595         1.000000           -0.023621
corporate_tax_paid  0.024190  0.002230        -0.023621            1.000000


# Exercise

## Exercise 1

Load the dataset into a Pandas DataFrame and display the first 5 rows.

In [4]:
import pandas as pd

df = pd.read_excel('KRA_Class_Dataset.xlsx')

print(df.head())

  taxpayer_id  taxpayer_name    region         sector taxpayer_type  tax_paid  \
0  KE75682867  Brandon Smith   Nairobi         Retail    Individual     81153   
1  KE66755036   Megan Patton    Nakuru         Retail    Individual    240128   
2  KE66882282  James Douglas  Machakos         Health    Individual     80388   
3  KE31081788    Haley Moore    Nakuru  Manufacturing     Corporate     36401   
4  KE23315092      Dean Kemp     Thika    Agriculture    Individual    207903   

   VAT_paid  income_tax_paid  corporate_tax_paid registration_date  \
0     52578            54428              108036        2022-06-16   
1     80025            35870              116525        2024-02-02   
2     33658           105547               85679        2017-07-12   
3      7852            51067              125829        2023-07-06   
4     95986           191657              128216        2015-04-14   

  compliance_status  
0         Compliant  
1         Compliant  
2         Compliant  
3   

## Exercise 2

Filter taxpayers who have paid more than 300,000 KES in tax.

In [5]:
high_tax_payers = df[df['tax_paid'] > 300000]

print(high_tax_payers.head())

   taxpayer_id     taxpayer_name    region         sector taxpayer_type  \
12  KE23953367    Andrea Johnson   Mombasa  Manufacturing    Individual   
13  KE95652971   Matthew Elliott  Kakamega      Transport    Individual   
16  KE86893497  Curtis Hernandez   Nairobi        Finance    Individual   
17  KE40349564       Jodi Torres     Thika      IT & Tech           SME   
18  KE52860080     Kevin Jenkins     Thika    Agriculture    Individual   

    tax_paid  VAT_paid  income_tax_paid  corporate_tax_paid registration_date  \
12    454042     76939            63215              102476        2017-05-29   
13    434667     58828           188907              120721        2018-10-09   
16    303185     22384           110048              131040        2017-12-30   
17    356064     22038           171530               35503        2019-07-02   
18    332825     41846           179241               75403        2018-03-25   

   compliance_status  
12         Compliant  
13         Compl

## Exercise 3

Sort the dataset by the amount of tax paid in descending order.

In [6]:
sorted_df = df.sort_values(by='tax_paid', ascending=False)

print(sorted_df.head())

    taxpayer_id  taxpayer_name    region         sector taxpayer_type  \
104  KE73203558   Rachel Brown      Meru         Health           SME   
762  KE20320855   Tricia Smith    Kisumu  Manufacturing    Individual   
318  KE89688895  Nicole Martin    Kisumu   Construction    Individual   
49   KE83506850       Amy Haas   Eldoret      IT & Tech    Individual   
916  KE90480036  Nicole Hughes  Machakos      Education    Individual   

     tax_paid  VAT_paid  income_tax_paid  corporate_tax_paid  \
104    498948     26556           130389              116545   
762    498253     24782            46942               84149   
318    498158     30033            54874               13233   
49     497289      6906           115000               70014   
916    497092     81098            37931                2035   

    registration_date compliance_status  
104        2023-11-08         Compliant  
762        2015-10-08         Compliant  
318        2021-04-24         Compliant  
49      

## Exercise 4

Convert all taxpayer names to uppercase

In [7]:
df['taxpayer_name'] = df['taxpayer_name'].str.upper()

print(df.head())

  taxpayer_id  taxpayer_name    region         sector taxpayer_type  tax_paid  \
0  KE75682867  BRANDON SMITH   Nairobi         Retail    Individual     81153   
1  KE66755036   MEGAN PATTON    Nakuru         Retail    Individual    240128   
2  KE66882282  JAMES DOUGLAS  Machakos         Health    Individual     80388   
3  KE31081788    HALEY MOORE    Nakuru  Manufacturing     Corporate     36401   
4  KE23315092      DEAN KEMP     Thika    Agriculture    Individual    207903   

   VAT_paid  income_tax_paid  corporate_tax_paid registration_date  \
0     52578            54428              108036        2022-06-16   
1     80025            35870              116525        2024-02-02   
2     33658           105547               85679        2017-07-12   
3      7852            51067              125829        2023-07-06   
4     95986           191657              128216        2015-04-14   

  compliance_status  
0         Compliant  
1         Compliant  
2         Compliant  
3   

## Exercise 5

Extract the numeric portion from the taxpayer ID (e.g., 'KE12345678' -> '12345678').

In [8]:
df['taxpayer_pin'] = df['taxpayer_id'].str.extract(r'KE(\d+)')

print(df.head())

  taxpayer_id  taxpayer_name    region         sector taxpayer_type  tax_paid  \
0  KE75682867  BRANDON SMITH   Nairobi         Retail    Individual     81153   
1  KE66755036   MEGAN PATTON    Nakuru         Retail    Individual    240128   
2  KE66882282  JAMES DOUGLAS  Machakos         Health    Individual     80388   
3  KE31081788    HALEY MOORE    Nakuru  Manufacturing     Corporate     36401   
4  KE23315092      DEAN KEMP     Thika    Agriculture    Individual    207903   

   VAT_paid  income_tax_paid  corporate_tax_paid registration_date  \
0     52578            54428              108036        2022-06-16   
1     80025            35870              116525        2024-02-02   
2     33658           105547               85679        2017-07-12   
3      7852            51067              125829        2023-07-06   
4     95986           191657              128216        2015-04-14   

  compliance_status taxpayer_pin  
0         Compliant     75682867  
1         Compliant   

## Exercise 6

Count the number of taxpayers per region

In [9]:
taxpayer_count = df.groupby('region')['taxpayer_id'].count()

print(taxpayer_count)

region
Eldoret     100
Kakamega    101
Kisumu       91
Machakos     97
Meru        103
Mombasa     110
Nairobi     105
Nakuru       93
Nyeri       100
Thika       100
Name: taxpayer_id, dtype: int64


## Exercise 7

Find the total tax collected per sector.

In [10]:
tax_per_sector = df.groupby('sector')['tax_paid'].sum()

print(tax_per_sector)

sector
Agriculture      24157013
Construction     24143047
Education        31307726
Finance          19756769
Health           21340778
Hospitality      23485686
IT & Tech        21554273
Manufacturing    27938699
Retail           22845099
Transport        22046902
Name: tax_paid, dtype: int64


## Exercise 8

Remove any duplicate taxpayer records based on the taxpayer ID

In [12]:
df.drop_duplicates(subset=['taxpayer_id'], keep='first', inplace=True)
print(df.shape)

(1000, 12)


## Exercise 9

Convert the registration date column into a proper datetime format.

In [13]:
df['registration_date'] = pd.to_datetime(df['registration_date'])

print(df.dtypes)

taxpayer_id                   object
taxpayer_name                 object
region                        object
sector                        object
taxpayer_type                 object
tax_paid                       int64
VAT_paid                       int64
income_tax_paid                int64
corporate_tax_paid             int64
registration_date     datetime64[ns]
compliance_status             object
taxpayer_pin                  object
dtype: object


# Exercise 10

Merge this dataset with another dataset containing taxpayer compliance status

In [14]:
df_merged = pd.merge(df, KRA_Taxpayer_Compliance, on='taxpayer_id', how='inner', suffixes=('_original', '_compliance'))

print(df_merged.head())

NameError: name 'KRA_Taxpayer_Compliance' is not defined

# Exercise 11

Write a code that counts the unique values per categorical column

In [15]:
categorical_unique_counts = {col: df[col].nunique() for col in categorical_columns}
print(categorical_unique_counts)

NameError: name 'categorical_columns' is not defined

# Exercise 12

Compute the correlation matrix of continuous variables

In [16]:
print(df[continuous_columns].corr())

NameError: name 'continuous_columns' is not defined