# DATA QUALITY

Goal: Assess the health of the ingested data.
Tasks:

Inspect dimensions (df.shape, df.info(), df.describe())

Missing values report (df.isna().sum())

Check duplicates

Validate column types (e.g., dates, integers, floats)

Outlier detection (basic stats, boxplots)

High-level summary, written up as a "data health report"

## marketing_campaign_2024.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data: read the first CSV (the 2024 campaign file) into df1.
# The path is relative, so it resolves against the kernel's working directory.
raw_2024_csv = "../data/raw/marketing_campaign_2024.csv"
df1 = pd.read_csv(raw_2024_csv)

In [3]:
# Preview the first five rows of df1 as the cell output.
df1.head(n=5)

Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,region,impressions,clicks,conversions,spend_usd,revenue_usd,target_audience,product_category,device,year
0,2024_0001,Campaign_2024_0001,2024-02-06,2024-10-29,Print,North America,98331,7575,5651,5388.54,16034.82,Youth,Electronics,Tablet,2024
1,2024_0002,Campaign_2024_0002,2024-01-31,2024-04-13,Social,Asia,7434,2848,542,25115.44,29488.38,Youth,Electronics,Tablet,2024
2,2024_0003,Campaign_2024_0003,2024-01-13,2024-07-17,Email,Asia,93676,67404,32708,47090.84,65441.18,Adults,Electronics,Tablet,2024
3,2024_0004,Campaign_2024_0004,2024-01-02,2024-03-03,Email,Europe,46596,34586,17099,6856.25,17325.38,Adults,Services,Mobile,2024
4,2024_0005,Campaign_2024_0005,2024-04-24,2024-11-24,Email,Africa,61629,1750,584,1888.24,2351.42,Adults,Services,Mobile,2024


In [4]:
# Inspect df1: shape, container type and column labels, one per line.
# df1.columns is the same Index object that df1.keys() returns.
print(df1.shape, type(df1), df1.columns, sep="\n")

# Per-column non-null counts and dtypes (info() prints directly to stdout).
df1.info()

# Last expression of the cell: render the first five rows.
df1.head(n=5)


(500, 15)
<class 'pandas.core.frame.DataFrame'>
Index(['campaign_id', 'campaign_name', 'start_date', 'end_date', 'channel',
       'region', 'impressions', 'clicks', 'conversions', 'spend_usd',
       'revenue_usd', 'target_audience', 'product_category', 'device', 'year'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   campaign_id       500 non-null    object 
 1   campaign_name     500 non-null    object 
 2   start_date        500 non-null    object 
 3   end_date          500 non-null    object 
 4   channel           500 non-null    object 
 5   region            500 non-null    object 
 6   impressions       500 non-null    int64  
 7   clicks            500 non-null    int64  
 8   conversions       500 non-null    int64  
 9   spend_usd         500 non-null    float64
 10  revenue_usd       500 non-null

Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,region,impressions,clicks,conversions,spend_usd,revenue_usd,target_audience,product_category,device,year
0,2024_0001,Campaign_2024_0001,2024-02-06,2024-10-29,Print,North America,98331,7575,5651,5388.54,16034.82,Youth,Electronics,Tablet,2024
1,2024_0002,Campaign_2024_0002,2024-01-31,2024-04-13,Social,Asia,7434,2848,542,25115.44,29488.38,Youth,Electronics,Tablet,2024
2,2024_0003,Campaign_2024_0003,2024-01-13,2024-07-17,Email,Asia,93676,67404,32708,47090.84,65441.18,Adults,Electronics,Tablet,2024
3,2024_0004,Campaign_2024_0004,2024-01-02,2024-03-03,Email,Europe,46596,34586,17099,6856.25,17325.38,Adults,Services,Mobile,2024
4,2024_0005,Campaign_2024_0005,2024-04-24,2024-11-24,Email,Africa,61629,1750,584,1888.24,2351.42,Adults,Services,Mobile,2024


In [5]:
# Second inspection of df1: show the last five rows of the table.
df1.tail(n=5)

Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,region,impressions,clicks,conversions,spend_usd,revenue_usd,target_audience,product_category,device,year
495,2024_0496,Campaign_2024_0496,2024-05-04,2024-10-30,Email,North America,66276,40646,23317,37606.83,74183.96,Adults,Clothing,Desktop,2024
496,2024_0497,Campaign_2024_0497,2024-03-26,2024-07-06,Email,South America,85645,74278,15456,1874.71,3417.77,Adults,Food,Desktop,2024
497,2024_0498,Campaign_2024_0498,2024-01-01,2024-12-07,Display,Africa,57922,50631,6746,48765.11,71746.81,Seniors,Services,Desktop,2024
498,2024_0499,Campaign_2024_0499,2024-02-01,2024-07-24,Print,Europe,91714,37018,32775,35245.95,53831.39,Youth,Clothing,Tablet,2024
499,2024_0500,Campaign_2024_0500,2024-01-18,2024-08-29,Social,Europe,45040,29153,22977,18102.83,32817.6,Youth,Software,Desktop,2024


In [6]:
# Summary statistics for df1: count, mean, std, min, quartiles and max.
# The quartiles are spelled out here; they are the values describe() uses by default.
df1.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,impressions,clicks,conversions,spend_usd,revenue_usd,year
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,49803.16,24724.252,12884.418,24966.87452,50549.28508,2024.0
std,27958.139477,22225.181931,15254.271725,13912.457387,32104.110938,0.0
min,1175.0,79.0,9.0,1007.23,1404.13,2024.0
25%,25152.5,6045.0,1722.75,12453.2875,23504.3625,2024.0
50%,48506.5,18345.0,6994.0,26014.21,46331.395,2024.0
75%,73151.0,37203.75,19132.0,36339.0375,72874.9575,2024.0
max,99725.0,90122.0,86432.0,49800.16,142682.22,2024.0


In [7]:
# Missing-values report: number of null entries in each column of df1.
# isna() is the method that isnull() aliases, so the result is the same.
df1.isna().sum()

campaign_id         0
campaign_name       0
start_date          0
end_date            0
channel             0
region              0
impressions         0
clicks              0
conversions         0
spend_usd           0
revenue_usd         0
target_audience     0
product_category    0
device              0
year                0
dtype: int64

In [8]:
# Count fully duplicated rows in df1: each repeat of an earlier row is
# flagged True (keep="first" is the default), and the flags are summed.
df1.duplicated(keep="first").sum()

np.int64(0)

In [9]:
# Number of distinct values in each column of df1 (axis=0 is the default:
# one count per column).
df1.nunique(axis=0)

campaign_id         500
campaign_name       500
start_date          169
end_date            248
channel               6
region                5
impressions         500
clicks              499
conversions         488
spend_usd           500
revenue_usd         500
target_audience       3
product_category      5
device                3
year                  1
dtype: int64

In [10]:
# Confirm that campaign_id holds no repeated values, i.e. that every row
# of df1 has its own identifier.
df1["campaign_id"].is_unique

True