## Importación de librerías

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Primera carga de datos y visualización previa

In [2]:
# cargar el archivo marketingcampaigns_corrected.csv (datos corregidos)
df = pd.read_csv('../data/marketingcampaigns_corrected.csv')
df

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.40,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.10
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.30,social media,B2B,promotion,0.81,47511.35
...,...,...,...,...,...,...,...,...,...,...
1032,No revenue campaign,2023-02-01,2023-08-01,20000,0.30,social media,B2B,organic,0.50,
1033,Random mess,2023-06-06,,100000,,podcast,,referral,,300000.00
1034,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.20,50000.00
1035,Overlapping dates,2023-03-01,2022-12-31,60000,0.60,webinar,B2B,paid,0.70,90000.00


In [3]:
# información preliminar de los datos
'''
campaign_name -----> nombre_de_campaña
start_date --------> fecha_de_inicio
end_date ----------> fecha_de_finalización
budget ------------> presupuesto objetivo
roi ---------------> ROI "retorno de la inversión" (Return On Investment)
type --------------> tipo
target_audience ---> público_objetivo
channel -----------> canal
conversion_rate ---> tasa_de_conversión
revenue -----------> ingresos
'''
df.info()
# OBSERVACIONES:
# start_date y end_date son de tipo object, se deben convertir a datetime
# conversion_rate es de tipo object, se debe convertir a float
# revenue es de tipo object, se debe convertir a float

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1037 non-null   object 
 1   start_date       1036 non-null   object 
 2   end_date         1035 non-null   object 
 3   budget           1034 non-null   object 
 4   roi              1033 non-null   float64
 5   type             1036 non-null   object 
 6   target_audience  1035 non-null   object 
 7   channel          1036 non-null   object 
 8   conversion_rate  1033 non-null   float64
 9   revenue          1034 non-null   float64
dtypes: float64(3), object(7)
memory usage: 81.1+ KB


In [4]:
df.describe()
# OBSERVACIONES:
# hay al menos un registro con valores negativos de roi y revenue
# hay valores demasiado separados (desviación estándar muy grande) en budget

Unnamed: 0,roi,conversion_rate,revenue
count,1033.0,1033.0,1034.0
mean,0.533553,0.541946,509901.685667
std,0.26151,0.267271,287916.037107
min,-0.2,0.0,-15000.0
25%,0.31,0.3,266545.58
50%,0.53,0.55,516907.835
75%,0.76,0.77,764547.97
max,0.99,1.5,999712.49


In [5]:
print(df.isnull().sum())
print(f"Suma de maximo de registros con nulos: {df.isnull().sum().sum()}")
# OBSERVACIONES:
# hay valores nulos a corregir en todas las columnas

campaign_name      0
start_date         1
end_date           2
budget             3
roi                4
type               1
target_audience    2
channel            1
conversion_rate    4
revenue            3
dtype: int64
Suma de maximo de registros con nulos: 21


In [6]:
# convertir los datos de las columnas 'budget', 'roi', 'conversion_rate' y 'revenue' a numéricos, y reemplazar valores no numericos por NaN
for column in ['budget', 'roi', 'conversion_rate', 'revenue']:
  df[column] = pd.to_numeric(df[column], errors='coerce')
print(df.isnull().sum())

campaign_name      0
start_date         1
end_date           2
budget             4
roi                4
type               1
target_audience    2
channel            1
conversion_rate    4
revenue            3
dtype: int64


vemos que hay un registro adicional en el campo 'budget' que no es un número, por lo que se debe convertir a NaN para poder trabajar con el campo.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1037 non-null   object 
 1   start_date       1036 non-null   object 
 2   end_date         1035 non-null   object 
 3   budget           1033 non-null   float64
 4   roi              1033 non-null   float64
 5   type             1036 non-null   object 
 6   target_audience  1035 non-null   object 
 7   channel          1036 non-null   object 
 8   conversion_rate  1033 non-null   float64
 9   revenue          1034 non-null   float64
dtypes: float64(4), object(6)
memory usage: 81.1+ KB


## Primera limpieza de datos
Se eliminarán los registros que contengan valores de nulos que, por su importancia o por su cantidad en el registro, hagan que el registro sea inútil para el análisis.

In [8]:
# nuevo campo que indica la cantidad de nulos en cada registro (este campo se eliminará más adelante)
df['num_nulls'] = df.isnull().sum(axis=1)

# Ordenar el DataFrame por la columna de número de nulos en orden descendente
df_sorted_by_nulls = df.sort_values(by='num_nulls', ascending=False)

# Mostrar los registros con nulos, ordenados por número de nulos en cada registro
df_sorted_by_nulls[df_sorted_by_nulls['num_nulls'] > 0]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue,num_nulls
1029,Null-heavy campaign,2023-01-01,,,,B2B,social media,,,,6
1033,Random mess,2023-06-06,,100000.0,,podcast,,referral,,300000.0,4
1026,Cloud-based scalable solution,,2023-12-31,50000.0,,event,B2C,paid,0.3,120000.0,2
1034,Invalid budget,2022-12-01,2023-06-01,,,email,B2C,promotion,0.2,50000.0,2
1008,NEW CAMPAIGN - Missing Budget,2023-10-01,2024-01-15,,0.25,email,B2B,organic,,45000.0,2
1027,Broken-date campaign,2023-13-01,2024-01-01,25000.0,0.45,email,B2B,organic,,87500.0,1
1028,Negative ROI test,2022-10-10,2023-05-05,-10000.0,-0.2,podcast,B2C,referral,0.1,,1
1005,Upgradable transitional data-warehouse,2023-06-29,2023-12-13,,0.59,social media,B2C,referral,0.67,558302.11,1
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000.0,0.25,email,,paid,0.4,45000.0,1
1032,No revenue campaign,2023-02-01,2023-08-01,20000.0,0.3,social media,B2B,organic,0.5,,1


In [9]:
# eliminar registros con un número de nulos por registro mayor a 1
df = df[df['num_nulls'] <= 1]
df[df['num_nulls'] > 0]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue,num_nulls
1005,Upgradable transitional data-warehouse,2023-06-29,2023-12-13,,0.59,social media,B2C,referral,0.67,558302.11,1
1007,User-friendly client-driven service-desk,2023-01-06,2023-12-11,36800.58,0.4,,B2C,promotion,0.52,206241.46,1
1027,Broken-date campaign,2023-13-01,2024-01-01,25000.0,0.45,email,B2B,organic,,87500.0,1
1028,Negative ROI test,2022-10-10,2023-05-05,-10000.0,-0.2,podcast,B2C,referral,0.1,,1
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000.0,0.25,email,,paid,0.4,45000.0,1
1032,No revenue campaign,2023-02-01,2023-08-01,20000.0,0.3,social media,B2B,organic,0.5,,1


> Eliminados 5 registros (5/1037 --> 0.48%) por demasiados nulos
>
> Eliminados hasta el momento 5 registros (5/1037 --> 0.48%)

In [10]:
df.drop(columns='num_nulls', inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns='num_nulls', inplace=True)


Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.4,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.1
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.3,social media,B2B,promotion,0.81,47511.35


In [11]:
# dividir registros por tipo de dato object, y hallar la cantidad de valores únicos en cada columna para ver si hay errores
print(df.groupby('type').size())
print(f'Suma de registros: {df.groupby('type').size().sum()}\n')
print(df.groupby('channel').size())
print(f'Suma de registros: {df.groupby('channel').size().sum()}\n')
print(df.groupby('target_audience').size())
print(f'Suma de registros: {df.groupby('target_audience').size().sum()}\n')
print(f'Suma de registros totales: {df['campaign_name'].count()}')

# OBSERVACIONES:
# hay un valor incorrecto (referal) en la columna channel --> cambiar por referral
# el resto de valores son consistentes (no se detectan otros errores de escritura)

type
email           288
podcast         233
social media    242
webinar         268
dtype: int64
Suma de registros: 1031

channel
organic      249
paid         243
promotion    281
referal        1
referral     258
dtype: int64
Suma de registros: 1032

target_audience
B2B    530
B2C    501
dtype: int64
Suma de registros: 1031

Suma de registros totales: 1032


## Corrección de errores a nivel de datos
1. ESCRITURA DE DATOS
* hay un valor incorrecto (referal) en la columna channel --> cambiar por referral

2. NULOS
* hay valores nulos a corregir en las columnas: `budget`, `type`, `target_audience`, `conversion_rate` y `revenue`

3. TYPADO
* start_date y end_date son de tipo object, se deben convertir a datetime
* conversion_rate es de tipo object, se debe convertir a float
* revenue es de tipo object, se debe convertir a float

4. DISPERSIÓN
* hay al menos un registro con valores negativos de roi y revenue
* hay valores demasiado separados (desviación estándar muy grande) en budget

### 1. ESCRITURA DE DATOS

In [12]:
# cambiar el valor referal por referral en la columna channel
df.loc[df['channel'] == 'referal', 'channel'] = 'referral'
print(df.groupby('channel').size())
print(f'Suma de registros: {df.groupby('channel').size().sum()}\n')

channel
organic      249
paid         243
promotion    281
referral     259
dtype: int64
Suma de registros: 1032



### 2. NULOS Y DISPERSIÓN DE DATOS

In [13]:
# suma de valores nulos en cada columna
df.isnull().sum()

campaign_name      0
start_date         0
end_date           0
budget             1
roi                0
type               1
target_audience    1
channel            0
conversion_rate    1
revenue            2
dtype: int64

#### 2.1. Columna `type`

In [14]:
# registro con valores nulos en la columna type
df[df['type'].isnull()]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1007,User-friendly client-driven service-desk,2023-01-06,2023-12-11,36800.58,0.4,,B2C,promotion,0.52,206241.46


In [15]:
# contar registros con los valores channel=promotion y target_audence=B2C, agrupados por type
df[(df['channel'] == 'promotion') & (df['target_audience'] == 'B2C')].groupby('type').size()

type
email           37
podcast         30
social media    23
webinar         39
dtype: int64

In [16]:
# cambiar el valor nulo en la columna type por 'others' (por el nombre de campaign_name, se puede catalogar como 'others')
df.loc[df['type'].isnull(), 'type'] = 'others'
df.isnull().sum()

campaign_name      0
start_date         0
end_date           0
budget             1
roi                0
type               0
target_audience    1
channel            0
conversion_rate    1
revenue            2
dtype: int64

> Eliminados 0 registros por type nulos
>
> Eliminados hasta el momento 5 registros (5/1037 --> 0.48%)

#### 2.2. Columna `budget`
Correción de nulos en la columna `budget`.

In [17]:
# registro con valores nulos en la columna budget
df[df['budget'].isnull()]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1005,Upgradable transitional data-warehouse,2023-06-29,2023-12-13,,0.59,social media,B2C,referral,0.67,558302.11


In [18]:
# print(f'Media de la columna \'budget\': {df['budget'].mean()}')
# print(f'\nMedia de \'budget\': {df[(df['target_audience'] == 'B2C') &
#                                     (df['type'] == 'social media')]['budget'].mean()}\nfiltros\n\ttype: social media\n\ttarget_audience: B2C\t')
# print(f'\nMedia de \'budget\': {df[(df['target_audience'] == 'B2C') &
#                                     (df['type'] == 'social media') &
#                                       (df['channel'] == 'referral')]['budget'].mean()}\nfiltros\n\ttype: social media\n\ttarget_audience: B2C\n\tchannel: referral\t')

df[(df['target_audience'] == 'B2C') & (df['type'] == 'social media') & (df['channel'] == 'referral')].sort_values(by='revenue', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
373,Configurable client-driven migration,2022-12-09,2024-03-10,21172.32,0.16,social media,B2C,referral,0.32,972694.3
704,Innovative intangible complexity,2022-09-03,2024-03-04,19992.62,0.46,social media,B2C,referral,0.62,910901.4
466,Reduced content-based Graphic Interface,2022-10-26,2024-03-03,16945.29,0.72,social media,B2C,referral,0.76,884669.86
89,Stand-alone bandwidth-monitored capability,2022-10-10,2024-03-29,89383.19,0.83,social media,B2C,referral,0.77,828963.15
424,Mandatory 3rdgeneration matrices,2023-07-02,2023-10-28,29709.16,0.86,social media,B2C,referral,0.12,811850.37
169,Ergonomic next generation hub,2022-12-18,2024-01-27,27079.85,0.32,social media,B2C,referral,0.3,784636.2
976,User-centric clear-thinking interface,2022-08-28,2024-05-27,29206.8,0.6,social media,B2C,referral,0.34,628376.14
980,Reverse-engineered 4thgeneration analyzer,2022-12-24,2023-11-03,35539.13,0.53,social media,B2C,referral,0.68,579364.9
1023,Intuitive responsive support,2022-11-25,2024-04-04,1816.22,0.81,social media,B2C,referral,0.85,563280.3
9,Intuitive responsive support,2022-11-25,2024-04-04,1816.22,0.81,social media,B2C,referral,0.85,563280.3


comprobado que el valor faltante en la columna budget corresponde con un registro repetido, se elimina el registro.

In [19]:
# eliminación de registros con valores nulos en la columna budget
df = df.dropna(subset=['budget'])

In [20]:
df.isnull().sum()

campaign_name      0
start_date         0
end_date           0
budget             0
roi                0
type               0
target_audience    1
channel            0
conversion_rate    1
revenue            2
dtype: int64

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1031 non-null   object 
 1   start_date       1031 non-null   object 
 2   end_date         1031 non-null   object 
 3   budget           1031 non-null   float64
 4   roi              1031 non-null   float64
 5   type             1031 non-null   object 
 6   target_audience  1030 non-null   object 
 7   channel          1031 non-null   object 
 8   conversion_rate  1030 non-null   float64
 9   revenue          1029 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.6+ KB


**Eliminación de outliers en la columna `budget`**

Aprovecharemos para eliminar los outliers en la columna `budget` que se encuentran a más de 3 desviaciones estándar de la media, así como los valores negativos.

In [22]:
# eliminar los outliers en la columna `budget` que se encuentran a más de 3 desviaciones estándar de la media
df = df[np.abs(df['budget'] - df['budget'].mean()) <= (3 * df['budget'].std())]
df.sort_values(by='budget', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
53,User-centric holistic firmware,2022-08-06,2024-03-12,99957.15,0.63,webinar,B2C,referral,0.62,430498.24
330,Persistent radical approach,2022-10-23,2023-08-15,99891.35,0.80,email,B2C,promotion,0.91,8272.50
28,Seamless clear-thinking product,2022-11-18,2023-10-05,99838.63,0.63,email,B2B,organic,0.18,14073.59
710,Horizontal asymmetric contingency,2022-09-16,2024-05-08,99714.19,0.23,email,B2C,referral,0.70,7622.28
935,Innovative logistical interface,2023-03-05,2023-10-25,99579.39,0.10,social media,B2C,organic,0.28,758121.44
...,...,...,...,...,...,...,...,...,...,...
835,Enhanced optimizing time-frame,2022-11-06,2024-04-02,1378.61,0.15,email,B2B,organic,0.66,862862.99
38,Vision-oriented 4thgeneration conglomeration,2023-01-18,2024-04-22,1309.17,0.91,podcast,B2C,organic,0.27,273231.88
887,Synchronized national system engine,2023-01-18,2023-11-11,1223.82,0.71,social media,B2C,paid,0.30,768567.70
252,Persevering zero administration interface,2023-03-27,2023-10-25,1052.57,0.37,email,B2C,promotion,0.44,932323.35


In [23]:
# eliminar los registros con valores negativos en las columnas `budget` y `roi`
df = df[(df['budget'] >= 0) & (df['roi'] >= 0)]
df.sort_values(by='budget', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
53,User-centric holistic firmware,2022-08-06,2024-03-12,99957.15,0.63,webinar,B2C,referral,0.62,430498.24
330,Persistent radical approach,2022-10-23,2023-08-15,99891.35,0.80,email,B2C,promotion,0.91,8272.50
28,Seamless clear-thinking product,2022-11-18,2023-10-05,99838.63,0.63,email,B2B,organic,0.18,14073.59
710,Horizontal asymmetric contingency,2022-09-16,2024-05-08,99714.19,0.23,email,B2C,referral,0.70,7622.28
935,Innovative logistical interface,2023-03-05,2023-10-25,99579.39,0.10,social media,B2C,organic,0.28,758121.44
...,...,...,...,...,...,...,...,...,...,...
766,Multi-tiered context-sensitive hub,2023-06-20,2024-05-24,1380.68,0.46,podcast,B2C,promotion,0.84,159621.36
835,Enhanced optimizing time-frame,2022-11-06,2024-04-02,1378.61,0.15,email,B2B,organic,0.66,862862.99
38,Vision-oriented 4thgeneration conglomeration,2023-01-18,2024-04-22,1309.17,0.91,podcast,B2C,organic,0.27,273231.88
887,Synchronized national system engine,2023-01-18,2023-11-11,1223.82,0.71,social media,B2C,paid,0.30,768567.70


> Eliminados 3 registros (3/1037 --> 0.29%) por budget conflictivos
>
> Eliminados hasta el momento 8 registros (8/1037 --> 0.77%)

#### 2.3. Columna `target_audience`
Correción de nulos en la columna `target_audience`.

In [24]:
# registro con valores nulos en la columna type
df[df['target_audience'].isnull()]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000.0,0.25,email,,paid,0.4,45000.0


In [25]:
df[(df['type'] == 'email') & (df['channel'] == 'paid')].sort_values(by='revenue', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
108,User-centric responsive software,2022-09-24,2023-08-04,55105.95,0.39,email,B2B,paid,0.65,980593.34
257,Self-enabling reciprocal algorithm,2022-11-01,2024-07-29,8645.67,0.12,email,B2B,paid,0.37,961551.85
635,Intuitive didactic interface,2022-09-06,2024-07-17,41350.29,0.82,email,B2C,paid,0.18,953178.97
32,Seamless zero-defect portal,2022-08-12,2024-07-19,71277.10,0.21,email,B2B,paid,0.56,932824.61
734,Grass-roots client-server middleware,2023-01-04,2023-11-10,41458.15,0.97,email,B2B,paid,0.21,924885.11
...,...,...,...,...,...,...,...,...,...,...
681,Organic actuating firmware,2023-07-07,2023-10-03,38665.10,0.14,email,B2B,paid,0.29,74042.78
458,Customizable global hierarchy,2023-04-26,2023-10-12,86716.38,0.74,email,B2B,paid,0.95,69360.86
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000.00,0.25,email,,paid,0.40,45000.00
862,Inverse system-worthy utilization,2022-08-25,2023-08-15,17398.10,0.22,email,B2C,paid,0.60,36397.45


In [26]:
print(f'Moda target_audience: {df['target_audience'].mode()[0]}\n')
print(f'Moda target_audience: {df[(df['type'] == 'email') & (df['channel'] == 'paid')]['target_audience'].mode()[0]}\nFiltros\n\ttype: email\n\tchannel: paid\n')
print(f'Moda target_audience: {df[df['type'] == 'email']['target_audience'].mode()[0]}\nFiltros\n\ttype: email\n')
print(f'Moda target_audience: {df[df['channel'] == 'paid']['target_audience'].mode()[0]}\nFiltros\n\tchannel: paid\n')

Moda target_audience: B2B

Moda target_audience: B2B
Filtros
	type: email
	channel: paid

Moda target_audience: B2B
Filtros
	type: email

Moda target_audience: B2C
Filtros
	channel: paid



Siendo la mayoría de las modas de registros similares 'B2B', se reemplazarán los nulos por ese valor.

In [27]:
# cambiar el valor nulo en la columna target_audience por la moda seleccionada
df.loc[df['target_audience'].isnull(), 'target_audience'] = 'B2B'
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1029 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1029 non-null   object 
 1   start_date       1029 non-null   object 
 2   end_date         1029 non-null   object 
 3   budget           1029 non-null   float64
 4   roi              1029 non-null   float64
 5   type             1029 non-null   object 
 6   target_audience  1029 non-null   object 
 7   channel          1029 non-null   object 
 8   conversion_rate  1028 non-null   float64
 9   revenue          1028 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.4+ KB


> Eliminados 0 registros por type conflictivos
>
> Eliminados hasta el momento 8 registros (8/1037 --> 0.77%)

#### 2.4. Columna `conversion_rate`
Eliminación de nulos en la columna `conversion_rate`, y detección de outliers.

In [28]:
df.describe()

Unnamed: 0,budget,roi,conversion_rate,revenue
count,1029.0,1029.0,1028.0,1028.0
mean,49204.260408,0.534908,0.543152,511785.059212
std,28830.897212,0.260507,0.267001,287334.011917
min,1052.57,0.0,0.0,-15000.0
25%,24633.17,0.31,0.3,267735.6875
50%,46790.73,0.53,0.55,517944.035
75%,74720.7,0.76,0.77,765929.2575
max,99957.15,0.99,1.5,999712.49


In [29]:
df[df['conversion_rate'].isnull()]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1027,Broken-date campaign,2023-13-01,2024-01-01,25000.0,0.45,email,B2B,organic,,87500.0


> Nota: sabiendo que los valores de algunos registros no son correctos, procederemos a aproximarlos con un valor similar.

In [30]:
# se tomará la media de los registros similares (type=email, channel=organic, target_audience=B2B) para reemplazar el valor nulo
conversion_rate_aprox = round(df[(df['type'] == 'email') & (df['channel'] == 'organic') & (df['target_audience'] == 'B2B')]['conversion_rate'].mean(), 2)
df.loc[df['conversion_rate'].isnull(), 'conversion_rate'] = conversion_rate_aprox
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1029 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1029 non-null   object 
 1   start_date       1029 non-null   object 
 2   end_date         1029 non-null   object 
 3   budget           1029 non-null   float64
 4   roi              1029 non-null   float64
 5   type             1029 non-null   object 
 6   target_audience  1029 non-null   object 
 7   channel          1029 non-null   object 
 8   conversion_rate  1029 non-null   float64
 9   revenue          1028 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.4+ KB


In [31]:
df.sort_values(by='conversion_rate', ascending=False)[:10]
# df.sort_values(by='conversion_rate')[:20]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1036,Too many conversions,2023-05-01,2023-11-01,40000.0,0.8,social media,B2C,organic,1.5,120000.0
454,Vision-oriented transitional process improvement,2023-06-27,2023-11-26,55436.7,0.8,podcast,B2B,organic,0.99,786705.7
984,Vision-oriented zero tolerance hardware,2023-01-13,2023-12-28,25534.3,0.9,email,B2C,paid,0.99,141758.95
785,Managed regional process improvement,2023-06-14,2023-09-19,65007.53,0.7,email,B2B,promotion,0.99,429137.69
504,Advanced client-driven matrix,2023-07-02,2024-06-24,58920.71,0.74,social media,B2B,organic,0.99,416433.63
712,Reverse-engineered attitude-oriented task-force,2023-03-12,2024-07-28,82832.93,0.73,social media,B2B,organic,0.99,974376.54
185,Progressive 4thgeneration policy,2022-10-06,2024-04-20,16290.7,0.49,webinar,B2C,promotion,0.99,172899.7
140,Compatible eco-centric access,2022-08-24,2023-09-04,4016.62,0.62,social media,B2B,referral,0.99,773055.56
173,Customer-focused fault-tolerant help-desk,2022-10-08,2023-12-25,39585.98,0.11,podcast,B2B,promotion,0.99,889037.24
886,Sharable background circuit,2022-08-26,2024-06-15,39380.23,0.58,email,B2B,referral,0.99,878859.45


se eliminan los registros con valores anómalos (conversion_rate > 1) en la columna `conversion_rate`.

In [32]:
# borrar el registro con valor conversion_rate = 1.5 (es un outlier)
df = df[df['conversion_rate'] <= 1]
df

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.30,0.35,email,B2B,organic,0.40,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.10
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.10,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.90,0.30,social media,B2B,promotion,0.81,47511.35
...,...,...,...,...,...,...,...,...,...,...
1027,Broken-date campaign,2023-13-01,2024-01-01,25000.00,0.45,email,B2B,organic,0.62,87500.00
1030,Future campaign,2025-01-01,2025-06-01,75000.00,0.90,webinar,B2C,promotion,0.65,200000.00
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000.00,0.25,email,B2B,paid,0.40,45000.00
1032,No revenue campaign,2023-02-01,2023-08-01,20000.00,0.30,social media,B2B,organic,0.50,


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1028 entries, 0 to 1035
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1028 non-null   object 
 1   start_date       1028 non-null   object 
 2   end_date         1028 non-null   object 
 3   budget           1028 non-null   float64
 4   roi              1028 non-null   float64
 5   type             1028 non-null   object 
 6   target_audience  1028 non-null   object 
 7   channel          1028 non-null   object 
 8   conversion_rate  1028 non-null   float64
 9   revenue          1027 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.3+ KB


> Eliminados 1 registro (3/1037 --> 0.29%) por conversion_rate conflictivo
>
> Eliminados hasta el momento 9 registros (9/1037 --> 0.87%)

#### 2.5. Columna `revenue`
Eliminación de nulos en la columna `revenue`, y detección de outliers.

In [34]:
df.sort_values(by='revenue', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
170,Realigned radical hardware,2022-11-03,2024-07-12,34512.86,0.20,webinar,B2B,referral,0.87,999712.49
618,Automated executive moderator,2022-09-03,2024-01-06,65864.61,0.29,social media,B2B,promotion,0.19,999317.92
880,Balanced optimizing software,2023-04-10,2024-05-10,10297.36,0.95,podcast,B2B,paid,0.71,997657.18
626,Realigned scalable moderator,2023-04-07,2023-10-03,55780.60,0.54,email,B2B,referral,0.19,996578.25
758,Function-based leadingedge budgetary management,2022-12-16,2023-08-23,64755.70,0.63,webinar,B2B,referral,0.20,996493.10
...,...,...,...,...,...,...,...,...,...,...
427,Programmable homogeneous projection,2022-09-24,2023-11-12,11006.30,0.25,podcast,B2C,promotion,0.98,3641.30
975,Extended 24hour contingency,2023-07-07,2023-08-18,69815.82,0.16,podcast,B2C,organic,0.27,2810.51
889,Grass-roots scalable framework,2023-02-16,2023-08-19,61978.10,0.39,social media,B2C,paid,0.77,108.21
1010,Negative Revenue Test,2023-05-15,2023-11-15,25000.00,0.45,podcast,B2B,paid,0.33,-15000.00


Vemos que los dos últimos registros tienen valores negativos o nulos en la columna `revenue`, por lo que se eliminarán.

In [35]:
df = df[df['revenue'] > 0]
df.sort_values(by='revenue', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
170,Realigned radical hardware,2022-11-03,2024-07-12,34512.86,0.20,webinar,B2B,referral,0.87,999712.49
618,Automated executive moderator,2022-09-03,2024-01-06,65864.61,0.29,social media,B2B,promotion,0.19,999317.92
880,Balanced optimizing software,2023-04-10,2024-05-10,10297.36,0.95,podcast,B2B,paid,0.71,997657.18
626,Realigned scalable moderator,2023-04-07,2023-10-03,55780.60,0.54,email,B2B,referral,0.19,996578.25
758,Function-based leadingedge budgetary management,2022-12-16,2023-08-23,64755.70,0.63,webinar,B2B,referral,0.20,996493.10
...,...,...,...,...,...,...,...,...,...,...
821,Universal uniform service-desk,2023-07-10,2024-05-16,37054.16,0.39,email,B2B,referral,0.60,5971.96
14,Innovative web-enabled function,2022-11-05,2023-09-05,36848.30,0.92,webinar,B2C,referral,0.10,4190.95
427,Programmable homogeneous projection,2022-09-24,2023-11-12,11006.30,0.25,podcast,B2C,promotion,0.98,3641.30
975,Extended 24hour contingency,2023-07-07,2023-08-18,69815.82,0.16,podcast,B2C,organic,0.27,2810.51


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1026 entries, 0 to 1035
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1026 non-null   object 
 1   start_date       1026 non-null   object 
 2   end_date         1026 non-null   object 
 3   budget           1026 non-null   float64
 4   roi              1026 non-null   float64
 5   type             1026 non-null   object 
 6   target_audience  1026 non-null   object 
 7   channel          1026 non-null   object 
 8   conversion_rate  1026 non-null   float64
 9   revenue          1026 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.2+ KB


> Eliminados 0 registro  por revenue conflictivo
>
> Eliminados hasta el momento 9 registros (9/1037 --> 0.87%)

### 3. TIPADO DE DATOS
Se limpiarán los datos de las columnas `start_date`, `end_date`, eliminando los registros que no cumplan con el formato de fecha, y se convertirán a tipo datetime.

In [37]:
df.head()

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.4,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.1
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.3,social media,B2B,promotion,0.81,47511.35


In [38]:
df[df['start_date'] == '2023-13-01']
# visto que hay un registro con las fechas start_date mal formateada, y una fecha end_date incoherente, se procederá a eliminar el registro
# se corregirá a continuación el registro con fecha start_date mal formateada

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1009,Typo in Date,2023-13-01,2024-02-30,50000.0,0.65,webinar,B2C,promotion,0.9,320000.0
1027,Broken-date campaign,2023-13-01,2024-01-01,25000.0,0.45,email,B2B,organic,0.62,87500.0


In [39]:
# se elimina el registro incorrecto
df = df[df['end_date'] != '2024-02-30']

# corregimos el registro con fecha mal formateada
df['start_date'] = df['start_date'].replace('2023-13-01', '2023-01-13')
df[df['start_date'] == '2023-13-01']


Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1025 entries, 0 to 1035
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1025 non-null   object 
 1   start_date       1025 non-null   object 
 2   end_date         1025 non-null   object 
 3   budget           1025 non-null   float64
 4   roi              1025 non-null   float64
 5   type             1025 non-null   object 
 6   target_audience  1025 non-null   object 
 7   channel          1025 non-null   object 
 8   conversion_rate  1025 non-null   float64
 9   revenue          1025 non-null   float64
dtypes: float64(4), object(6)
memory usage: 88.1+ KB


In [41]:
# Se convierte la columna start_date y end_date a formato datetime
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1025 entries, 0 to 1035
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   campaign_name    1025 non-null   object        
 1   start_date       1025 non-null   datetime64[ns]
 2   end_date         1025 non-null   datetime64[ns]
 3   budget           1025 non-null   float64       
 4   roi              1025 non-null   float64       
 5   type             1025 non-null   object        
 6   target_audience  1025 non-null   object        
 7   channel          1025 non-null   object        
 8   conversion_rate  1025 non-null   float64       
 9   revenue          1025 non-null   float64       
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 88.1+ KB


Se comprueban las fechas, para verificar errores de fechas fuera de rango (demasiado antiguas, o futuras), orden de start_date > end_date.

In [42]:
# comprobación de fechas desordenadas (fecha de inicio posterior a fecha de finalización)
df[df['start_date'] > df['end_date']]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1035,Overlapping dates,2023-03-01,2022-12-31,60000.0,0.6,webinar,B2B,paid,0.7,90000.0


In [43]:
# eliminación de registros con fechas desordenadas
toDelete_index = df[df['start_date'] > df['end_date']].index
df = df.drop(toDelete_index)
df[df['start_date'] > df['end_date']] # comprobación de eliminación

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue


In [44]:
# detección de outliers (fechas excesivamente antiguas o futuras) en start_date
df.sort_values(by='start_date', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1030,Future campaign,2025-01-01,2025-06-01,75000.00,0.90,webinar,B2C,promotion,0.65,200000.00
361,Automated dedicated budgetary management,2023-08-01,2024-04-06,1436.99,0.14,podcast,B2B,organic,0.87,881768.44
23,Future-proofed local forecast,2023-08-01,2023-08-11,46775.63,0.67,webinar,B2B,referral,0.75,814672.28
653,Optional uniform projection,2023-07-31,2024-03-23,13418.23,0.64,podcast,B2B,referral,0.94,160381.96
757,Focused motivating strategy,2023-07-31,2023-11-01,44640.33,0.79,social media,B2C,organic,0.69,935541.35
...,...,...,...,...,...,...,...,...,...,...
80,Front-line full-range matrices,2022-08-04,2023-08-10,79829.52,0.69,email,B2B,paid,0.70,472828.97
281,Mandatory secondary access,2022-08-03,2023-11-04,44406.38,0.17,email,B2B,promotion,0.21,438753.14
574,Proactive regional conglomeration,2022-08-03,2024-03-29,11421.40,0.37,social media,B2B,referral,0.40,265099.33
545,Public-key optimizing protocol,2022-08-03,2023-12-29,61172.36,0.60,podcast,B2C,referral,0.13,277685.24


Comprobamos que existe un registro con fechas futuras, por lo que se eliminará.

No hay registros detectados con fechas excesivamente antiguas.

In [45]:
# eliminación de registro con fechas futuras (se toma la fecha actual como referencia)
df = df[df['start_date'] <= pd.to_datetime('2024-12-31')]
df.sort_values(by='start_date', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
361,Automated dedicated budgetary management,2023-08-01,2024-04-06,1436.99,0.14,podcast,B2B,organic,0.87,881768.44
23,Future-proofed local forecast,2023-08-01,2023-08-11,46775.63,0.67,webinar,B2B,referral,0.75,814672.28
653,Optional uniform projection,2023-07-31,2024-03-23,13418.23,0.64,podcast,B2B,referral,0.94,160381.96
757,Focused motivating strategy,2023-07-31,2023-11-01,44640.33,0.79,social media,B2C,organic,0.69,935541.35
243,Customizable real-time toolset,2023-07-31,2024-05-07,45499.90,0.10,social media,B2C,organic,0.19,292766.77
...,...,...,...,...,...,...,...,...,...,...
981,Secured incremental moratorium,2022-08-04,2023-08-28,43792.40,0.94,webinar,B2B,paid,0.81,70347.37
574,Proactive regional conglomeration,2022-08-03,2024-03-29,11421.40,0.37,social media,B2B,referral,0.40,265099.33
545,Public-key optimizing protocol,2022-08-03,2023-12-29,61172.36,0.60,podcast,B2C,referral,0.13,277685.24
281,Mandatory secondary access,2022-08-03,2023-11-04,44406.38,0.17,email,B2B,promotion,0.21,438753.14


In [46]:
df.sort_values(by='end_date', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
989,Open-source fault-tolerant open architecture,2023-06-03,2024-07-31,29827.35,0.52,webinar,B2C,referral,0.36,310553.26
800,Robust maximized Graphic Interface,2022-09-17,2024-07-31,95291.59,0.30,social media,B2B,organic,0.27,804844.50
611,Integrated national architecture,2023-07-14,2024-07-31,32039.70,0.86,social media,B2B,referral,0.97,334033.28
609,Intuitive well-modulated support,2022-08-14,2024-07-30,94357.67,0.80,email,B2C,organic,0.14,988161.51
824,Profit-focused next generation installation,2023-03-12,2024-07-30,16505.53,0.21,podcast,B2C,paid,0.29,401190.17
...,...,...,...,...,...,...,...,...,...,...
526,Persevering tangible hardware,2023-06-08,2023-08-02,67475.35,0.50,email,B2B,promotion,0.22,53555.95
918,Quality-focused fault-tolerant secured line,2023-02-01,2023-08-02,8324.30,0.68,webinar,B2C,promotion,0.76,875993.48
337,Re-engineered 24/7 benchmark,2022-10-07,2023-08-02,60761.56,0.60,podcast,B2C,paid,0.46,560912.60
616,Synergized 24/7 interface,2022-11-09,2023-08-02,22675.22,0.52,webinar,B2B,referral,0.78,402920.91


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1023 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   campaign_name    1023 non-null   object        
 1   start_date       1023 non-null   datetime64[ns]
 2   end_date         1023 non-null   datetime64[ns]
 3   budget           1023 non-null   float64       
 4   roi              1023 non-null   float64       
 5   type             1023 non-null   object        
 6   target_audience  1023 non-null   object        
 7   channel          1023 non-null   object        
 8   conversion_rate  1023 non-null   float64       
 9   revenue          1023 non-null   float64       
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 87.9+ KB


In [48]:
df.describe()

Unnamed: 0,start_date,end_date,budget,roi,conversion_rate,revenue
count,1023,1023,1023.0,1023.0,1023.0,1023.0
mean,2023-01-29 15:13:32.903225856,2024-01-30 22:22:52.434017792,49228.918827,0.534428,0.541935,513587.527732
min,2022-08-02 00:00:00,2023-08-02 00:00:00,1052.57,0.0,0.0,108.21
25%,2022-10-30 12:00:00,2023-11-02 00:00:00,24594.43,0.31,0.3,268492.545
50%,2023-01-29 00:00:00,2024-01-28 00:00:00,46790.73,0.53,0.55,519790.34
75%,2023-04-29 00:00:00,2024-05-04 00:00:00,74788.705,0.76,0.77,767479.655
max,2023-08-01 00:00:00,2024-07-31 00:00:00,99957.15,0.99,0.99,999712.49
std,,,28876.349774,0.260739,0.265595,286765.758096


> Eliminados 5 registro (5/1037 --> 0.48%) por start_date y/o end_date conflictivos
>
> Eliminados hasta el momento 14 registros (14/1037 --> 1.35%)

## Limpieza de Outliers
Se eliminarán los registros que contengan valores atípicos en las columnas.

In [49]:
df.sort_values(by='budget', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
53,User-centric holistic firmware,2022-08-06,2024-03-12,99957.15,0.63,webinar,B2C,referral,0.62,430498.24
330,Persistent radical approach,2022-10-23,2023-08-15,99891.35,0.80,email,B2C,promotion,0.91,8272.50
28,Seamless clear-thinking product,2022-11-18,2023-10-05,99838.63,0.63,email,B2B,organic,0.18,14073.59
710,Horizontal asymmetric contingency,2022-09-16,2024-05-08,99714.19,0.23,email,B2C,referral,0.70,7622.28
935,Innovative logistical interface,2023-03-05,2023-10-25,99579.39,0.10,social media,B2C,organic,0.28,758121.44
...,...,...,...,...,...,...,...,...,...,...
766,Multi-tiered context-sensitive hub,2023-06-20,2024-05-24,1380.68,0.46,podcast,B2C,promotion,0.84,159621.36
835,Enhanced optimizing time-frame,2022-11-06,2024-04-02,1378.61,0.15,email,B2B,organic,0.66,862862.99
38,Vision-oriented 4thgeneration conglomeration,2023-01-18,2024-04-22,1309.17,0.91,podcast,B2C,organic,0.27,273231.88
887,Synchronized national system engine,2023-01-18,2023-11-11,1223.82,0.71,social media,B2C,paid,0.30,768567.70


In [50]:
df.sort_values(by='roi', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
47,User-friendly analyzing moratorium,2023-04-06,2024-03-16,20328.53,0.99,email,B2B,organic,0.97,658107.70
812,Realigned homogeneous support,2022-10-13,2024-03-03,24132.55,0.99,social media,B2C,paid,0.65,33543.21
530,Triple-buffered high-level customer loyalty,2023-01-01,2023-09-20,78803.16,0.99,email,B2B,organic,0.75,562224.77
205,Cross-group foreground ability,2023-03-11,2023-11-27,32343.79,0.99,social media,B2B,paid,0.50,55043.59
280,Automated 5thgeneration attitude,2023-02-05,2024-06-03,96380.65,0.99,email,B2B,referral,0.18,35031.43
...,...,...,...,...,...,...,...,...,...,...
110,Upgradable transitional productivity,2022-12-25,2024-05-12,28460.11,0.00,social media,B2B,paid,0.37,593949.35
390,Multi-tiered object-oriented knowledge user,2023-04-22,2023-10-28,52804.87,0.00,email,B2C,paid,0.25,880106.40
297,Customizable executive task-force,2022-08-31,2024-02-10,14777.50,0.00,email,B2C,promotion,0.38,865674.86
321,Polarized systematic parallelism,2023-01-03,2024-04-25,5479.95,0.00,social media,B2B,organic,0.62,775127.75


In [51]:
df.sort_values(by='conversion_rate', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
712,Reverse-engineered attitude-oriented task-force,2023-03-12,2024-07-28,82832.93,0.73,social media,B2B,organic,0.99,974376.54
984,Vision-oriented zero tolerance hardware,2023-01-13,2023-12-28,25534.30,0.90,email,B2C,paid,0.99,141758.95
504,Advanced client-driven matrix,2023-07-02,2024-06-24,58920.71,0.74,social media,B2B,organic,0.99,416433.63
697,Customizable clear-thinking adapter,2023-07-22,2024-04-30,74856.71,0.43,podcast,B2B,referral,0.99,266833.31
454,Vision-oriented transitional process improvement,2023-06-27,2023-11-26,55436.70,0.80,podcast,B2B,organic,0.99,786705.70
...,...,...,...,...,...,...,...,...,...,...
372,Versatile user-facing benchmark,2022-09-19,2024-05-21,53658.92,0.90,social media,B2B,promotion,0.00,761292.13
793,Expanded 3rdgeneration synergy,2022-10-08,2024-07-09,43946.75,0.14,email,B2B,paid,0.00,739069.46
664,Programmable fault-tolerant intranet,2022-10-01,2024-02-17,47198.52,0.80,email,B2B,paid,0.00,294588.36
727,Cloned scalable frame,2022-12-30,2023-08-18,4839.55,0.67,webinar,B2B,promotion,0.00,250403.29


In [52]:
df.sort_values(by='revenue', ascending=False)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
170,Realigned radical hardware,2022-11-03,2024-07-12,34512.86,0.20,webinar,B2B,referral,0.87,999712.49
618,Automated executive moderator,2022-09-03,2024-01-06,65864.61,0.29,social media,B2B,promotion,0.19,999317.92
880,Balanced optimizing software,2023-04-10,2024-05-10,10297.36,0.95,podcast,B2B,paid,0.71,997657.18
626,Realigned scalable moderator,2023-04-07,2023-10-03,55780.60,0.54,email,B2B,referral,0.19,996578.25
758,Function-based leadingedge budgetary management,2022-12-16,2023-08-23,64755.70,0.63,webinar,B2B,referral,0.20,996493.10
...,...,...,...,...,...,...,...,...,...,...
821,Universal uniform service-desk,2023-07-10,2024-05-16,37054.16,0.39,email,B2B,referral,0.60,5971.96
14,Innovative web-enabled function,2022-11-05,2023-09-05,36848.30,0.92,webinar,B2C,referral,0.10,4190.95
427,Programmable homogeneous projection,2022-09-24,2023-11-12,11006.30,0.25,podcast,B2C,promotion,0.98,3641.30
975,Extended 24hour contingency,2023-07-07,2023-08-18,69815.82,0.16,podcast,B2C,organic,0.27,2810.51


> Eliminados 0 registro por outliers
>
> Eliminados hasta el momento 14 registros (14/1037 --> 1.35%)

## Eliminación de duplicados
Se eliminarán registros duplicados en el dataframe.

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1023 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   campaign_name    1023 non-null   object        
 1   start_date       1023 non-null   datetime64[ns]
 2   end_date         1023 non-null   datetime64[ns]
 3   budget           1023 non-null   float64       
 4   roi              1023 non-null   float64       
 5   type             1023 non-null   object        
 6   target_audience  1023 non-null   object        
 7   channel          1023 non-null   object        
 8   conversion_rate  1023 non-null   float64       
 9   revenue          1023 non-null   float64       
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 87.9+ KB


In [54]:
# elminación de registros duplicados idénticos
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1005 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   campaign_name    1005 non-null   object        
 1   start_date       1005 non-null   datetime64[ns]
 2   end_date         1005 non-null   datetime64[ns]
 3   budget           1005 non-null   float64       
 4   roi              1005 non-null   float64       
 5   type             1005 non-null   object        
 6   target_audience  1005 non-null   object        
 7   channel          1005 non-null   object        
 8   conversion_rate  1005 non-null   float64       
 9   revenue          1005 non-null   float64       
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 86.4+ KB


> Eliminados 18 registro (18/1037 --> 0.48%) por duplicidad (exacta) de registros
>
> Eliminados hasta el momento 32 registros (32/1037 --> 3.09%)

In [55]:
# estudio de registros con nombres iguales
# df[df.duplicated(subset=['campaign_name'], keep=False)].sort_values(by='campaign_name')
df[df.duplicated(subset=['campaign_name'], keep=False)]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
7,User-friendly client-driven service-desk,2023-01-06,2023-12-11,36800.58,0.4,webinar,B2C,promotion,0.52,206241.46
203,Reverse-engineered static infrastructure,2023-07-26,2024-05-29,12246.32,0.48,email,B2B,referral,0.44,987205.29
985,Reverse-engineered static infrastructure,2023-07-16,2023-08-07,7315.35,0.86,email,B2C,referral,0.7,162798.55
1007,User-friendly client-driven service-desk,2023-01-06,2023-12-11,36800.58,0.4,others,B2C,promotion,0.52,206241.46


Comprobamos que los registros 7 y 1007 son iguales, a excepción del campo `type` (que fue completado en su momento con `others` como dato faltante), por lo que se puede deducir que el 1007 es un duplicado del 7, y se eliminará.

En contra, los registros 203 y 985, aún teniendo el mismo nombre y mismo tipo de campaña, difieren tanto en el resto de campos que podemos considerarlos registros independientes.

In [None]:
# eliminación del registro duplicado y comprobación
df = df.drop_duplicates(subset=['campaign_name', 'start_date', 'end_date', 'budget', 'roi', 'target_audience', 'channel', 'conversion_rate', 'revenue'], keep='first')
df[df.duplicated(subset=['campaign_name'], keep=False)]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
203,Reverse-engineered static infrastructure,2023-07-26,2024-05-29,12246.32,0.48,email,B2B,referral,0.44,987205.29
985,Reverse-engineered static infrastructure,2023-07-16,2023-08-07,7315.35,0.86,email,B2C,referral,0.7,162798.55


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1004 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   campaign_name    1004 non-null   object        
 1   start_date       1004 non-null   datetime64[ns]
 2   end_date         1004 non-null   datetime64[ns]
 3   budget           1004 non-null   float64       
 4   roi              1004 non-null   float64       
 5   type             1004 non-null   object        
 6   target_audience  1004 non-null   object        
 7   channel          1004 non-null   object        
 8   conversion_rate  1004 non-null   float64       
 9   revenue          1004 non-null   float64       
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 86.3+ KB


> Eliminados 1 registro (1/1037 --> 0.09%) por duplicidad (aproximada) de registros
>
> Eliminados hasta el momento 33 registros (33/1037 --> 3.18%)

## Extracción de fichero limpio
Terminada la limpieza y preprocesamiento de los datos, se extrae la información en un fichero `.csv` independiente (`marketingcampaigns_final.csv`) para su posterior análisis comercial.

In [58]:
df

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.30,0.35,email,B2B,organic,0.40,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.10
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.10,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.90,0.30,social media,B2B,promotion,0.81,47511.35
...,...,...,...,...,...,...,...,...,...,...
999,Up-sized user-facing secured line,2022-10-02,2023-10-29,11168.71,0.80,social media,B2C,promotion,0.20,277549.41
1011,Duplicate Campaign,2023-04-01,2024-02-23,8082.30,0.35,email,B2B,organic,0.40,709593.48
1012,Inconsistent Decimal,2023-06-01,2023-12-01,12345.67,0.33,social media,B2C,referral,0.75,89000.00
1027,Broken-date campaign,2023-01-13,2024-01-01,25000.00,0.45,email,B2B,organic,0.62,87500.00


In [59]:
df.to_csv('../data/marketingcampaigns_final.csv', index=False)