### Importación de librerías

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Importación de datos y descarte/corrección de errores de lectura

In [15]:
# Count every physical line of the raw file; the messages below subtract 1,
# presumably to discount the header line.
with open('../data/marketingcampaigns.csv', 'r', encoding='utf-8') as raw_csv:
    line_count = len(raw_csv.readlines())

# Load the same file, letting pandas skip any line it cannot parse.
df = pd.read_csv('../data/marketingcampaigns.csv', on_bad_lines='skip')

# Report the raw size and how many lines were skipped during parsing.
print(f"El archivo CSV original tiene {line_count - 1} líneas.")
print(f"La diferencia entre Original y sin lineas incorrectas es de {line_count - 1 - len(df)} líneas.")

El archivo CSV original tiene 1037 líneas.
La diferencia entre Original y sin lineas incorrectas es de 5 líneas.


Dado que solo 5 registros difieren entre el archivo original y los datos cargados tras omitir las líneas con errores, se corrigen manualmente esos 5 registros en un nuevo archivo `marketingcampaigns_corrected.csv`.

CORRECCIONES REALIZADAS:
* l1003: `De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0,74,email,B2C,promotion,0.66,516609.1` --> el valor 0,74 se cambia por 0.74
* l1006: `Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.3,social media,B2B,promotion,0.81,47511.35` --> el valor 39291,9 se cambia por 39291.9 y 0,81 por 0.81
* l1008: `Innovative context-sensitive framework,2023-03-01,2024-02-23,28964.45,0.59,email,B2C,referal,0.17,172882.59` --> el valor 172882,59 se cambia por 172882.59
* l1012: `Negative Revenue Test,2023-05-15,2023-11-15,25000,0.45,podcast,B2B,paid,0.33,-15000` --> el valor 0,45 se cambia por 0.45
* l1014: `Inconsistent Decimal,2023-06-01,2023-12-01,12345.67,0.33,social media,B2C,referral,0.75,89000` --> el valor 12345,67 se cambia por 12345.67, 0,33 por 0.33 y 0,75 por 0.75

In [31]:
# Reload the data from the manually corrected copy of the CSV and display it.
df = pd.read_csv(filepath_or_buffer='../data/marketingcampaigns_corrected.csv')
df

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.40,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.10
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.30,social media,B2B,promotion,0.81,47511.35
...,...,...,...,...,...,...,...,...,...,...
1032,No revenue campaign,2023-02-01,2023-08-01,20000,0.30,social media,B2B,organic,0.50,
1033,Random mess,2023-06-06,,100000,,podcast,,referral,,300000.00
1034,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.20,50000.00
1035,Overlapping dates,2023-03-01,2022-12-31,60000,0.60,webinar,B2B,paid,0.70,90000.00


In [32]:
# Preliminary overview of the data.
# The bare string literal below is a column glossary (English column name ->
# Spanish meaning). It is not assigned to anything, so it only acts as an
# in-cell note for the reader.
'''
campaign_name -----> nombre_de_campaña
start_date --------> fecha_de_inicio
end_date ----------> fecha_de_finalización
budget ------------> presupuesto
roi ---------------> ROI "retorno de la inversión" (Return On Investment)
type --------------> tipo
target_audience ---> público_objetivo
channel -----------> canal
conversion_rate ---> tasa_de_conversión
revenue -----------> ingresos
'''
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037 entries, 0 to 1036
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1037 non-null   object 
 1   start_date       1036 non-null   object 
 2   end_date         1035 non-null   object 
 3   budget           1034 non-null   object 
 4   roi              1033 non-null   float64
 5   type             1036 non-null   object 
 6   target_audience  1035 non-null   object 
 7   channel          1036 non-null   object 
 8   conversion_rate  1033 non-null   float64
 9   revenue          1034 non-null   float64
dtypes: float64(3), object(7)
memory usage: 81.1+ KB


In [33]:
# Summary statistics for the numeric columns. `budget` is reported as object
# dtype by df.info() at this point, so it does not appear in the result.
df.describe()

Unnamed: 0,roi,conversion_rate,revenue
count,1033.0,1033.0,1034.0
mean,0.533553,0.541946,509901.685667
std,0.26151,0.267271,287916.037107
min,-0.2,0.0,-15000.0
25%,0.31,0.3,266545.58
50%,0.53,0.55,516907.835
75%,0.76,0.77,764547.97
max,0.99,1.5,999712.49


In [34]:
# Number of null cells in each column.
null_counts = df.isna().sum()
print(null_counts)

# Total null cells across the frame: an upper bound on how many records can
# contain a null, since several nulls may share one record.
print(f"Suma de maximo de registros con nulos: {null_counts.sum()}")

campaign_name      0
start_date         1
end_date           2
budget             3
roi                4
type               1
target_audience    2
channel            1
conversion_rate    4
revenue            3
dtype: int64
Suma de maximo de registros con nulos: 21


In [35]:
# Add a column holding the number of null fields found in each record.
df['num_nulls'] = df.isna().sum(axis='columns')

# Rank every record from most to fewest nulls.
df_sorted_by_nulls = df.sort_values('num_nulls', ascending=False)

# Display only the records that contain at least one null, in that ranking.
df_sorted_by_nulls.loc[df_sorted_by_nulls['num_nulls'] > 0]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue,num_nulls
1029,Null-heavy campaign,2023-01-01,,,,B2B,social media,,,,6
1033,Random mess,2023-06-06,,100000,,podcast,,referral,,300000.0,4
1026,Cloud-based scalable solution,,2023-12-31,50000,,event,B2C,paid,0.3,120000.0,2
1008,NEW CAMPAIGN - Missing Budget,2023-10-01,2024-01-15,,0.25,email,B2B,organic,,45000.0,2
1027,Broken-date campaign,2023-13-01,2024-01-01,25000,0.45,email,B2B,organic,,87500.0,1
1028,Negative ROI test,2022-10-10,2023-05-05,-10000,-0.2,podcast,B2C,referral,0.1,,1
1005,Upgradable transitional data-warehouse,2023-06-29,2023-12-13,,0.59,social media,B2C,referral,0.67,558302.11,1
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000,0.25,email,,paid,0.4,45000.0,1
1032,No revenue campaign,2023-02-01,2023-08-01,20000,0.3,social media,B2B,organic,0.5,,1
1034,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.2,50000.0,1


In [36]:
# Drop the records that have more than one null value.
# .copy() makes the result an independent DataFrame instead of a slice of the
# previous frame, so later column assignments on `df` do not trigger
# SettingWithCopyWarning. The original index labels are kept (no reset).
df = df[df['num_nulls'] <= 1].copy()

# Show the remaining records that still contain exactly one null.
df[df['num_nulls'] > 0]

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue,num_nulls
1005,Upgradable transitional data-warehouse,2023-06-29,2023-12-13,,0.59,social media,B2C,referral,0.67,558302.11,1
1007,User-friendly client-driven service-desk,2023-01-06,2023-12-11,36800.58,0.4,,B2C,promotion,0.52,206241.46,1
1027,Broken-date campaign,2023-13-01,2024-01-01,25000,0.45,email,B2B,organic,,87500.0,1
1028,Negative ROI test,2022-10-10,2023-05-05,-10000,-0.2,podcast,B2C,referral,0.1,,1
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000,0.25,email,,paid,0.4,45000.0,1
1032,No revenue campaign,2023-02-01,2023-08-01,20000,0.3,social media,B2B,organic,0.5,,1
1034,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.2,50000.0,1


In [37]:
# Display the current state of the frame, including the num_nulls helper column.
df

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue,num_nulls
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.40,709593.48,0
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.10,0
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42,0
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73,0
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.30,social media,B2B,promotion,0.81,47511.35,0
...,...,...,...,...,...,...,...,...,...,...,...
1031,Extra long name campaign test,2023-04-15,2023-09-15,30000,0.25,email,,paid,0.40,45000.00,1
1032,No revenue campaign,2023-02-01,2023-08-01,20000,0.30,social media,B2B,organic,0.50,,1
1034,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.20,50000.00,1
1035,Overlapping dates,2023-03-01,2022-12-31,60000,0.60,webinar,B2B,paid,0.70,90000.00,0
