Data cleaning and preparation:
1. Remove duplicate and irrelevant columns.
2. Process missing values accordingly.
3. Convert the data types for columns such as dates and numeric
values.

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

In [12]:
contacts = pd.read_excel('Contacts (Done).xlsx', dtype={'Id': str})
contacts.head()
contacts.describe()

Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
count,18548,18548,18548,18548
unique,18548,28,17921,16580
top,5805028000056907001,Charlie Davis,10.06.2024 09:00,13.06.2024 17:08
freq,1,2018,13,25


In [13]:
#Convert the data types
contacts["Created Time"] = pd.to_datetime(contacts["Created Time"],
                                             format='%d.%m.%Y %H:%M',
                                             dayfirst=True,
                                             errors="raise")
contacts["Modified Time"] = pd.to_datetime(contacts["Modified Time"],
                                              format='%d.%m.%Y %H:%M',
                                              dayfirst=True,
                                              errors="raise")

In [14]:
contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18548 entries, 0 to 18547
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Id                  18548 non-null  object        
 1   Contact Owner Name  18548 non-null  object        
 2   Created Time        18548 non-null  datetime64[ns]
 3   Modified Time       18548 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 579.8+ KB


In [15]:
#Removing duplicates
contacts[contacts.duplicated(keep=False, subset=contacts.columns[1:])]

Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
203,5805028000001949074,Bob Brown,2023-07-17 18:40:00,2023-07-17 18:40:00
205,5805028000001953081,Bob Brown,2023-07-17 18:40:00,2023-07-17 18:40:00
279,5805028000002340967,Bob Brown,2023-07-18 16:53:00,2023-07-18 16:53:00
280,5805028000002344049,Bob Brown,2023-07-18 16:53:00,2023-07-18 16:53:00
334,5805028000002740077,Bob Brown,2023-07-21 12:26:00,2023-07-21 12:26:00
...,...,...,...,...
17864,5805028000054231884,Rachel White,2024-06-10 09:00:00,2024-06-10 09:33:00
17865,5805028000054232018,Rachel White,2024-06-10 09:00:00,2024-06-10 09:33:00
17869,5805028000054238271,Rachel White,2024-06-10 09:00:00,2024-06-10 09:33:00
17870,5805028000054238317,Rachel White,2024-06-10 09:00:00,2024-06-10 09:33:00


In [16]:
contacts.duplicated(subset=contacts.columns[1:]).sum()

np.int64(38)

In [17]:
contacts.drop_duplicates(subset=contacts.columns[1:], inplace=True)
contacts.duplicated(subset=contacts.columns[1:]).sum()

np.int64(0)

In [18]:
contacts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18510 entries, 0 to 18547
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Id                  18510 non-null  object        
 1   Contact Owner Name  18510 non-null  object        
 2   Created Time        18510 non-null  datetime64[ns]
 3   Modified Time       18510 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 723.0+ KB


In [19]:
# Saving the updated file after cleaning
contacts.to_excel("Contacts (Final).xlsx", index=False)

In [20]:
spend = pd.read_excel('Spend (Done).xlsx')   
spend.head()

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
0,2023-07-03,Google Ads,gen_analyst_DE,6,0.0,0,,
1,2023-07-03,Google Ads,performancemax_eng_DE,4,0.01,1,,
2,2023-07-03,Facebook Ads,,0,0.0,0,,
3,2023-07-03,Google Ads,,0,0.0,0,,
4,2023-07-03,CRM,,0,0.0,0,,


In [21]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20779 entries, 0 to 20778
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         20779 non-null  datetime64[ns]
 1   Source       20779 non-null  object        
 2   Campaign     14785 non-null  object        
 3   Impressions  20779 non-null  int64         
 4   Spend        20779 non-null  float64       
 5   Clicks       20779 non-null  int64         
 6   AdGroup      13951 non-null  object        
 7   Ad           13951 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 1.3+ MB


In [22]:
#Convert the data types
spend['Source'] = spend['Source'].astype('category')
spend['Campaign'] = spend['Campaign'].fillna('Unknown').astype('category')
spend['AdGroup'] = spend['AdGroup'].fillna('Unknown').astype('category')

In [23]:
#Remove rows with the 0 for Impressions, Spend, Clicks as not relevant for any analysis
spend = spend[~(spend[['Spend', 'Impressions', 'Clicks']] == 0).all(axis=1)]
spend[(spend[['Spend', 'Impressions', 'Clicks']] == 0).all(axis=1)][['Spend', 'Impressions', 'Clicks']]

Unnamed: 0,Spend,Impressions,Clicks


In [24]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16407 entries, 0 to 20778
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         16407 non-null  datetime64[ns]
 1   Source       16407 non-null  category      
 2   Campaign     16407 non-null  category      
 3   Impressions  16407 non-null  int64         
 4   Spend        16407 non-null  float64       
 5   Clicks       16407 non-null  int64         
 6   AdGroup      16407 non-null  category      
 7   Ad           13370 non-null  object        
dtypes: category(3), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 821.0+ KB


In [25]:
#Removing duplicates 
spend[spend.duplicated()]

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
13844,2024-03-28,SMM,Unknown,0,0.0,4,Unknown,
15129,2024-04-11,SMM,Unknown,0,0.0,1,Unknown,
16190,2024-04-22,SMM,Unknown,0,0.0,1,Unknown,
16466,2024-04-25,SMM,Unknown,0,0.0,1,Unknown,
16531,2024-04-26,Organic,Unknown,0,0.0,26,Unknown,
16613,2024-04-27,Organic,Unknown,0,0.0,9,Unknown,
18009,2024-05-14,Organic,Unknown,0,0.0,4,Unknown,


In [26]:
spend.duplicated().sum()
     

np.int64(7)

In [27]:
spend.drop_duplicates(keep=False, inplace=True)
spend.duplicated(keep=False).sum()

np.int64(0)

In [28]:
# Removing extra data Ad
spend.drop(columns=['Ad'], inplace=True)


In [29]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16393 entries, 0 to 20778
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         16393 non-null  datetime64[ns]
 1   Source       16393 non-null  category      
 2   Campaign     16393 non-null  category      
 3   Impressions  16393 non-null  int64         
 4   Spend        16393 non-null  float64       
 5   Clicks       16393 non-null  int64         
 6   AdGroup      16393 non-null  category      
dtypes: category(3), datetime64[ns](1), float64(1), int64(2)
memory usage: 692.2 KB


In [30]:
# Saving the updated file after cleaning
spend.to_excel("Spend (Final).xlsx", index=False)

In [32]:
#calls = pd.read_excel('Calls (Done).xlsx')   
calls = pd.read_excel('Calls (Done).xlsx', dtype={'Id': str, "CONTACTID": str})
calls.head()

Unnamed: 0,Id,Call Start Time,Call Owner Name,CONTACTID,Call Type,Call Duration (in seconds),Call Status,Dialled Number,Outgoing Call Status,Scheduled in CRM,Tag
0,5805028000000805001,30.06.2023 08:43,John Doe,,Inbound,171.0,Received,,,,
1,5805028000000768006,30.06.2023 08:46,John Doe,,Outbound,28.0,Attended Dialled,,Completed,0.0,
2,5805028000000764027,30.06.2023 08:59,John Doe,,Outbound,24.0,Attended Dialled,,Completed,0.0,
3,5805028000000787003,30.06.2023 09:20,John Doe,5.805028000000645e+18,Outbound,6.0,Attended Dialled,,Completed,0.0,
4,5805028000000768019,30.06.2023 09:30,John Doe,5.805028000000645e+18,Outbound,11.0,Attended Dialled,,Completed,0.0,


In [33]:
calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95874 entries, 0 to 95873
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Id                          95874 non-null  object 
 1   Call Start Time             95874 non-null  object 
 2   Call Owner Name             95874 non-null  object 
 3   CONTACTID                   91941 non-null  object 
 4   Call Type                   95874 non-null  object 
 5   Call Duration (in seconds)  95791 non-null  float64
 6   Call Status                 95874 non-null  object 
 7   Dialled Number              0 non-null      float64
 8   Outgoing Call Status        86875 non-null  object 
 9   Scheduled in CRM            86875 non-null  float64
 10  Tag                         0 non-null      float64
dtypes: float64(4), object(7)
memory usage: 8.0+ MB


In [34]:
#Convert the data types
calls['Call Start Time'] = pd.to_datetime(calls['Call Start Time'], format='%d.%m.%Y %H:%M', dayfirst=True, errors='raise')

calls_columns = ['Call Owner Name', 'Call Type', 'Call Status', 'Outgoing Call Status', 'Scheduled in CRM']

for col in calls_columns:
    calls[col] = calls[col].astype('category')

for col in ['Outgoing Call Status', 'Scheduled in CRM']:
    if 'Not outgoing call' not in calls[col].cat.categories:
        calls[col] = calls[col].cat.add_categories(['Not outgoing call'])
    calls[col] = calls[col].fillna('Not outgoing call')

In [35]:
calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95874 entries, 0 to 95873
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Id                          95874 non-null  object        
 1   Call Start Time             95874 non-null  datetime64[ns]
 2   Call Owner Name             95874 non-null  category      
 3   CONTACTID                   91941 non-null  object        
 4   Call Type                   95874 non-null  category      
 5   Call Duration (in seconds)  95791 non-null  float64       
 6   Call Status                 95874 non-null  category      
 7   Dialled Number              0 non-null      float64       
 8   Outgoing Call Status        95874 non-null  category      
 9   Scheduled in CRM            95874 non-null  category      
 10  Tag                         0 non-null      float64       
dtypes: category(5), datetime64[ns](1), float64(3), object(

In [36]:
#Removing duplicates 
calls.duplicated(subset=calls.columns[1:]).sum()

np.int64(3257)

In [37]:
calls[calls.duplicated(keep=False, subset=calls.columns[1:])]

Unnamed: 0,Id,Call Start Time,Call Owner Name,CONTACTID,Call Type,Call Duration (in seconds),Call Status,Dialled Number,Outgoing Call Status,Scheduled in CRM,Tag
34,5805028000001140014,2023-07-06 17:15:00,Alice Johnson,5805028000001129001,Outbound,0.0,Unattended Dialled,,Completed,0.0,
35,5805028000001167001,2023-07-06 17:15:00,Alice Johnson,5805028000001129001,Outbound,0.0,Unattended Dialled,,Completed,0.0,
101,5805028000001372054,2023-07-08 16:43:00,John Doe,,Missed,0.0,Missed,,Not outgoing call,Not outgoing call,
102,5805028000001348077,2023-07-08 16:43:00,John Doe,,Missed,0.0,Missed,,Not outgoing call,Not outgoing call,
254,5805028000001568042,2023-07-12 19:23:00,Jane Smith,5805028000001552025,Outbound,0.0,Unattended Dialled,,Completed,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
95804,5805028000056832311,2024-06-21 14:17:00,Yara Edwards,,Outbound,8.0,Attended Dialled,,Completed,0.0,
95833,5805028000056845313,2024-06-21 14:47:00,Ulysses Adams,5805028000026041053,Outbound,0.0,Unattended Dialled,,Completed,0.0,
95834,5805028000056873560,2024-06-21 14:47:00,Ulysses Adams,5805028000026041053,Outbound,0.0,Unattended Dialled,,Completed,0.0,
95838,5805028000056834447,2024-06-21 14:55:00,John Doe,,Missed,0.0,Missed,,Not outgoing call,Not outgoing call,


In [39]:
calls.drop_duplicates(subset=calls.columns[1:], inplace=True)
calls.duplicated(subset=calls.columns[1:]).sum()

np.int64(0)

In [40]:
calls.isna().sum()

Id                                0
Call Start Time                   0
Call Owner Name                   0
CONTACTID                      3802
Call Type                         0
Call Duration (in seconds)       79
Call Status                       0
Dialled Number                92617
Outgoing Call Status              0
Scheduled in CRM                  0
Tag                           92617
dtype: int64

In [41]:
# Removing extra data
calls = calls.drop(columns=['Dialled Number', 'Tag'])

In [None]:
calls.info()

In [42]:
# Saving the updated file after cleaning
calls.to_excel("Calls (Final).xlsx", index=False)