Last Update: 9th October 2023

Status: No pending updates, ready to use

---

In [108]:
import pandas as pd
import numpy as np
import re

#### 1. Read CSV

<u>Note:</u>
1. To avoid file path issues, keep this `.ipynb` notebook in the same folder as the file that you wish to read.
2. This notebook only reads CSV files; if your data is in an Excel workbook, save it as "CSV UTF-8" first.

In [109]:
# Base name of the input file, without the ".csv" extension -- update as needed.
# The same base name is reused later to build the output workbook name.
excel_file = "Test_IPQS_Email_Validation"

# Load the CSV that sits next to this notebook
df = pd.read_csv(f"{excel_file}.csv")

In [110]:
# Clean up the column names: every run of spaces, parentheses, slashes or
# colons is collapsed into a single underscore.
replace_pattern = r'[ ()\/:]+'
df_col = df.columns

# original column name -> cleaned column name
df_col_rename = dict(
    zip(df_col, [re.sub(replace_pattern, "_", name) for name in df_col])
)

df.rename(columns=df_col_rename, inplace=True)

In [111]:
# Preview the first five rows to confirm the renamed columns
df.head(n=5)

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,First_Seen,Domain_Age,Domain_Velocity,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,2022-12-12T19:59:09-05:00,2006-10-18T16:05:22-04:00,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,2023-09-07T02:27:40-04:00,2020-01-09T07:03:56-05:00,high,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,2023-01-02T13:38:05-05:00,1994-07-14T00:00:00-04:00,medium,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,2023-09-07T02:27:40-04:00,1996-02-13T00:00:00-05:00,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,2023-09-07T02:27:40-04:00,1996-02-27T00:00:00-05:00,none,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com


#### 2. Email Validation

<u> **Criteria for Email to be Valid** </u>

If any one of the following criteria is not met, the email is marked as invalid.
1. Recent Abuse = FALSE
1. Valid = TRUE
1. Disposable = FALSE
1. Honeypot = FALSE
1. Spam Trap Score != high (compared against the lowercase value `high`)

In [112]:
def valid_email(x):
    """Classify one row as 'Valid' or 'Invalid'.

    Parameters
    ----------
    x : mapping-like row (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide the keys 'Recent_Abuse', 'Valid', 'Disposable',
        'Honeypot' and 'Spam_Trap_Score'.

    Returns
    -------
    str
        'Valid' when every criterion holds, otherwise 'Invalid'.
    """
    # One entry per criterion; each entry is truthy when the criterion is met.
    # `==` / `!=` (not `is`) so that non-Python booleans compare correctly.
    criteria = (
        x['Recent_Abuse'] == False,        # no recent abuse
        x['Valid'] == True,                # flagged as valid
        x['Disposable'] == False,          # not a disposable address
        x['Honeypot'] == False,            # not a honeypot
        x['Spam_Trap_Score'] != "high",    # spam trap score is not "high"
    )

    if all(criteria):
        return 'Valid'
    return 'Invalid'

In [113]:
# Classify every row; axis='columns' hands each row to valid_email in turn
df['Email_Validity'] = df.apply(valid_email, axis='columns')
df.head()

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Domain_Age,Domain_Velocity,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,2006-10-18T16:05:22-04:00,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,Valid
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,2020-01-09T07:03:56-05:00,high,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,Valid
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,1994-07-14T00:00:00-04:00,medium,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu,Valid
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,1996-02-13T00:00:00-05:00,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com,Invalid
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,1996-02-27T00:00:00-05:00,none,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com,Valid


#### 3. Invalid Reason

In [114]:
def invalid_reason(x):
    """List the criteria that a row failed.

    Parameters
    ----------
    x : mapping-like row (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide 'Email_Validity' plus the keys 'Recent_Abuse', 'Valid',
        'Disposable', 'Honeypot' and 'Spam_Trap_Score'.

    Returns
    -------
    str or list of str
        An empty string when 'Email_Validity' is "Valid"; otherwise a list
        with one label per failed criterion, in the order checked here.
    """
    # Rows already marked valid have nothing to report
    if x['Email_Validity'] == "Valid":
        return ""

    # (label, failed?) pairs, evaluated in reporting order
    checks = (
        ("Recent Abuse", x['Recent_Abuse'] != False),
        ("Invalid Email", x['Valid'] != True),
        ("Disposed Email", x['Disposable'] != False),
        ("Honeypot", x['Honeypot'] != False),
        ("High Spam Score", x['Spam_Trap_Score'] == "high"),
    )

    return [label for label, failed in checks if failed]

In [115]:
# Collect the failed criteria for every row; axis='columns' processes row by row
df["Invalid_Reasons"] = df.apply(invalid_reason, axis='columns')
df.head()

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Domain_Velocity,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity,Invalid_Reasons
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,Valid,
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,high,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,Valid,
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,medium,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu,Valid,
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,low,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com,Invalid,[Invalid Email]
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,none,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com,Valid,


In [116]:
# Count the failed criteria per row.
# Only list values hold reasons; valid rows carry a placeholder string
# ("" at first, "N/A" once the replacement cell further down has run).
# A plain len() would report 3 for "N/A" if this cell were re-run afterwards,
# so any non-list value is counted as 0.
df['No_of_Invalid_Reasons'] = df['Invalid_Reasons'].apply(
    lambda reasons: len(reasons) if isinstance(reasons, list) else 0
)
df.head()

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity,Invalid_Reasons,No_of_Invalid_Reasons
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,Valid,,0
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,Valid,,0
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu,Valid,,0
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com,Invalid,[Invalid Email],1
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com,Valid,,0


In [117]:
# invalid_reason() returns an empty string for valid rows and a list of labels
# for invalid ones; swap that empty-string placeholder for "N/A".
df['Invalid_Reasons'] = df["Invalid_Reasons"].replace("", "N/A")
# Display the resulting frame as the cell output
df

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity,Invalid_Reasons,No_of_Invalid_Reasons
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,Valid,,0
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,Valid,,0
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu,Valid,,0
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com,Invalid,[Invalid Email],1
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com,Valid,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5013,2023-09-07T02:35:03-04:00,False,0,True,False,high,False,Chloe,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,chloe.b@vendorpanel.com,chloe.b@vendorpanel.com,chloe.b@vendorpanel.com,Valid,,0
5014,2023-09-07T02:35:02-04:00,False,34,True,False,medium,False,Bill,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,bill@thebeacon.media,bill@thebeacon.media,bill@thebeacon.media,Valid,,0
5015,2023-09-07T02:35:03-04:00,False,0,True,False,high,False,Brian,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brian_beauchaine@tjx.com,brian_beauchaine@tjx.com,brian_beauchaine@tjx.com,Valid,,0
5016,2023-09-07T02:35:02-04:00,False,0,True,False,high,False,Stefan,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,stefan@thefirm-network.com,stefan@thefirm-network.com,stefan@thefirm-network.com,Valid,,0


#### 4. Split the data frame by email validity

In [118]:
# Subset of rows classified as valid (boolean mask on Email_Validity)
df_valid = df[df['Email_Validity'] == 'Valid']
df_valid.head()

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity,Invalid_Reasons,No_of_Invalid_Reasons
0,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,lbaker@flexcarestaff.com,Valid,,0
1,2023-09-07T02:27:41-04:00,False,0,True,False,high,False,Wytske,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,wytske.garty@trademe.co.nz,Valid,,0
2,2023-09-07T02:27:40-04:00,False,0,True,True,high,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,mdenning@ju.edu,mdenning@ju.edu,mdenning@ju.edu,Valid,,0
4,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Robertson,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,brobertson@royaloak.com,brobertson@royaloak.com,brobertson@royaloak.com,Valid,,0
5,2023-09-07T02:27:41-04:00,False,0,True,False,medium,False,Sullivan,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,erinsullivan@getbento.com,erinsullivan@getbento.com,erinsullivan@getbento.com,Valid,,0


In [119]:
# Subset of rows classified as invalid (boolean mask on Email_Validity)
df_invalid = df[df['Email_Validity'] == 'Invalid']
df_invalid.head()

Unnamed: 0,Date,Recent_Abuse,Fraud_Score,Valid,Common_Domain,Deliverability,Disposable,First_Name,Generic,Honeypot,...,Leaked,User_Activity,Associated_Phone_Numbers,Associated_Names,Email_Address,Original_Column_email,Original_Column_1,Email_Validity,Invalid_Reasons,No_of_Invalid_Reasons
3,2023-09-07T02:28:25-04:00,False,90,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,vgarcia@kingkullen.com,vgarcia@kingkullen.com,vgarcia@kingkullen.com,Invalid,[Invalid Email],1
9,2023-09-07T02:27:58-04:00,False,96,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,christina@jgraydesignstudio.com,christina@jgraydesignstudio.com,christina@jgraydesignstudio.com,Invalid,[Invalid Email],1
11,2023-09-07T02:27:47-04:00,False,0,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,adykes@lyft.com,adykes@lyft.com,adykes@lyft.com,Invalid,[Invalid Email],1
13,2023-09-07T02:27:47-04:00,False,0,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,rmuething@altiuss.com,rmuething@altiuss.com,rmuething@altiuss.com,Invalid,[Invalid Email],1
27,2023-09-07T02:28:01-04:00,False,0,False,False,low,False,Unknown,False,False,...,False,Enterprise L4+ required.,Enterprise Plus or higher required.,Enterprise Plus or higher required.,guy.frenette@hatch.com,guy.frenette@hatch.com,guy.frenette@hatch.com,Invalid,[Invalid Email],1


#### 5. Save in Excel

In [120]:
# Output workbook name: "<input base name>_Output" -- update if needed
new_File = excel_file + "_Output"

# Write the three frames to separate sheets of one workbook.
# The context manager saves and closes the workbook on exit, including when
# one of the to_excel calls raises, so the file handle is never left open.
with pd.ExcelWriter(new_File + ".xlsx") as writer:
    df.to_excel(writer, sheet_name= "All Contacts")
    df_valid.to_excel(writer, sheet_name= "Contacts with Valid Email")
    df_invalid.to_excel(writer, sheet_name= "Contacts with Invalid Email")