In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import os
from pyspark.sql.functions import expr,lit,split
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
import re
# from email_validator import validate_email

In [0]:
 # Explicit schema for the employee CSV: two integer keys and three string
 # columns, every field nullable.
 new_schema = StructType([
     StructField("Emp_ID", IntegerType(), True),
     StructField("Name", StringType(), True),
     StructField("Role", StringType(), True),
     StructField("Dept_No", IntegerType(), True),
     StructField("Email_ID", StringType(), True),
 ])

In [0]:
# Location of the input CSV on DBFS.
filepath = "dbfs:/FileStore/shared_uploads/27prachisingh@gmail.com/Junk_Email.csv"
# Schema-inferring alternative, kept for reference:
# df1 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(filepath)
filename = os.path.basename(filepath)  # last path component; not used in the cells shown

# The first line of the file is treated as a header row, and column types come
# from new_schema rather than from inference.
df1 = (
    spark.read.format("csv")
    .schema(new_schema)
    .option("header", True)
    .load(filepath)
)

In [0]:
df1.show()

+------+-------------+--------+-------+--------------------+
|Emp_ID|         Name|    Role|Dept_No|            Email_ID|
+------+-------------+--------+-------+--------------------+
|   901|    Harry Ram| Analyst|     10| harry.ram@gmail.com|
|   789|Shyam Chouhan| Manager|     20|shyam.chouhan@hot...|
|   223|   Alek theus| Analyst|     10|           Alek Alek|
|   434|     Lucy Ien|Salesman|     20|   lucy.len@yahoo.in|
|   232|  Marie Frank| Analyst|     20|               Marie|
+------+-------------+--------+-------+--------------------+



In [0]:
df1.printSchema()

root
 |-- Emp_ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Role: string (nullable = true)
 |-- Dept_No: integer (nullable = true)
 |-- Email_ID: string (nullable = true)



In [0]:
# Spark SQL expression that pulls every e-mail-shaped substring out of a column
# named `concat`.
# A raw string is used so Python passes the backslashes through untouched:
# the previous non-raw literal relied on the invalid escape sequences `\w` and
# `\.`, which newer Python versions warn about. The text handed to Spark is
# unchanged (`\\w`, `\\.`), which Spark's SQL parser then unescapes to `\w`, `\.`.
# NOTE(review): the DataFrame schema shown in this notebook has no `concat`
# column and this expression is not applied in the cells shown - confirm
# whether it is still needed.
extract_expr = expr(r"regexp_extract_all(concat, '(\\w+([\\.-]?\\w+)*@\\w+([\\.-]?\\w+)*(\\.\\w{2,3})+)', 0)")

In [0]:
# Pattern used with re.fullmatch to decide whether a string is a well-formed
# e-mail address: local part, "@", domain labels, then a top-level domain of at
# least two ASCII letters.
# The top-level-domain class was previously `[A-Z|a-z]`; inside a character
# class `|` is a literal pipe, so strings such as "user@example.c|m" were
# accepted. It is now letters only.
regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

In [0]:
# Start every row with the default flag "Invalid"; the row-level check further
# down switches it to "Valid" where the address matches.
df1 = df1.withColumn(
    "Email_Validity",
    lit("Invalid"),
)

In [0]:
display(df1)

Emp_ID,Name,Role,Dept_No,Email_ID,Email_Validity
901,Harry Ram,Analyst,10,harry.ram@gmail.com,Invalid
789,Shyam Chouhan,Manager,20,shyam.chouhan@hotmail.com,Invalid
223,Alek theus,Analyst,10,Alek Alek,Invalid
434,Lucy Ien,Salesman,20,lucy.len@yahoo.in,Invalid
232,Marie Frank,Analyst,20,Marie,Invalid


In [0]:
def check(email):
    """Print whether ``email`` fully matches the module-level ``regex`` pattern.

    Args:
        email: Candidate address. A value that is not a ``str`` (for example
            ``None``) is reported as invalid rather than letting
            ``re.fullmatch`` raise ``TypeError``.

    Returns:
        None. Prints "Valid Email" or "Invalid Email" to standard output.
    """
    # Guard first: re.fullmatch only accepts str for a str pattern.
    is_valid = isinstance(email, str) and re.fullmatch(regex, email) is not None
    print("Valid Email" if is_valid else "Invalid Email")

In [0]:
# Quick manual check with a well-formed address; this prints "Valid Email".
check("uuu@gmail.com")

Valid Email


In [0]:
def func1(y):
    """Re-evaluate the e-mail validity flag for a single row.

    Args:
        y: Row-like object exposing the attributes ``Emp_ID``, ``Name``,
            ``Role``, ``Dept_No``, ``Email_ID`` and ``Email_Validity``.

    Returns:
        tuple: ``(Emp_ID, Name, Role, Dept_No, Email_ID, Email_Validity)``.
        The flag is ``"Valid"`` when ``Email_ID`` fully matches the
        module-level ``regex``; otherwise the incoming ``Email_Validity``
        value is passed through unchanged.
    """
    email = y.Email_ID
    # Email_ID is declared nullable in the schema, and re.fullmatch raises
    # TypeError on None, which would fail the whole map job. Treat any
    # non-string value as "no match" so the row keeps its incoming flag.
    matches = isinstance(email, str) and re.fullmatch(regex, email) is not None
    validity = "Valid" if matches else y.Email_Validity
    return (y.Emp_ID, y.Name, y.Role, y.Dept_No, email, validity)

In [0]:
# Apply the row-level validity check to every row of df1; each element of the
# resulting RDD is the tuple returned by func1.
rdd2 = df1.rdd.map(func1)

In [0]:
# Bring the mapped tuples back to the driver to inspect them (the sample shown
# here is five rows).
rdd2.collect()

Out[19]: [(901, 'Harry Ram', 'Analyst', 10, 'harry.ram@gmail.com', 'Valid'),
 (789, 'Shyam Chouhan', 'Manager', 20, 'shyam.chouhan@hotmail.com', 'Valid'),
 (223, 'Alek theus', 'Analyst', 10, 'Alek Alek', 'Invalid'),
 (434, 'Lucy Ien', 'Salesman', 20, 'lucy.len@yahoo.in', 'Valid'),
 (232, 'Marie Frank', 'Analyst', 20, 'Marie', 'Invalid')]

In [0]:
# Turn the mapped tuples back into a DataFrame; the names are assigned to the
# tuple positions in order.
df2 = rdd2.toDF(
    ["Emp_ID", "Name", "Role", "Dept_No", "Email_ID", "Email_Validity"]
)

In [0]:
display(df2)

Emp_ID,Name,Role,Dept_No,Email_ID,Email_Validity
901,Harry Ram,Analyst,10,harry.ram@gmail.com,Valid
789,Shyam Chouhan,Manager,20,shyam.chouhan@hotmail.com,Valid
223,Alek theus,Analyst,10,Alek Alek,Invalid
434,Lucy Ien,Salesman,20,lucy.len@yahoo.in,Valid
232,Marie Frank,Analyst,20,Marie,Invalid


In [0]:
# Domain_Name is element 1 of the address split on "@"; it comes out null for
# addresses without an "@" (see the "Alek Alek" and "Marie" rows).
df2 = df2.withColumn(
    'Domain_Name',
    split(df2['Email_ID'], '@').getItem(1),
)

In [0]:
df2.show()

+------+-------------+--------+-------+--------------------+--------------+-----------+
|Emp_ID|         Name|    Role|Dept_No|            Email_ID|Email_Validity|Domain_Name|
+------+-------------+--------+-------+--------------------+--------------+-----------+
|   901|    Harry Ram| Analyst|     10| harry.ram@gmail.com|         Valid|  gmail.com|
|   789|Shyam Chouhan| Manager|     20|shyam.chouhan@hot...|         Valid|hotmail.com|
|   223|   Alek theus| Analyst|     10|           Alek Alek|       Invalid|       null|
|   434|     Lucy Ien|Salesman|     20|   lucy.len@yahoo.in|         Valid|   yahoo.in|
|   232|  Marie Frank| Analyst|     20|               Marie|       Invalid|       null|
+------+-------------+--------+-------+--------------------+--------------+-----------+



In [0]:
display(df2)

Emp_ID,Name,Role,Dept_No,Email_ID,Email_Validity,Domain_Name
901,Harry Ram,Analyst,10,harry.ram@gmail.com,Valid,gmail.com
789,Shyam Chouhan,Manager,20,shyam.chouhan@hotmail.com,Valid,hotmail.com
223,Alek theus,Analyst,10,Alek Alek,Invalid,
434,Lucy Ien,Salesman,20,lucy.len@yahoo.in,Valid,yahoo.in
232,Marie Frank,Analyst,20,Marie,Invalid,
