In [1]:
import pyspark.sql.types as t
import pyspark.sql.functions as f

from pyspark.sql import SparkSession

from model_customer import ModelCustomer

In [2]:
# Notebook-wide Spark entry point. getOrCreate() returns the session that is
# already active, if any, and otherwise starts a new one; no builder options
# are set here, so a new session uses Spark's defaults.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Ask the project helper for personal-info records (argument: 50).
# NOTE(review): presumably 50 is the number of records to generate -- the
# validated frame derived from this data counts 50 rows; confirm in ModelCustomer.
personal_info_list = ModelCustomer.generate_personal_info(50)

# Column layout of the raw customer frame as (column name, Spark type) pairs.
# Order matters: it must line up with the field order of each generated record.
_customer_fields = [
    ("name", t.StringType()),
    ("lastname", t.StringType()),
    ("email", t.StringType()),
    ("phone", t.StringType()),
    ("document", t.StringType()),
    ("age", t.IntegerType()),
    ("height", t.IntegerType()),
    ("weight", t.IntegerType()),
    ("country", t.StringType()),
]

# Every field keeps StructField's default nullability.
schema = t.StructType(
    [t.StructField(field_name, field_type) for field_name, field_type in _customer_fields]
)

# Build the raw frame with the explicit schema instead of relying on inference.
df_customers = spark.createDataFrame(personal_info_list, schema)

In [4]:
# Preview the first 10 raw customers. show() truncates long values by default,
# which is why e-mail addresses appear cut off with "..." in the output.
df_customers.show(n=10)

+----------+---------+--------------------+-------------------+--------------+---+------+------+-------+
|      name| lastname|               email|              phone|      document|age|height|weight|country|
+----------+---------+--------------------+-------------------+--------------+---+------+------+-------+
|Davi Lucca|    Nunes|  rjesus@example.org|   +55 41 3808-6217|981.642.735-46| 63|   176|   121|  pt_BR|
|       Luc|   Gallet|genevieve05@examp...|  +33 4 57 92 27 66|     404197030| 61|   153|    61|  fr_FR|
|    Murilo|  Freitas|isabel22@example.net|    (081) 9609 3348|308.912.467-03| 24|   176|   144|  pt_BR|
|    Daniel|     Pace|carolinebryant@ex...|    +1-279-868-5462|   841-77-7816| 50|   208|    97|  en_US|
|Frédérique|    Leduc|torresrenee@examp...|  +33 3 55 41 02 20|     134032115| 70|   216|    97|  fr_FR|
|     Marie| Delattre|    jroy@example.net|         0245388239|     802863641| 33|   209|    61|  fr_FR|
|      Inès|    Lecoq|valentinadele@exa...|         044

In [5]:
# Hand the raw frame to the project's loader.
# NOTE(review): presumably this validates each customer and returns one record
# per row -- the resulting frame gains check_valid_* and type_document columns;
# confirm against ModelCustomer.load_customers.
list_validated_customers = ModelCustomer.load_customers(df_customers)

# Lift the returned records back into Spark. No schema is supplied, so column
# names and types are inferred from the records themselves.
df_validated_customers = spark.createDataFrame(data=list_validated_customers)

In [6]:
# List the columns of the validated frame. Compared with the raw schema it adds
# check_valid_document, check_valid_email, check_valid_phone and type_document.
df_validated_customers.columns

['age',
 'check_valid_document',
 'check_valid_email',
 'check_valid_phone',
 'country',
 'document',
 'email',
 'height',
 'lastname',
 'name',
 'phone',
 'type_document',
 'weight']

In [7]:
# Row count of the validated frame; the valid/invalid splits below are taken
# from this frame, so their counts can be checked against this total.
df_validated_customers.count()

50

In [8]:
# Check customers with an invalid phone number

In [9]:
# Keep only the customers whose phone failed validation.
# check_valid_phone is rendered as true/false by show(), i.e. a boolean column,
# so it is negated with `~` rather than compared to the Python literal False.
# Rows where the flag is null are dropped by filter() in either form.
df_invalid_phone_customers = df_validated_customers.filter(~f.col('check_valid_phone'))

# Last expression of the cell: number of customers with an invalid phone.
df_invalid_phone_customers.count()

14

In [10]:
# Show the phone, its country and the validation flag for the rejected rows.
# show() prints at most 20 rows by default.
(
    df_invalid_phone_customers
    .select('phone', 'country', 'check_valid_phone')
    .show()
)

+------------------+-------+-----------------+
|             phone|country|check_valid_phone|
+------------------+-------+-----------------+
|   195463874555244|  en_US|            false|
|001896569054158140|  en_US|            false|
|    14561650567887|  en_US|            false|
|      330479183488|  fr_FR|            false|
|    17673041237981|  en_US|            false|
|      330533937638|  fr_FR|            false|
|   161781431829427|  en_US|            false|
|    39337042788216|  en_US|            false|
|      330470828050|  fr_FR|            false|
|   231823078440572|  en_US|            false|
|  0010568659246830|  en_US|            false|
|    51099171503352|  en_US|            false|
|     0356419267095|  en_US|            false|
|    12835249462137|  en_US|            false|
+------------------+-------+-----------------+



In [11]:
# Check customers with a valid phone number

In [12]:
# Keep only the customers whose phone passed validation.
# check_valid_phone is rendered as true/false by show(), i.e. a boolean column,
# so it is used directly as the filter predicate; comparing it to the Python
# literal True is redundant. Null flags are dropped by filter() in either form.
df_valid_phone_customers = df_validated_customers.filter(f.col('check_valid_phone'))

# Last expression of the cell: number of customers with a valid phone.
df_valid_phone_customers.count()

36

In [13]:
# Show the phone, its country and the validation flag for the accepted rows.
# show() prints at most 20 rows by default.
(
    df_valid_phone_customers
    .select('phone', 'country', 'check_valid_phone')
    .show()
)

+-------------+-------+-----------------+
|        phone|country|check_valid_phone|
+-------------+-------+-----------------+
| 554138086217|  pt_BR|             true|
|  33457922766|  fr_FR|             true|
|  08196093348|  pt_BR|             true|
|  12798685462|  en_US|             true|
|  33355410220|  fr_FR|             true|
|   0245388239|  fr_FR|             true|
|   0443532223|  fr_FR|             true|
|   7749081255|  en_US|             true|
|5508174336369|  pt_BR|             true|
|   8872238720|  en_US|             true|
|  09007586811|  pt_BR|             true|
|   1343891884|  en_US|             true|
|   0456478021|  fr_FR|             true|
|   0272442722|  fr_FR|             true|
|   0367089564|  fr_FR|             true|
|   1176757842|  pt_BR|             true|
|5501188728165|  pt_BR|             true|
|  33559730301|  fr_FR|             true|
|   6110409161|  pt_BR|             true|
|   0143413621|  fr_FR|             true|
+-------------+-------+-----------

In [14]:
# Check customers with an invalid document

In [15]:
# Keep only the customers whose document failed validation.
# check_valid_document is rendered as true/false by show(), i.e. a boolean
# column, so it is negated with `~` rather than compared to the Python literal
# False. Rows where the flag is null are dropped by filter() in either form.
df_invalid_document_customers = df_validated_customers.filter(~f.col('check_valid_document'))

# Last expression of the cell: number of customers with an invalid document.
df_invalid_document_customers.count()

36

In [16]:
# Show the document, its detected type, the validation flag and the country
# for the rejected rows. show() prints at most 20 rows by default.
(
    df_invalid_document_customers
    .select('document', 'type_document', 'check_valid_document', 'country')
    .show()
)

+---------+-------------+--------------------+-------+
| document|type_document|check_valid_document|country|
+---------+-------------+--------------------+-------+
|404197030|         null|               false|  fr_FR|
|841777816|         null|               false|  en_US|
|134032115|         null|               false|  fr_FR|
|802863641|         null|               false|  fr_FR|
|599522059|         null|               false|  fr_FR|
|064226077|         null|               false|  en_US|
|667527674|         null|               false|  en_US|
|303668465|         null|               false|  en_US|
|584312212|         null|               false|  en_US|
|597942358|         null|               false|  fr_FR|
|180036037|         null|               false|  fr_FR|
|211244814|         null|               false|  fr_FR|
|459339495|         null|               false|  en_US|
|369167099|         null|               false|  en_US|
|750570392|         null|               false|  fr_FR|
|546392201

In [17]:
# Check customers with a valid document

In [18]:
# Keep only the customers whose document passed validation.
# check_valid_document is rendered as true/false by show(), i.e. a boolean
# column, so it is used directly as the filter predicate; comparing it to the
# Python literal True is redundant. Null flags are dropped by filter() either way.
df_valid_document_customers = df_validated_customers.filter(f.col('check_valid_document'))

# Last expression of the cell: number of customers with a valid document.
df_valid_document_customers.count()

14

In [19]:
# Show the document, its detected type, the validation flag and the country
# for the accepted rows. show() prints at most 20 rows by default.
(
    df_valid_document_customers
    .select('document', 'type_document', 'check_valid_document', 'country')
    .show()
)

+--------------+-------------+--------------------+-------+
|      document|type_document|check_valid_document|country|
+--------------+-------------+--------------------+-------+
|   98164273546|          CPF|                true|  pt_BR|
|   30891246703|          CPF|                true|  pt_BR|
|   42538709610|          CPF|                true|  pt_BR|
|21568409000189|         CNPJ|                true|  pt_BR|
|17306482000105|         CNPJ|                true|  pt_BR|
|43175928000110|         CNPJ|                true|  pt_BR|
|76389401000147|         CNPJ|                true|  pt_BR|
|31809246000132|         CNPJ|                true|  pt_BR|
|34691850000103|         CNPJ|                true|  pt_BR|
|   25013789621|          CPF|                true|  pt_BR|
|   13604798566|          CPF|                true|  pt_BR|
|   41729503616|          CPF|                true|  pt_BR|
|   54319082605|          CPF|                true|  pt_BR|
|   40751623970|          CPF|          