In [0]:
profile = spark.read.format("csv").option("header","true").option("InferSchema","true")\
    .load("/FileStore/tables/Sample_Data___Customer_Profile-1.csv")
profile.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- citizenshipCode: string (nullable = true)
 |-- currCountryCode: string (nullable = true)
 |-- employmentStatus: string (nullable = true)
 |-- incomeInThousands: string (nullable = true)
 |-- marketSegment: string (nullable = true)
 |-- maritalStatus: string (nullable = true)
 |-- stateCode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- accountOpeningDate: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
print("No. of rows :", profile.count())
print("No. of columns :" ,len(profile.columns))

No. of rows : 1000
No. of columns : 13


In [0]:
profile.select("customerNumber","dateOfBirth","citizenshipCode","currCountryCode","employmentStatus").show(5,False)

+------------------------------------+-----------+---------------+---------------+----------------+
|customerNumber                      |dateOfBirth|citizenshipCode|currCountryCode|employmentStatus|
+------------------------------------+-----------+---------------+---------------+----------------+
|b90ec527-d54c-4546-8872-81f90f9ed33c|27-11-1995 |US             |USD            |worker          |
|f8162ec9-2f9d-4412-8c22-3c66b2f46a8f|03-05-2002 |US             |USD            |self-employed   |
|8c50fecc-abef-43f9-bfe1-8cc3c5c01a54|21-07-1994 |US             |USD            |employee        |
|5afb4208-19a0-4ef0-a12b-fb430d83af2e|14-03-1996 |US             |USD            |employee        |
|3fed47d1-606c-4fb5-a35d-98a100eabffb|02-08-1995 |US             |USD            |worker          |
+------------------------------------+-----------+---------------+---------------+----------------+
only showing top 5 rows



In [0]:
profile.select("incomeInThousands","marketSegment","maritalStatus","stateCode").show(5,False)

+-----------------+-------------+-------------+---------+
|incomeInThousands|marketSegment|maritalStatus|stateCode|
+-----------------+-------------+-------------+---------+
|$200685.86       |standard     |Married      |FL       |
|$121102.59       |basic        |Married      |CT       |
|$234625.69       |standard     |Widowed      |NY       |
|$152978.83       |HNI          |Divorced     |LA       |
|$168933.21       |standard     |Widowed      |FL       |
+-----------------+-------------+-------------+---------+
only showing top 5 rows



In [0]:
profile.select("city","country","accountOpeningDate","gender").show(5,False)

+-------------+-------------+------------------+------+
|city         |country      |accountOpeningDate|gender|
+-------------+-------------+------------------+------+
|Zephyrhills  |United States|06-09-2021        |Male  |
|Waterbury    |United States|1/30/2021         |Female|
|Staten Island|United States|6/23/2021         |Female|
|Baton Rouge  |United States|04-05-2021        |Male  |
|Miami        |United States|06-08-2021        |Female|
+-------------+-------------+------------------+------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import isnan, when, count, col
profile.select([count(when(col(c).isNull(), c)).alias(c) for c in profile.columns]).show()

+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|customerNumber|dateOfBirth|citizenshipCode|currCountryCode|employmentStatus|incomeInThousands|marketSegment|maritalStatus|stateCode|city|country|accountOpeningDate|gender|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|             0|          0|             22|             22|              59|               46|           12|            0|        0|   0|      0|                 0|     0|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+



In [0]:
profile = profile.na.drop()

In [0]:
print("No. of rows :", profile.count())
print("No. of columns :" ,len(profile.columns))

No. of rows : 864
No. of columns : 13


In [0]:
profile = profile.dropDuplicates()

In [0]:
print("No. of rows :", profile.count())
print("No. of columns :" ,len(profile.columns))

No. of rows : 864
No. of columns : 13


In [0]:
from pyspark.sql.functions import isnan, when, count, col
profile.select([count(when(col(c).isNull(), c)).alias(c) for c in profile.columns]).show()

+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|customerNumber|dateOfBirth|citizenshipCode|currCountryCode|employmentStatus|incomeInThousands|marketSegment|maritalStatus|stateCode|city|country|accountOpeningDate|gender|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|             0|          0|              0|              0|               0|                0|            0|            0|        0|   0|      0|                 0|     0|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+



In [0]:
profile = profile.withColumn("incomeInThousands", profile.incomeInThousands.substr(2,10))

In [0]:
profile.select("incomeInThousands").show(5,False)

+-----------------+
|incomeInThousands|
+-----------------+
|121102.59        |
|234625.69        |
|214317.71        |
|200685.86        |
|152978.83        |
+-----------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import regexp_replace,col
profile = profile.withColumn('accountOpeningDate', regexp_replace(col('accountOpeningDate'), "/", "-"))

In [0]:
profile.select("accountOpeningDate").show(5,False)

+------------------+
|accountOpeningDate|
+------------------+
|1-30-2021         |
|6-23-2021         |
|1-20-2021         |
|06-09-2021        |
|04-05-2021        |
+------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.types import DateType
profile = profile.withColumn("accountOpeningDate",
profile["accountOpeningDate"].cast(DateType()))

In [0]:
from pyspark.sql.types import DateType
profile = profile.withColumn("dateOfBirth",
profile["dateOfBirth"].cast(DateType()))

In [0]:
profile.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- dateOfBirth: date (nullable = true)
 |-- citizenshipCode: string (nullable = true)
 |-- currCountryCode: string (nullable = true)
 |-- employmentStatus: string (nullable = true)
 |-- incomeInThousands: string (nullable = true)
 |-- marketSegment: string (nullable = true)
 |-- maritalStatus: string (nullable = true)
 |-- stateCode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- accountOpeningDate: date (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
from pyspark.sql.types import FloatType
profile = profile.withColumn("incomeInThousands",
profile["incomeInThousands"].cast(FloatType()))

In [0]:
profile.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- dateOfBirth: date (nullable = true)
 |-- citizenshipCode: string (nullable = true)
 |-- currCountryCode: string (nullable = true)
 |-- employmentStatus: string (nullable = true)
 |-- incomeInThousands: float (nullable = true)
 |-- marketSegment: string (nullable = true)
 |-- maritalStatus: string (nullable = true)
 |-- stateCode: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- accountOpeningDate: date (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
profile.describe(['customerNumber','dateOfBirth','citizenshipCode','currCountryCode','employmentStatus']).show()

+-------+--------------------+---------------+---------------+----------------+
|summary|      customerNumber|citizenshipCode|currCountryCode|employmentStatus|
+-------+--------------------+---------------+---------------+----------------+
|  count|                 864|            864|            864|             864|
|   mean|                null|           null|           null|            null|
| stddev|                null|           null|           null|            null|
|    min|002ecc96-e757-499...|             US|            USD|        employee|
|    max|ffe45544-c3c2-4c9...|             US|            USD|          worker|
+-------+--------------------+---------------+---------------+----------------+



In [0]:
profile.describe(['incomeInThousands','marketSegment','maritalStatus','stateCode']).show()

+-------+------------------+-------------+-------------+---------+
|summary| incomeInThousands|marketSegment|maritalStatus|stateCode|
+-------+------------------+-------------+-------------+---------+
|  count|               864|          864|          864|      864|
|   mean|250903.15051721645|         null|         null|     null|
| stddev| 88474.80747323314|         null|         null|     null|
|    min|         100582.19|          HNI|     Divorced|       AL|
|    max|         399981.88|     standard|      Widowed|       WV|
+-------+------------------+-------------+-------------+---------+



In [0]:
profile.describe(['city','country','accountOpeningDate','gender']).show()

+-------+-----------+-------------+------+
|summary|       city|      country|gender|
+-------+-----------+-------------+------+
|  count|        864|          864|   864|
|   mean|       null|         null|  null|
| stddev|       null|         null|  null|
|    min|      Akron|United States|Female|
|    max|Zephyrhills|United States|  Male|
+-------+-----------+-------------+------+



In [0]:
profile.select("marketSegment").distinct().collect()

Out[75]: [Row(marketSegment='basic'),
 Row(marketSegment='standard'),
 Row(marketSegment='HNI')]

In [0]:
profile.select("maritalStatus").distinct().collect()

Out[76]: [Row(maritalStatus='Separated'),
 Row(maritalStatus='Married'),
 Row(maritalStatus='Divorced'),
 Row(maritalStatus='Widowed'),
 Row(maritalStatus='Single')]

In [0]:
profile.select("employmentStatus").distinct().collect()

Out[77]: [Row(employmentStatus='self-employed'),
 Row(employmentStatus='employee'),
 Row(employmentStatus='worker')]

In [0]:
from pyspark.sql.functions import countDistinct
profile.select(countDistinct("city").alias("No. of Cities")).show()

+-------------+
|No. of Cities|
+-------------+
|          293|
+-------------+



In [0]:
profile.select("city").distinct().collect()

Out[79]: [Row(city='Worcester'),
 Row(city='Tyler'),
 Row(city='Springfield'),
 Row(city='Charleston'),
 Row(city='Tempe'),
 Row(city='Harrisburg'),
 Row(city='Phoenix'),
 Row(city='Ocala'),
 Row(city='Hollywood'),
 Row(city='Levittown'),
 Row(city='Savannah'),
 Row(city='Omaha'),
 Row(city='Fort Pierce'),
 Row(city='Fort Collins'),
 Row(city='Ashburn'),
 Row(city='Anaheim'),
 Row(city='Everett'),
 Row(city='Greensboro'),
 Row(city='Valdosta'),
 Row(city='Chattanooga'),
 Row(city='Dallas'),
 Row(city='Oakland'),
 Row(city='Laredo'),
 Row(city='Manchester'),
 Row(city='Spring Hill'),
 Row(city='Naperville'),
 Row(city='Scottsdale'),
 Row(city='Trenton'),
 Row(city='Largo'),
 Row(city='San Antonio'),
 Row(city='Bakersfield'),
 Row(city='Chico'),
 Row(city='Beaufort'),
 Row(city='Raleigh'),
 Row(city='Alexandria'),
 Row(city='Chula Vista'),
 Row(city='Philadelphia'),
 Row(city='Louisville'),
 Row(city='Dayton'),
 Row(city='Orange'),
 Row(city='Dearborn'),
 Row(city='Brooksville'),
 Row(ci

In [0]:
display(profile)

customerNumber,dateOfBirth,citizenshipCode,currCountryCode,employmentStatus,incomeInThousands,marketSegment,maritalStatus,stateCode,city,country,accountOpeningDate,gender
c152ad3d-9697-42b9-a0fd-aa9a959da1ef,,US,USD,employee,111561.99,basic,Single,GA,Atlanta,United States,,Male
04c71251-8951-4c0b-be57-9ea3db2f7ea5,,US,USD,employee,130302.69,HNI,Separated,OH,Canton,United States,,Male
033e7be1-b661-44a8-8e30-4668a0445dc8,,US,USD,self-employed,371662.97,HNI,Widowed,VA,Virginia Beach,United States,,Male
38b4ef58-d733-45d1-a54c-6a28c81d1313,,US,USD,self-employed,324471.7,basic,Separated,NC,Greensboro,United States,,Male
75fc1270-fded-4906-a31c-85fe1293345d,,US,USD,worker,137660.56,HNI,Married,TX,Corpus Christi,United States,,Male
c0175246-0ac0-4b1e-a8b6-76b2a01a44c2,,US,USD,self-employed,308475.38,HNI,Married,OK,Edmond,United States,,Female
854cda68-13bc-474f-aa47-fb31dde5fe0e,,US,USD,worker,372746.75,HNI,Married,CA,Santa Monica,United States,,Female
999bcdc5-c9e0-406f-b457-1c0a054443fb,,US,USD,employee,399981.88,HNI,Separated,CA,Inglewood,United States,,Male
aa9ea39c-c871-4f30-84ee-31b855787d23,,US,USD,self-employed,149885.4,basic,Separated,TN,Chattanooga,United States,,Male
737b8d0c-ffe9-483c-8ce9-80640ea3b667,,US,USD,self-employed,266811.75,basic,Widowed,TX,Austin,United States,,Male


In [0]:
display(profile)

customerNumber,dateOfBirth,citizenshipCode,currCountryCode,employmentStatus,incomeInThousands,marketSegment,maritalStatus,stateCode,city,country,accountOpeningDate,gender
c152ad3d-9697-42b9-a0fd-aa9a959da1ef,,US,USD,employee,111561.99,basic,Single,GA,Atlanta,United States,,Male
04c71251-8951-4c0b-be57-9ea3db2f7ea5,,US,USD,employee,130302.69,HNI,Separated,OH,Canton,United States,,Male
033e7be1-b661-44a8-8e30-4668a0445dc8,,US,USD,self-employed,371662.97,HNI,Widowed,VA,Virginia Beach,United States,,Male
38b4ef58-d733-45d1-a54c-6a28c81d1313,,US,USD,self-employed,324471.7,basic,Separated,NC,Greensboro,United States,,Male
75fc1270-fded-4906-a31c-85fe1293345d,,US,USD,worker,137660.56,HNI,Married,TX,Corpus Christi,United States,,Male
c0175246-0ac0-4b1e-a8b6-76b2a01a44c2,,US,USD,self-employed,308475.38,HNI,Married,OK,Edmond,United States,,Female
854cda68-13bc-474f-aa47-fb31dde5fe0e,,US,USD,worker,372746.75,HNI,Married,CA,Santa Monica,United States,,Female
999bcdc5-c9e0-406f-b457-1c0a054443fb,,US,USD,employee,399981.88,HNI,Separated,CA,Inglewood,United States,,Male
aa9ea39c-c871-4f30-84ee-31b855787d23,,US,USD,self-employed,149885.4,basic,Separated,TN,Chattanooga,United States,,Male
737b8d0c-ffe9-483c-8ce9-80640ea3b667,,US,USD,self-employed,266811.75,basic,Widowed,TX,Austin,United States,,Male


In [0]:
display(profile)

customerNumber,dateOfBirth,citizenshipCode,currCountryCode,employmentStatus,incomeInThousands,marketSegment,maritalStatus,stateCode,city,country,accountOpeningDate,gender
c152ad3d-9697-42b9-a0fd-aa9a959da1ef,,US,USD,employee,111561.99,basic,Single,GA,Atlanta,United States,,Male
04c71251-8951-4c0b-be57-9ea3db2f7ea5,,US,USD,employee,130302.69,HNI,Separated,OH,Canton,United States,,Male
033e7be1-b661-44a8-8e30-4668a0445dc8,,US,USD,self-employed,371662.97,HNI,Widowed,VA,Virginia Beach,United States,,Male
38b4ef58-d733-45d1-a54c-6a28c81d1313,,US,USD,self-employed,324471.7,basic,Separated,NC,Greensboro,United States,,Male
75fc1270-fded-4906-a31c-85fe1293345d,,US,USD,worker,137660.56,HNI,Married,TX,Corpus Christi,United States,,Male
c0175246-0ac0-4b1e-a8b6-76b2a01a44c2,,US,USD,self-employed,308475.38,HNI,Married,OK,Edmond,United States,,Female
854cda68-13bc-474f-aa47-fb31dde5fe0e,,US,USD,worker,372746.75,HNI,Married,CA,Santa Monica,United States,,Female
999bcdc5-c9e0-406f-b457-1c0a054443fb,,US,USD,employee,399981.88,HNI,Separated,CA,Inglewood,United States,,Male
aa9ea39c-c871-4f30-84ee-31b855787d23,,US,USD,self-employed,149885.4,basic,Separated,TN,Chattanooga,United States,,Male
737b8d0c-ffe9-483c-8ce9-80640ea3b667,,US,USD,self-employed,266811.75,basic,Widowed,TX,Austin,United States,,Male


In [0]:
display(profile)

customerNumber,dateOfBirth,citizenshipCode,currCountryCode,employmentStatus,incomeInThousands,marketSegment,maritalStatus,stateCode,city,country,accountOpeningDate,gender
c152ad3d-9697-42b9-a0fd-aa9a959da1ef,,US,USD,employee,111561.99,basic,Single,GA,Atlanta,United States,,Male
04c71251-8951-4c0b-be57-9ea3db2f7ea5,,US,USD,employee,130302.69,HNI,Separated,OH,Canton,United States,,Male
033e7be1-b661-44a8-8e30-4668a0445dc8,,US,USD,self-employed,371662.97,HNI,Widowed,VA,Virginia Beach,United States,,Male
38b4ef58-d733-45d1-a54c-6a28c81d1313,,US,USD,self-employed,324471.7,basic,Separated,NC,Greensboro,United States,,Male
75fc1270-fded-4906-a31c-85fe1293345d,,US,USD,worker,137660.56,HNI,Married,TX,Corpus Christi,United States,,Male
c0175246-0ac0-4b1e-a8b6-76b2a01a44c2,,US,USD,self-employed,308475.38,HNI,Married,OK,Edmond,United States,,Female
854cda68-13bc-474f-aa47-fb31dde5fe0e,,US,USD,worker,372746.75,HNI,Married,CA,Santa Monica,United States,,Female
999bcdc5-c9e0-406f-b457-1c0a054443fb,,US,USD,employee,399981.88,HNI,Separated,CA,Inglewood,United States,,Male
aa9ea39c-c871-4f30-84ee-31b855787d23,,US,USD,self-employed,149885.4,basic,Separated,TN,Chattanooga,United States,,Male
737b8d0c-ffe9-483c-8ce9-80640ea3b667,,US,USD,self-employed,266811.75,basic,Widowed,TX,Austin,United States,,Male


In [0]:
Q1 = 170
Q2 = 250
Q3 = 330
IQR = Q3 - Q1
print("Inter Quartile Range (IQR) :",IQR)

Inter Quartile Range (IQR) : 160


In [0]:
from pyspark.sql import functions as F
profile = profile.withColumn("incomeInThousands", F.when(F.col("incomeInThousands") < 70000, 250000).otherwise(F.col("incomeInThousands")))
profile = profile.withColumn("incomeInThousands", F.when(F.col("incomeInThousands") > 570000, 250000).otherwise(F.col("incomeInThousands")))

In [0]:
profile.select("incomeInThousands").show(5,False)

+-----------------+
|incomeInThousands|
+-----------------+
|111561.99        |
|130302.69        |
|371662.97        |
|324471.7         |
|137660.56        |
+-----------------+
only showing top 5 rows



In [0]:
profile.where((col("incomeInThousands") < 70000)).show(5)

+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|customerNumber|dateOfBirth|citizenshipCode|currCountryCode|employmentStatus|incomeInThousands|marketSegment|maritalStatus|stateCode|city|country|accountOpeningDate|gender|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+



In [0]:
profile.where((col("incomeInThousands") > 570000)).show(5)

+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
|customerNumber|dateOfBirth|citizenshipCode|currCountryCode|employmentStatus|incomeInThousands|marketSegment|maritalStatus|stateCode|city|country|accountOpeningDate|gender|
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+
+--------------+-----------+---------------+---------------+----------------+-----------------+-------------+-------------+---------+----+-------+------------------+------+



In [0]:
display(profile)

customerNumber,dateOfBirth,citizenshipCode,currCountryCode,employmentStatus,incomeInThousands,marketSegment,maritalStatus,stateCode,city,country,accountOpeningDate,gender
c152ad3d-9697-42b9-a0fd-aa9a959da1ef,,US,USD,employee,111561.99,basic,Single,GA,Atlanta,United States,,Male
04c71251-8951-4c0b-be57-9ea3db2f7ea5,,US,USD,employee,130302.69,HNI,Separated,OH,Canton,United States,,Male
033e7be1-b661-44a8-8e30-4668a0445dc8,,US,USD,self-employed,371662.97,HNI,Widowed,VA,Virginia Beach,United States,,Male
38b4ef58-d733-45d1-a54c-6a28c81d1313,,US,USD,self-employed,324471.7,basic,Separated,NC,Greensboro,United States,,Male
75fc1270-fded-4906-a31c-85fe1293345d,,US,USD,worker,137660.56,HNI,Married,TX,Corpus Christi,United States,,Male
c0175246-0ac0-4b1e-a8b6-76b2a01a44c2,,US,USD,self-employed,308475.38,HNI,Married,OK,Edmond,United States,,Female
854cda68-13bc-474f-aa47-fb31dde5fe0e,,US,USD,worker,372746.75,HNI,Married,CA,Santa Monica,United States,,Female
999bcdc5-c9e0-406f-b457-1c0a054443fb,,US,USD,employee,399981.88,HNI,Separated,CA,Inglewood,United States,,Male
aa9ea39c-c871-4f30-84ee-31b855787d23,,US,USD,self-employed,149885.4,basic,Separated,TN,Chattanooga,United States,,Male
737b8d0c-ffe9-483c-8ce9-80640ea3b667,,US,USD,self-employed,266811.75,basic,Widowed,TX,Austin,United States,,Male
