In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName("customer") \
.master("spark://10.208.36.84:7077") \
.config("spark.hadoop.defaultFS","hdfs://10.208.36.84:9000") \
.config("spark.executor.cores",2) \
.config("spark.executor.memory","4g") \
.config("spark.cores.max","10") \
.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
customer_schema = 'member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string'

In [4]:
df = spark.read.format("csv").option("header","true") \
.schema(customer_schema) \
.load("/home/tushar/Documents/project/customer_raw")

In [5]:
df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [6]:
df.count()

                                                                                

868845

In [7]:
df.show()

+--------------------+--------------------+----------+--------------+----------+----------+--------+-------+-----+---------+-------------------+---------------+----------------+----------------+-------------------------+
|           member_id|           emp_title|emp_length|home_ownership|annual_inc|addr_state|zip_code|country|grade|sub_grade|verification_status|tot_hi_cred_lim|application_type|annual_inc_joint|verification_status_joint|
+--------------------+--------------------+----------+--------------+----------+----------+--------+-------+-----+---------+-------------------+---------------+----------------+----------------+-------------------------+
|6d5091b3fcaaeb4ea...|             leadman| 10+ years|      MORTGAGE|   55000.0|        PA|   190xx|    USA|    C|       C4|       Not Verified|       178050.0|      Individual|            null|                     null|
|b5e7938b0a2da4cea...|            Engineer| 10+ years|      MORTGAGE|   65000.0|        SD|   577xx|    USA|    C|  

In [8]:
df = df.withColumnRenamed("annual_inc","annual_income") \
.withColumnRenamed("addr_state","address_state") \
.withColumnRenamed("zip_cod","address_zipcode") \
.withColumnRenamed("country","address_country") \
.withColumnRenamed("tot_hi_cred_lim","total_high_credit_limit") \
.withColumnRenamed("annual_ic_joint","join_annual_income")

In [9]:
df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [10]:
df = df.withColumn("ingestion_date",current_timestamp())

In [11]:
dfdistinct = df.drop_duplicates()

In [12]:
dfdistinct.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



In [13]:
dfdistinct.count()

                                                                                

868832

<p> Total No. of duplicates rows are 13 </p>

In [14]:
dfannual = dfdistinct.where(col('annual_income').isNotNull())

In [15]:
dfannual.count()

                                                                                

868830

<p> Total Number of Null values in annual income column was 2</p>

In [16]:
df = df.withColumn("emp_length",regexp_replace(col("emp_length"),"(\D)",""))

In [17]:
df.select("emp_length").show()

+----------+
|emp_length|
+----------+
|        10|
|        10|
|        10|
|        10|
|         3|
|         4|
|        10|
|        10|
|         6|
|        10|
|         6|
|         1|
|         3|
|         7|
|        10|
|         8|
|        10|
|        10|
|         5|
|         8|
+----------+
only showing top 20 rows



In [19]:
df = df.withColumn("emp_length",col("emp_length").cast('int'))

In [20]:
df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



<p> In emp length, replacing the null values with avg of the emp_length.</p>

In [36]:
avgemp = df.agg({"emp_length": "avg"}).collect()

                                                                                

In [37]:
avgemp

[Row(avg(emp_length)=6.027526296574045)]

In [40]:
df.createOrReplaceTempView('customer')

In [43]:
avgempl = spark.sql("select floor(avg(emp_length)) from customer").collect()

In [44]:
avgempl

[Row(FLOOR(avg(emp_length))=6)]

In [45]:
print(avgempl[0][0])

6


In [46]:
avgempduration = avgempl[0][0]

In [47]:
customer_df = df.na.fill(avgempduration,subset=['emp_length'])

In [48]:
addr_df = customer_df.withColumn("address_state",when(length(col("address_state")) > 2, "NA").otherwise(col("address_state")))

In [49]:
addr_df.select("address_state").distinct().show()

+-------------+
|address_state|
+-------------+
|           SC|
|           AZ|
|           LA|
|           MN|
|           NJ|
|           DC|
|           OR|
|           VA|
|           RI|
|           WY|
|           KY|
|           NH|
|           MI|
|           NV|
|           WI|
|           CA|
|           NE|
|           CT|
|           MT|
|           NC|
+-------------+
only showing top 20 rows



In [52]:
addr_df.write.format("parquet").mode("overwrite").option("path","/home/tushar/Documents/project/cleaned_customer/parquet").save()

                                                                                

In [53]:
addr_df.write.format("csv").mode("overwrite").option("path","/home/tushar/Documents/project/cleaned_customer/csv").save()

                                                                                