### MERCHANT_EDA

In [19]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

StatementMeta(cfraudsparkpool, 65, 2, Finished, Available, Finished)

##### Reading the merchant csv file from adls 

In [20]:
merchant_df = spark.read.csv("abfss://raw-fraud-database@ccfrauddatalake.dfs.core.windows.net/MerchantLocationInfo.csv", inferSchema=True , header=True)
merchant_df.show(4)

StatementMeta(cfraudsparkpool, 65, 3, Finished, Available, Finished)

+--------------------+--------------------+-------------+------------------+------------------+-------------+
|           trans_num|            merchant|     category|         merch_lat|        merch_long|merch_zipcode|
+--------------------+--------------------+-------------+------------------+------------------+-------------+
|3faf354a73ecf0080...|fraud_Turner and ...| shopping_pos|         38.987219|        -93.776161|      64071.0|
|01cc00d3f71b1ca08...|fraud_Zieme, Bode...|gas_transport|         35.643782|-90.04151999999999|      72370.0|
|1672c93700e5ab7ce...|fraud_Swaniawski,...| shopping_pos|258.74842790650393|62.297257627331064|         null|
|3dbf35e4eed21d9ef...|fraud_Gottlieb Group|    kids_pets|         49.765258|       -103.798569|         null|
+--------------------+--------------------+-------------+------------------+------------------+-------------+
only showing top 4 rows



##### Rounding the latitude and longitude columns


In [21]:
from pyspark.sql.functions import round, col

merchant_df = merchant_df.withColumn("merch_lat", round(col("merch_lat"), 2)) \
       .withColumn("merch_long", round(col("merch_long"), 2))


StatementMeta(cfraudsparkpool, 65, 4, Finished, Available, Finished)

##### Type Casting the zipcode column from float into int


In [22]:
merchant_df = merchant_df.withColumn('merch_zipcode' , col("merch_zipcode").cast(IntegerType()))


StatementMeta(cfraudsparkpool, 65, 5, Finished, Available, Finished)

In [23]:
merchant_df.show(2)

StatementMeta(cfraudsparkpool, 65, 6, Finished, Available, Finished)

+--------------------+--------------------+-------------+---------+----------+-------------+
|           trans_num|            merchant|     category|merch_lat|merch_long|merch_zipcode|
+--------------------+--------------------+-------------+---------+----------+-------------+
|3faf354a73ecf0080...|fraud_Turner and ...| shopping_pos|    38.99|    -93.78|        64071|
|01cc00d3f71b1ca08...|fraud_Zieme, Bode...|gas_transport|    35.64|    -90.04|        72370|
+--------------------+--------------------+-------------+---------+----------+-------------+
only showing top 2 rows



##### Finding the Missing or Null value

In [24]:

from pyspark.sql.functions import col, sum

null_counts = merchant_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in merchant_df.columns])
null_counts.show()


StatementMeta(cfraudsparkpool, 65, 7, Finished, Available, Finished)

+---------+--------+--------+---------+----------+-------------+
|trans_num|merchant|category|merch_lat|merch_long|merch_zipcode|
+---------+--------+--------+---------+----------+-------------+
|        0|       0|   13031|        0|         0|       208083|
+---------+--------+--------+---------+----------+-------------+



##### Adding a new column called fraud_merchant

In [25]:
from pyspark.sql.functions import regexp_extract

merchant_df = merchant_df.withColumn("fraud_merchant", regexp_extract(col("merchant"), r"(fraud_[A-Za-z]+)", 1))


StatementMeta(cfraudsparkpool, 65, 8, Finished, Available, Finished)

##### Dropping the previous merchant column

In [26]:
merchant_df = merchant_df.drop("merchant")

StatementMeta(cfraudsparkpool, 65, 9, Finished, Available, Finished)

In [27]:
merchant_df.show(2)

StatementMeta(cfraudsparkpool, 65, 10, Finished, Available, Finished)

+--------------------+-------------+---------+----------+-------------+--------------+
|           trans_num|     category|merch_lat|merch_long|merch_zipcode|fraud_merchant|
+--------------------+-------------+---------+----------+-------------+--------------+
|3faf354a73ecf0080...| shopping_pos|    38.99|    -93.78|        64071|  fraud_Turner|
|01cc00d3f71b1ca08...|gas_transport|    35.64|    -90.04|        72370|   fraud_Zieme|
+--------------------+-------------+---------+----------+-------------+--------------+
only showing top 2 rows



##### Fill the category column with the highest occuring category value

In [28]:
from pyspark.sql.functions import col, count, when, lit

most = merchant_df.groupBy("category").count().orderBy(col("count").desc()).first()["category"]


merchant_df = merchant_df.withColumn("category", when(col("category").isNull(), most).otherwise(col("category")))


StatementMeta(cfraudsparkpool, 65, 11, Finished, Available, Finished)

In [29]:
merchant_df.count()

StatementMeta(cfraudsparkpool, 65, 12, Finished, Available, Finished)

1303158

##### Drop the null values 

In [30]:
merchant_df = merchant_df.filter(col("merch_zipcode").isNotNull())


StatementMeta(cfraudsparkpool, 65, 13, Finished, Available, Finished)

In [31]:
merchant_df.collect

StatementMeta(cfraudsparkpool, 65, 14, Finished, Available, Finished)

<bound method DataFrame.collect of DataFrame[trans_num: string, category: string, merch_lat: double, merch_long: double, merch_zipcode: int, fraud_merchant: string]>

In [32]:
merchant_df.show(2)

StatementMeta(cfraudsparkpool, 65, 15, Finished, Available, Finished)

+--------------------+-------------+---------+----------+-------------+--------------+
|           trans_num|     category|merch_lat|merch_long|merch_zipcode|fraud_merchant|
+--------------------+-------------+---------+----------+-------------+--------------+
|3faf354a73ecf0080...| shopping_pos|    38.99|    -93.78|        64071|  fraud_Turner|
|01cc00d3f71b1ca08...|gas_transport|    35.64|    -90.04|        72370|   fraud_Zieme|
+--------------------+-------------+---------+----------+-------------+--------------+
only showing top 2 rows



In [34]:
print("No Of  column",len(merchant_df.columns))
print("Merchant Schema -----------:",merchant_df.schema)

print("No of row in " , merchant_df.count())


StatementMeta(cfraudsparkpool, 65, 17, Finished, Available, Finished)

No Of  column 6
Merchant Schema -----------: StructType([StructField('trans_num', StringType(), True), StructField('category', StringType(), True), StructField('merch_lat', DoubleType(), True), StructField('merch_long', DoubleType(), True), StructField('merch_zipcode', IntegerType(), True), StructField('fraud_merchant', StringType(), True)])
No of row in  1095075


##### Saving the cleaned data into adls 

In [16]:

base_path = "abfss://clean-data-creditcard@ccfrauddatalake.dfs.core.windows.net/cleansed_files_csv/"
merchant_df.write.mode("overwrite").csv(base_path + "merchant")


StatementMeta(cfraudsparkpool, 55, 17, Finished, Available, Finished)

In [17]:

base_path = "abfss://clean-data-creditcard@ccfrauddatalake.dfs.core.windows.net/cleansed_files/"

merchant_df.write.mode("overwrite").parquet(base_path + "merchant")


StatementMeta(cfraudsparkpool, 55, 18, Finished, Available, Finished)