In [0]:
customer_data = spark.read.format("csv").option("header","true").option("InferSchema","true")\
    .load("/FileStore/tables/Sample_Data___Customer_Transactions.csv")
customer_data.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- cardLast4Digits: string (nullable = true)
 |-- transactionDate: string (nullable = true)
 |-- transactionTime: timestamp (nullable = true)
 |-- transactionAmount: string (nullable = true)
 |-- transactionType: string (nullable = true)
 |-- merchantName: string (nullable = true)
 |-- merchantCity: string (nullable = true)
 |-- merchantState: string (nullable = true)
 |-- merchantZip: integer (nullable = true)
 |-- merchantCountry: string (nullable = true)



In [0]:
print("No. of rows :", customer_data.count())
print("No. of columns :" ,len(customer_data.columns))

No. of rows : 1000
No. of columns : 11


In [0]:
customer_data.select("customerNumber","cardLast4Digits","transactionDate","transactionTime").show(5,False)

+------------------------------------+----------------+---------------+-------------------+
|customerNumber                      |cardLast4Digits |transactionDate|transactionTime    |
+------------------------------------+----------------+---------------+-------------------+
|00aac2d1-7e01-42c5-855b-42b7c32dfb58|**0875******3834|05/30/2021     |2022-05-18 00:40:39|
|7480d13f-522a-4046-885c-4895190d685e|**4837******6436|06/09/2021     |2022-05-18 21:49:56|
|29cb8de6-5f60-4bd8-a33c-d99d55bec916|**0875******5659|03/14/2021     |2022-05-18 06:36:36|
|65a3ddb0-b6fb-4716-81c8-1cd54614a7b6|**0875******2280|10/28/2021     |2022-05-18 12:16:50|
|899d435e-3615-4f19-864a-c35055af8818|**4837******2578|12/26/2020     |2022-05-18 20:46:04|
+------------------------------------+----------------+---------------+-------------------+
only showing top 5 rows



In [0]:
customer_data.select("transactionAmount","transactionType","merchantName","merchantCity").show(5,False)

+-----------------+---------------+----------------+------------+
|transactionAmount|transactionType|merchantName    |merchantCity|
+-----------------+---------------+----------------+------------+
|$149.06          |Tap            |Ledner Inc      |Philadelphia|
|$49.75           |Tap            |Keeling and Sons|Huntington  |
|$166.43          |Tap            |Crona-Baumbach  |Boston      |
|$129.83          |Swipe          |Hoppe-Reichel   |Atlanta     |
|$191.98          |Tap            |Padberg-Hilpert |Orlando     |
+-----------------+---------------+----------------+------------+
only showing top 5 rows



In [0]:
customer_data.select("merchantState","merchantZip","merchantCountry").show(5,False)

+-------------+-----------+---------------+
|merchantState|merchantZip|merchantCountry|
+-------------+-----------+---------------+
|PA           |19151      |United States  |
|WV           |25775      |United States  |
|MA           |2114       |United States  |
|GA           |30301      |United States  |
|FL           |32868      |United States  |
+-------------+-----------+---------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import isnan, when, count, col
customer_data.select([count(when(col(c).isNull(), c)).alias(c) for c in customer_data.columns]).show()

+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+
|customerNumber|cardLast4Digits|transactionDate|transactionTime|transactionAmount|transactionType|merchantName|merchantCity|merchantState|merchantZip|merchantCountry|
+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+
|             0|              0|             35|             35|               54|              0|          19|           0|            0|          0|              0|
+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+



In [0]:
customer_data = customer_data.na.drop()

In [0]:
print("No. of rows :", customer_data.count())
print("No. of columns :" ,len(customer_data.columns))

No. of rows : 896
No. of columns : 11


In [0]:
customer_data = customer_data.dropDuplicates()

In [0]:
print("No. of rows :", customer_data.count())
print("No. of columns :" ,len(customer_data.columns))

No. of rows : 896
No. of columns : 11


In [0]:
from pyspark.sql.functions import isnan, when, count, col
customer_data.select([count(when(col(c).isNull(), c)).alias(c) for c in customer_data.columns]).show()

+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+
|customerNumber|cardLast4Digits|transactionDate|transactionTime|transactionAmount|transactionType|merchantName|merchantCity|merchantState|merchantZip|merchantCountry|
+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+
|             0|              0|              0|              0|                0|              0|           0|           0|            0|          0|              0|
+--------------+---------------+---------------+---------------+-----------------+---------------+------------+------------+-------------+-----------+---------------+



In [0]:
customer_data = customer_data.withColumn("transactionAmount", customer_data.transactionAmount.substr(2,6))

In [0]:
customer_data.select("transactionAmount").show(5,False)

+-----------------+
|transactionAmount|
+-----------------+
|129.83           |
|166.43           |
|49.75            |
|191.98           |
|242.49           |
+-----------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import regexp_replace,col
customer_data = customer_data.withColumn('transactionDate', regexp_replace(col('transactionDate'), "/", "-"))

In [0]:
customer_data.select("transactionDate").show(10,False)

+---------------+
|transactionDate|
+---------------+
|10-28-2021     |
|03-14-2021     |
|06-09-2021     |
|09-17-2021     |
|12-26-2020     |
|12-16-2020     |
|03-06-2021     |
|05-19-2021     |
|06-04-2021     |
|05-30-2021     |
+---------------+
only showing top 10 rows



In [0]:
from pyspark.sql.types import DateType
customer_data = customer_data.withColumn("transactionDate",
customer_data["transactionDate"].cast(DateType()))

In [0]:
customer_data.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- cardLast4Digits: string (nullable = true)
 |-- transactionDate: date (nullable = true)
 |-- transactionTime: timestamp (nullable = true)
 |-- transactionAmount: float (nullable = true)
 |-- transactionType: string (nullable = true)
 |-- merchantName: string (nullable = true)
 |-- merchantCity: string (nullable = true)
 |-- merchantState: string (nullable = true)
 |-- merchantZip: integer (nullable = true)
 |-- merchantCountry: string (nullable = true)



In [0]:
from pyspark.sql.types import FloatType
customer_data = customer_data.withColumn("transactionAmount",
customer_data["transactionAmount"].cast(FloatType()))

In [0]:
customer_data.printSchema()

root
 |-- customerNumber: string (nullable = true)
 |-- cardLast4Digits: string (nullable = true)
 |-- transactionDate: date (nullable = true)
 |-- transactionTime: timestamp (nullable = true)
 |-- transactionAmount: float (nullable = true)
 |-- transactionType: string (nullable = true)
 |-- merchantName: string (nullable = true)
 |-- merchantCity: string (nullable = true)
 |-- merchantState: string (nullable = true)
 |-- merchantZip: integer (nullable = true)
 |-- merchantCountry: string (nullable = true)



In [0]:
customer_data.select("transactionType").distinct().show()

+---------------+
|transactionType|
+---------------+
|          Swipe|
|            Tap|
+---------------+



In [0]:
from pyspark.sql.functions import countDistinct
customer_data.select(countDistinct("merchantState").alias("No. of States")).show()

+-------------+
|No. of States|
+-------------+
|           48|
+-------------+



In [0]:
display(customer_data)

customerNumber,cardLast4Digits,transactionDate,transactionTime,transactionAmount,transactionType,merchantName,merchantCity,merchantState,merchantZip,merchantCountry
013f9b39-46a2-4b4d-9925-7d6934db5079,**0875******9045,,2022-05-18T02:41:10.000+0000,2.4,Swipe,"Koch, Waelchi and Dibbert",New York City,NY,10280,United States
f72d1c57-ad36-459c-ae2b-91eb6028dbee,**4837******5122,,2022-05-18T16:51:34.000+0000,36.84,Tap,Ferry Group,Fort Wayne,IN,46805,United States
d0545c79-6bce-48df-9b93-3cd7cc971cb9,**4837******1588,,2022-05-18T17:28:57.000+0000,8.31,Swipe,Green LLC,Provo,UT,84605,United States
2c6844bc-752d-4a46-a9ea-7517d8b98e5f,**4837******9861,,2022-05-18T05:16:42.000+0000,19.45,Swipe,Renner LLC,Saint Petersburg,FL,33742,United States
832cb7d7-bf58-4171-875d-43f0dc2b17dd,**4837******8507,,2022-05-18T05:43:20.000+0000,14.61,Swipe,Muller-Berge,Tulsa,OK,74184,United States
5bf1197c-b62c-43fa-a8e1-a4969c62def7,**0875******1694,,2022-05-18T16:17:14.000+0000,1.36,Swipe,"Murazik, Runte and Kreiger",Lexington,KY,40505,United States
0fb6e375-f27c-4b25-adf9-b9ab80c531fd,**0875******1689,,2022-05-18T15:32:58.000+0000,19.08,Tap,"Thiel, McKenzie and Zemlak",Amarillo,TX,79188,United States
6a50a72b-b755-4d51-895d-e2191ce33824,**4837******1351,,2022-05-18T20:02:45.000+0000,4.92,Swipe,"McDermott, Runte and Wiza",Long Beach,CA,90847,United States
e99ace57-04de-49fa-9aac-1a66bdcd4d4d,**0875******5149,,2022-05-18T10:18:11.000+0000,49.22,Swipe,"Prosacco, Hills and Koepp",Vero Beach,FL,32969,United States
3946874a-74dc-4a51-9059-cf43df8b85d9,**0875******5698,,2022-05-18T07:13:32.000+0000,11.29,Tap,O'Conner and Sons,San Antonio,TX,78240,United States


In [0]:
display(customer_data)

customerNumber,cardLast4Digits,transactionDate,transactionTime,transactionAmount,transactionType,merchantName,merchantCity,merchantState,merchantZip,merchantCountry
013f9b39-46a2-4b4d-9925-7d6934db5079,**0875******9045,,2022-05-18T02:41:10.000+0000,2.4,Swipe,"Koch, Waelchi and Dibbert",New York City,NY,10280,United States
f72d1c57-ad36-459c-ae2b-91eb6028dbee,**4837******5122,,2022-05-18T16:51:34.000+0000,36.84,Tap,Ferry Group,Fort Wayne,IN,46805,United States
d0545c79-6bce-48df-9b93-3cd7cc971cb9,**4837******1588,,2022-05-18T17:28:57.000+0000,8.31,Swipe,Green LLC,Provo,UT,84605,United States
2c6844bc-752d-4a46-a9ea-7517d8b98e5f,**4837******9861,,2022-05-18T05:16:42.000+0000,19.45,Swipe,Renner LLC,Saint Petersburg,FL,33742,United States
832cb7d7-bf58-4171-875d-43f0dc2b17dd,**4837******8507,,2022-05-18T05:43:20.000+0000,14.61,Swipe,Muller-Berge,Tulsa,OK,74184,United States
5bf1197c-b62c-43fa-a8e1-a4969c62def7,**0875******1694,,2022-05-18T16:17:14.000+0000,1.36,Swipe,"Murazik, Runte and Kreiger",Lexington,KY,40505,United States
0fb6e375-f27c-4b25-adf9-b9ab80c531fd,**0875******1689,,2022-05-18T15:32:58.000+0000,19.08,Tap,"Thiel, McKenzie and Zemlak",Amarillo,TX,79188,United States
6a50a72b-b755-4d51-895d-e2191ce33824,**4837******1351,,2022-05-18T20:02:45.000+0000,4.92,Swipe,"McDermott, Runte and Wiza",Long Beach,CA,90847,United States
e99ace57-04de-49fa-9aac-1a66bdcd4d4d,**0875******5149,,2022-05-18T10:18:11.000+0000,49.22,Swipe,"Prosacco, Hills and Koepp",Vero Beach,FL,32969,United States
3946874a-74dc-4a51-9059-cf43df8b85d9,**0875******5698,,2022-05-18T07:13:32.000+0000,11.29,Tap,O'Conner and Sons,San Antonio,TX,78240,United States
