In [1]:
# Installing pyspark on a local machine
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import DataFrameWriter
from pyspark.sql.functions import monotonically_increasing_id
import os
import psycopg2


In [2]:
# Initialize the Spark session.
# The PostgreSQL JDBC driver jar is registered through `spark.jars`. Its location can be
# overridden with the POSTGRES_JDBC_JAR environment variable so the notebook is not tied
# to one machine; when the variable is unset the original local path is used.
POSTGRES_JDBC_JAR = os.environ.get(
    "POSTGRES_JDBC_JAR",
    r"C:\Users\user\Desktop\G_pay_bank\postgresql-42.7.7.jar",
)

spark = (
    SparkSession.builder
    .appName("BanksETL")
    .master("local[*]")  # local mode, one worker thread per available core
    .config("spark.jars", POSTGRES_JDBC_JAR)
    .getOrCreate()
)

In [3]:
# Show the SparkSession object as the cell's output to confirm it was created.
spark

In [4]:
# Extract the historical transaction data into a Spark DataFrame.
# The path is assembled with os.path.join instead of hard-coded backslashes so it
# resolves on any operating system; it is still relative to the working directory.
RAW_TRANSACTIONS_CSV = os.path.join("dataset", "rawdata", "G_pay_bank_transactions.csv")

# header=True takes the column names from the first row; inferSchema=True lets Spark
# derive each column's type from the data instead of reading everything as strings.
df = spark.read.csv(RAW_TRANSACTIONS_CSV, header=True, inferSchema=True)


In [5]:
# Preview the first five rows of the raw data.
df.show(n=5)

+--------------------+------+----------------+--------------+--------------------+------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+--------------------+-------------+-------------+--------+-----+---------+--------------------+--------------------+------+--------------+
|    Transaction_Date|Amount|Transaction_Type| Customer_Name|    Customer_Address|     Customer_City|Customer_State|    Customer_Country|             Company|           Job_Title|               Email|       Phone_Number|Credit_Card_Number|                IBAN|Currency_Code|Random_Number|Category|Group|Is_Active|        Last_Updated|         Description|Gender|Marital_Status|
+--------------------+------+----------------+--------------+--------------------+------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-----

In [6]:
# Print the column names, inferred types and nullability of the raw DataFrame.
df.printSchema()

root
 |-- Transaction_Date: timestamp (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Transaction_Type: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Customer_Address: string (nullable = true)
 |-- Customer_City: string (nullable = true)
 |-- Customer_State: string (nullable = true)
 |-- Customer_Country: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Job_Title: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone_Number: string (nullable = true)
 |-- Credit_Card_Number: long (nullable = true)
 |-- IBAN: string (nullable = true)
 |-- Currency_Code: string (nullable = true)
 |-- Random_Number: double (nullable = true)
 |-- Category: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Is_Active: string (nullable = true)
 |-- Last_Updated: timestamp (nullable = true)
 |-- Description: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital_Status: string (nullable = true)

In [7]:
# Data-quality check: count the null values in every column of the raw data.
# All counts are computed in a single aggregation (one Spark job) instead of running
# a separate filter + count job for each column.
# F.when(...) without otherwise() yields null for non-matching rows, and F.count
# only counts non-null values, so each aggregate is the number of null rows.
raw_null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

# The aggregated row has one value per column, in the same order as df.columns.
for name, null_count in zip(df.columns, raw_null_counts):
    print(f"Column: {name}, Nulls: {null_count}")


Column: Transaction_Date, Nulls: 0
Column: Amount, Nulls: 0
Column: Transaction_Type, Nulls: 0
Column: Customer_Name, Nulls: 100425
Column: Customer_Address, Nulls: 100087
Column: Customer_City, Nulls: 100034
Column: Customer_State, Nulls: 100009
Column: Customer_Country, Nulls: 100672
Column: Company, Nulls: 100295
Column: Job_Title, Nulls: 99924
Column: Email, Nulls: 100043
Column: Phone_Number, Nulls: 100524
Column: Credit_Card_Number, Nulls: 100085
Column: IBAN, Nulls: 100300
Column: Currency_Code, Nulls: 99342
Column: Random_Number, Nulls: 99913
Column: Category, Nulls: 100332
Column: Group, Nulls: 100209
Column: Is_Active, Nulls: 100259
Column: Last_Updated, Nulls: 100321
Column: Description, Nulls: 100403
Column: Gender, Nulls: 99767
Column: Marital_Status, Nulls: 99904


In [8]:
# Compute summary statistics for the raw data, then print them.
raw_summary = df.describe()
raw_summary.show()

+-------+-----------------+----------------+-------------+--------------------+-------------+--------------+----------------+-------------+------------------+-------------------+-------------------+--------------------+--------------------+-------------+------------------+--------+------+---------+--------------------+------+--------------+
|summary|           Amount|Transaction_Type|Customer_Name|    Customer_Address|Customer_City|Customer_State|Customer_Country|      Company|         Job_Title|              Email|       Phone_Number|  Credit_Card_Number|                IBAN|Currency_Code|     Random_Number|Category| Group|Is_Active|         Description|Gender|Marital_Status|
+-------+-----------------+----------------+-------------+--------------------+-------------+--------------+----------------+-------------+------------------+-------------------+-------------------+--------------------+--------------------+-------------+------------------+--------+------+---------+---------------

In [22]:
# Replace missing values with explicit placeholders.
# Columns that receive the generic "Unknown" placeholder:
unknown_placeholder_columns = [
    "Customer_Name",
    "Customer_Address",
    "Customer_City",
    "Customer_State",
    "Customer_Country",
    "Company",
    "Job_Title",
    "Email",
    "Phone_Number",
    "IBAN",
    "Currency_Code",
    "Category",
    "Group",
    "Is_Active",
    "Gender",
    "Marital_Status",
]

fill_values = {name: "Unknown" for name in unknown_placeholder_columns}
# Columns with their own placeholder: zero for the two numeric columns, and a
# dedicated text for the description.
fill_values["Credit_Card_Number"] = 0
fill_values["Random_Number"] = 0.0
fill_values["Description"] = "No Description"

# Last_Updated is not part of the mapping, so its nulls are left in place here.
df_cleaned = df.na.fill(fill_values)
    

In [23]:
# Remove every row whose Last_Updated value is missing.
df_cleaned = df_cleaned.na.drop(subset=["Last_Updated"])

In [26]:
# Verification: confirm that no null values remain after cleaning.
# All counts are computed in a single aggregation (one Spark job) instead of running
# a separate filter + count job for each column.
# F.when(...) without otherwise() yields null for non-matching rows, and F.count
# only counts non-null values, so each aggregate is the number of null rows.
cleaned_null_counts = df_cleaned.select(
    [F.count(F.when(df_cleaned[name].isNull(), 1)).alias(name) for name in df_cleaned.columns]
).first()

# The aggregated row has one value per column, in the same order as df_cleaned.columns.
# The message format matches the null report printed for the raw data.
for name, null_count in zip(df_cleaned.columns, cleaned_null_counts):
    print(f"Column: {name}, Nulls: {null_count}")


Column: Transaction_Date, 'Nulls: ', 0
Column: Amount, 'Nulls: ', 0
Column: Transaction_Type, 'Nulls: ', 0
Column: Customer_Name, 'Nulls: ', 0
Column: Customer_Address, 'Nulls: ', 0
Column: Customer_City, 'Nulls: ', 0
Column: Customer_State, 'Nulls: ', 0
Column: Customer_Country, 'Nulls: ', 0
Column: Company, 'Nulls: ', 0
Column: Job_Title, 'Nulls: ', 0
Column: Email, 'Nulls: ', 0
Column: Phone_Number, 'Nulls: ', 0
Column: Credit_Card_Number, 'Nulls: ', 0
Column: IBAN, 'Nulls: ', 0
Column: Currency_Code, 'Nulls: ', 0
Column: Random_Number, 'Nulls: ', 0
Column: Category, 'Nulls: ', 0
Column: Group, 'Nulls: ', 0
Column: Is_Active, 'Nulls: ', 0
Column: Last_Updated, 'Nulls: ', 0
Column: Description, 'Nulls: ', 0
Column: Gender, 'Nulls: ', 0
Column: Marital_Status, 'Nulls: ', 0
