## Importing Dependencies


In [36]:
# Install the third-party packages this notebook imports, using the %pip magic.
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install pyspark sqlalchemy pandas

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [37]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sqlalchemy import create_engine


In [38]:
# Create the SparkSession for this ETL job, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('NugaBankETL')
    .getOrCreate()
)

In [39]:
# Display the SparkSession that was initialised in the previous cell.
spark

## Data Extraction

In [40]:
# Load the raw transactions CSV into a Spark DataFrame.
# The path uses a forward slash so it resolves on Linux/macOS as well as
# Windows; a backslash is only a path separator on Windows.
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive the column types from the data instead of reading all strings.
Nuga_bank_df = spark.read.csv(
    'dataset/nuga_bank_transactions.csv',
    header=True,
    inferSchema=True,
)

In [41]:
# Preview the first five rows of the raw data.
Nuga_bank_df.show(n=5)

+--------------------+------+----------------+--------------+--------------------+------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+--------------------+-------------+-------------+--------+-----+---------+--------------------+--------------------+------+--------------+
|    Transaction_Date|Amount|Transaction_Type| Customer_Name|    Customer_Address|     Customer_City|Customer_State|    Customer_Country|             Company|           Job_Title|               Email|       Phone_Number|Credit_Card_Number|                IBAN|Currency_Code|Random_Number|Category|Group|Is_Active|        Last_Updated|         Description|Gender|Marital_Status|
+--------------------+------+----------------+--------------+--------------------+------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+-----

In [42]:
# Print the column names, inferred types and nullability of the raw data.
Nuga_bank_df.printSchema()

root
 |-- Transaction_Date: timestamp (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Transaction_Type: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Customer_Address: string (nullable = true)
 |-- Customer_City: string (nullable = true)
 |-- Customer_State: string (nullable = true)
 |-- Customer_Country: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Job_Title: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone_Number: string (nullable = true)
 |-- Credit_Card_Number: long (nullable = true)
 |-- IBAN: string (nullable = true)
 |-- Currency_Code: string (nullable = true)
 |-- Random_Number: double (nullable = true)
 |-- Category: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- Is_Active: string (nullable = true)
 |-- Last_Updated: timestamp (nullable = true)
 |-- Description: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital_Status: string (nullable = true)

## Data Transformation

In [43]:
# Count the rows in the raw DataFrame.
no_of_rows = Nuga_bank_df.count()

In [44]:
# Display the raw row count computed in the previous cell.
no_of_rows

1000000

In [45]:
# Count the columns in the raw DataFrame.
column_names = Nuga_bank_df.columns
no_of_columns = len(column_names)

In [46]:
# Display the column count computed in the previous cell.
no_of_columns

23

In [47]:
# Checking for null values in the raw data.
# Every per-column null count is computed in ONE aggregation job: F.when(...)
# yields 1 for a null cell and null otherwise, and F.count counts only the
# non-null results. The previous loop ran a separate filter + count job for
# each column, i.e. one full pass over the data per column.
null_counts = Nuga_bank_df.select([
    F.count(F.when(Nuga_bank_df[column].isNull(), 1)).alias(column)
    for column in Nuga_bank_df.columns
]).first()

# The aggregated row holds one value per column, in column order.
for column, null_count in zip(Nuga_bank_df.columns, null_counts):
    print(column, 'Null', null_count)

Transaction_Date Null 0
Amount Null 0
Transaction_Type Null 0
Customer_Name Null 100425
Customer_Address Null 100087
Customer_City Null 100034
Customer_State Null 100009
Customer_Country Null 100672
Company Null 100295
Job_Title Null 99924
Email Null 100043
Phone_Number Null 100524
Credit_Card_Number Null 100085
IBAN Null 100300
Currency_Code Null 99342
Random_Number Null 99913
Category Null 100332
Group Null 100209
Is_Active Null 100259
Last_Updated Null 100321
Description Null 100403
Gender Null 99767
Marital_Status Null 99904


In [59]:
# Filling in missing values.
# Every text column shares a single placeholder so that all missing values
# carry the same label (Email was previously filled with the misspelt
# 'Uknown', which made it inconsistent with every other column).
missing_text = 'Unknown'
text_columns = [
    'Customer_Name',
    'Customer_Address',
    'Customer_City',
    'Customer_State',
    'Customer_Country',
    'Company',
    'Job_Title',
    'Email',
    'Phone_Number',
    'IBAN',
    'Currency_Code',
    'Category',
    'Group',
    'Is_Active',
    'Description',
    'Gender',
    'Marital_Status',
]
fill_values = {name: missing_text for name in text_columns}

# Numeric columns get a zero sentinel of the matching numeric kind.
fill_values['Credit_Card_Number'] = 0
fill_values['Random_Number'] = 0.0

# Last_Updated is not filled here; rows where it is null are dropped in a
# later cell.
nuga_bank = Nuga_bank_df.fillna(fill_values)

In [50]:
# Re-check the null counts after filling missing values.
# All counts come from ONE aggregation job (F.count only counts the rows for
# which F.when produced a non-null value, i.e. the null cells) instead of
# running a separate filter + count job per column.
null_counts = nuga_bank.select([
    F.count(F.when(nuga_bank[column].isNull(), 1)).alias(column)
    for column in nuga_bank.columns
]).first()

# The aggregated row holds one value per column, in column order.
for column, null_count in zip(nuga_bank.columns, null_counts):
    print(column, 'Null', null_count)

Transaction_Date Null 0
Amount Null 0
Transaction_Type Null 0
Customer_Name Null 0
Customer_Address Null 0
Customer_City Null 0
Customer_State Null 0
Customer_Country Null 0
Company Null 0
Job_Title Null 0
Email Null 0
Phone_Number Null 0
Credit_Card_Number Null 0
IBAN Null 0
Currency_Code Null 0
Random_Number Null 0
Category Null 0
Group Null 0
Is_Active Null 0
Last_Updated Null 100321
Description Null 0
Gender Null 0
Marital_Status Null 0


In [51]:
# Remove every row whose Last_Updated value is null.
nuga_bank = nuga_bank.dropna(subset=['Last_Updated'])

In [52]:
# Final null check after dropping rows with a null Last_Updated.
# All counts come from ONE aggregation job (F.count only counts the rows for
# which F.when produced a non-null value, i.e. the null cells) instead of
# running a separate filter + count job per column.
null_counts = nuga_bank.select([
    F.count(F.when(nuga_bank[column].isNull(), 1)).alias(column)
    for column in nuga_bank.columns
]).first()

# The aggregated row holds one value per column, in column order.
for column, null_count in zip(nuga_bank.columns, null_counts):
    print(column, 'Nulls', null_count)

Transaction_Date Nulls 0
Amount Nulls 0
Transaction_Type Nulls 0
Customer_Name Nulls 0
Customer_Address Nulls 0
Customer_City Nulls 0
Customer_State Nulls 0
Customer_Country Nulls 0
Company Nulls 0
Job_Title Nulls 0
Email Nulls 0
Phone_Number Nulls 0
Credit_Card_Number Nulls 0
IBAN Nulls 0
Currency_Code Nulls 0
Random_Number Nulls 0
Category Nulls 0
Group Nulls 0
Is_Active Nulls 0
Last_Updated Nulls 0
Description Nulls 0
Gender Nulls 0
Marital_Status Nulls 0


In [55]:
# Row count of the cleaned DataFrame, i.e. after rows with a null
# Last_Updated were dropped.
# NOTE(review): the name differs from the earlier `no_of_rows` (raw count) by
# one letter — easy to confuse; consider a clearer name.
no_of_row = nuga_bank.count()
# Display the cleaned row count.
no_of_row

899679

In [57]:
# Summary statistics of the cleaned data, printed as a table.
# The timestamp columns (Transaction_Date, Last_Updated) do not appear in the
# printed summary.
nuga_bank.describe().show()

+-------+------------------+----------------+-------------+--------------------+-------------+--------------+----------------+-------------+------------------+-------------------+--------------------+--------------------+--------------------+-------------+-----------------+--------+-------+---------+--------------------+-------+--------------+
|summary|            Amount|Transaction_Type|Customer_Name|    Customer_Address|Customer_City|Customer_State|Customer_Country|      Company|         Job_Title|              Email|        Phone_Number|  Credit_Card_Number|                IBAN|Currency_Code|    Random_Number|Category|  Group|Is_Active|         Description| Gender|Marital_Status|
+-------+------------------+----------------+-------------+--------------------+-------------+--------------+----------------+-------------+------------------+-------------------+--------------------+--------------------+--------------------+-------------+-----------------+--------+-------+---------+-------

In [58]:
# Display the active SparkSession again.
spark