### Import necessary dependencies

In [None]:
# Optional one-time setup: uncomment the lines below to install/upgrade these packages.
#!pip install setuptools

#!pip install --upgrade pandas


In [7]:
import os
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from packaging.version import Version as LooseVersion
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, monotonically_increasing_id, when
from sqlalchemy import create_engine

In [8]:
## Set JAVA_HOME so that Java is not picked up from a previously configured version.
## NOTE(review): this is a machine-specific Windows path — adjust it for other environments.
os.environ['JAVA_HOME'] = r'C:\JAVA8'

In [9]:
# Set the PYSPARK_ALLOW_INSECURE_GATEWAY flag before the Spark session is created.
# NOTE(review): presumably this lets PySpark start its gateway without authentication and is
# only honoured by older PySpark releases — confirm against the installed version.
# `os` is already imported in the imports cell; the re-import here is harmless.
import os
os.environ["PYSPARK_ALLOW_INSECURE_GATEWAY"] = "1"

In [12]:
# Create (or reuse) the Spark session for this project and register the
# PostgreSQL JDBC driver jar with it.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("GWIHR_PROJECT")
    .config("spark.jars", r"C:\Users\back2\Desktop\DE-SORTED FILEZ\Century_bank project\postgresql-42.7.5.jar")
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

### Data Extraction

In [None]:
# Load the raw transactions file, using its first line as the column names,
# and preview the first 10 rows.
Century_bank_df = spark.read.option('header', True).csv(r'Raw_data\Century_bank_transactions.csv')
Century_bank_df.show(10)

In [None]:
## Re-read the file with schema inference switched on so that each column
## gets a proper data type, then print the resulting schema.
Century_bank_df = spark.read.options(header=True, inferSchema=True).csv(r'Raw_data\Century_bank_transactions.csv')

Century_bank_df.printSchema()

### Data Cleaning and Transformation

In [None]:
# List the column names of the loaded DataFrame.
Century_bank_df.columns

In [None]:
# Number of rows in the raw DataFrame (count() runs a Spark job over the data).
num_rows = Century_bank_df.count()

num_rows

1000000

In [None]:
# Number of columns in the raw DataFrame.
num_columns = len(Century_bank_df.columns)
num_columns

23

In [None]:
# Check for null values in every column.
# All per-column null counts are computed in ONE aggregation pass over the data,
# instead of launching a separate filter + count job for each column.
null_counts = Century_bank_df.select(
    [
        # when(...) yields 1 for null cells and null otherwise; count() ignores nulls,
        # so the result is the number of null cells in that column.
        count(when(Century_bank_df[column_name].isNull(), 1)).alias(column_name)
        for column_name in Century_bank_df.columns
    ]
).first()

# The aggregated row has one value per column, in the same order as .columns.
for column_name, null_total in zip(Century_bank_df.columns, null_counts):
    print(column_name, 'nulls', null_total)

Transaction_Date nulls 0
Amount nulls 0
Transaction_Type nulls 0
Customer_Name nulls 100425
Customer_Address nulls 100087
Customer_City nulls 100034
Customer_State nulls 100009
Customer_Country nulls 100672
Company nulls 100295
Job_Title nulls 99924
Email nulls 100043
Phone_Number nulls 100524
Credit_Card_Number nulls 100085
IBAN nulls 100300
Currency_Code nulls 99342
Random_Number nulls 99913
Category nulls 100332
Group nulls 100209
Is_Active nulls 100259
Last_Updated nulls 100321
Description nulls 100403
Gender nulls 99767
Marital_Status nulls 99904


In [None]:
# Replace missing values with placeholders.
# Text columns receive the literal 'unknown'; the two numeric columns receive zero.
# NOTE(review): the replacement types assume the schema that inferSchema produced —
# confirm with printSchema() that each listed column has the expected type.
text_columns = [
    'Customer_Name', 'Customer_Address', 'Customer_City', 'Customer_State',
    'Customer_Country', 'Company', 'Job_Title', 'Email', 'Phone_Number',
    'IBAN', 'Currency_Code', 'Category', 'Group', 'Is_Active',
    'Description', 'Gender', 'Marital_Status',
]
fill_values = {column_name: 'unknown' for column_name in text_columns}
fill_values['Credit_Card_Number'] = 0
fill_values['Random_Number'] = 0.0

# Last_Updated is deliberately absent from the mapping: rows missing it are dropped later.
Century_bank_df_clean = Century_bank_df.na.fill(fill_values)

In [None]:
# Remove every row whose Last_Updated value is missing
# (this column was intentionally not given a placeholder above).
Century_bank_df_clean = Century_bank_df_clean.dropna(subset=['Last_Updated'])

In [None]:
# Confirm the drop: row count of the cleaned DataFrame after removing rows
# with a null Last_Updated.
num_rows = Century_bank_df_clean.count()

num_rows

In [None]:
# Confirm that no null values remain after cleaning.
# All per-column null counts are computed in ONE aggregation pass over the data,
# instead of launching a separate filter + count job for each column.
clean_null_counts = Century_bank_df_clean.select(
    [
        # when(...) yields 1 for null cells and null otherwise; count() ignores nulls,
        # so the result is the number of null cells in that column.
        count(when(Century_bank_df_clean[column_name].isNull(), 1)).alias(column_name)
        for column_name in Century_bank_df_clean.columns
    ]
).first()

# The aggregated row has one value per column, in the same order as .columns.
for column_name, null_total in zip(Century_bank_df_clean.columns, clean_null_counts):
    print(column_name, 'nulls', null_total)

In [None]:
# Overview of summary statistics for the cleaned data, printed as a text table.
Century_bank_df_clean.describe().show()

+-------+------------------+----------------+-------------+--------------------+-------------+--------------+----------------+------------+------------------+-------------------+--------------------+--------------------+--------------------+-------------+-----------------+--------+-------+---------+--------------------+-------+--------------+
|summary|            Amount|Transaction_Type|Customer_Name|    Customer_Address|Customer_City|Customer_State|Customer_Country|     Company|         Job_Title|              Email|        Phone_Number|  Credit_Card_Number|                IBAN|Currency_Code|    Random_Number|Category|  Group|Is_Active|         Description| Gender|Marital_Status|
+-------+------------------+----------------+-------------+--------------------+-------------+--------------+----------------+------------+------------------+-------------------+--------------------+--------------------+--------------------+-------------+-----------------+--------+-------+---------+----------

In [None]:
# Create a data model (entity-relationship diagram) using an appropriate tool, e.g. Lucidchart or draw.io


In [None]:
# Column names of the cleaned DataFrame, for reference when splitting it into tables.
Century_bank_df_clean.columns

### Table creation

In [None]:
# Transaction table
# Keep the transaction attributes, attach a generated Transaction_ID and
# place the ID first. IDs from monotonically_increasing_id() are unique and
# increasing, but not consecutive.
transaction_fields = ['Transaction_Date', 'Amount', 'Transaction_Type']
Transaction = (
    Century_bank_df_clean
    .select(*transaction_fields)
    .withColumn('Transaction_ID', monotonically_increasing_id())
    .select('Transaction_ID', *transaction_fields)
)

# Uncomment to preview the table:
# Transaction.show()

In [None]:
# Customer table
# .distinct() removes the redundancy that arises when one customer appears on
# several transactions; each remaining row then gets a generated Customer_ID,
# which is moved to the first position.
customer_fields = [
    'Customer_Name', 'Customer_Address', 'Customer_City',
    'Customer_State', 'Customer_Country', 'Email', 'Phone_Number',
]
Customer = (
    Century_bank_df_clean
    .select(*customer_fields)
    .distinct()
    .withColumn('Customer_ID', monotonically_increasing_id())
    .select('Customer_ID', *customer_fields)
)

# Uncomment to preview the table:
# Customer.show()

In [None]:
# Employee table
# Distinct combinations of the employment/demographic attributes, each given a
# generated Employee_ID that is placed as the first column.
employee_fields = ['Company', 'Job_Title', 'Gender', 'Marital_Status']
Employee = (
    Century_bank_df_clean
    .select(*employee_fields)
    .distinct()
    .withColumn('Employee_ID', monotonically_increasing_id())
    .select('Employee_ID', *employee_fields)
)

Employee.show()

In [None]:
# Fact table
# Start from the CLEANED frame: Customer, Transaction and Employee were all built
# from Century_bank_df_clean, so their key columns hold the 'unknown' placeholders.
# Joining the raw frame instead would leave every row with a null key unmatched
# (null IDs in the fact table) and would bring back the rows that were dropped
# for a missing Last_Updated.
# NOTE(review): Transaction is not de-duplicated, so source rows sharing the same
# (Transaction_Date, Amount, Transaction_Type) would match each other's IDs and
# multiply fact rows — confirm whether such duplicates occur in this dataset.
Fact_table = (
    Century_bank_df_clean
    # Look up the generated Customer_ID via the full set of customer attributes.
    .join(
        Customer,
        ['Customer_Name', 'Customer_Address', 'Customer_City',
         'Customer_State', 'Customer_Country', 'Email', 'Phone_Number'],
        'left',
    )
    # Look up the generated Transaction_ID via the transaction attributes.
    .join(Transaction, ['Transaction_Date', 'Amount', 'Transaction_Type'], 'left')
    # Look up the generated Employee_ID via the employment attributes.
    .join(Employee, ['Company', 'Job_Title', 'Gender', 'Marital_Status'], 'left')
    # Keep only the surrogate keys plus the remaining descriptive columns.
    .select(
        'Transaction_ID', 'Customer_ID', 'Employee_ID', 'Credit_Card_Number', 'IBAN',
        'Currency_Code', 'Random_Number', 'Category', 'Group', 'Is_Active',
        'Last_Updated', 'Description',
    )
)


Fact_table.show()


In [None]:
# output or save transformed data as csv file
#Transaction.repartition(1).write.mode('overwrite').option('header', 'True').csv(r'C:\Users\back2\Desktop\(PYSPARK case_study)/Transaction')
#Employee.repartition(1).write.mode('overwrite').option('header', 'True').csv(r'C:\Users\back2\Desktop\(PYSPARK case_study)/Employee')
#Customer.repartition(1).write.mode('overwrite').option('header', 'True').csv(r'C:\Users\back2\Desktop\(PYSPARK case_study)/Customer')
#Fact_table.repartition(1).write.mode('overwrite').option('header', 'True').csv(r'C:\Users\back2\Desktop\(PYSPARK case_study)/Fact_table')


In [None]:
# Convert each Spark DataFrame to a pandas DataFrame (conversion order:
# Transaction, Customer, Employee, Fact_table). toPandas() collects the full
# table into the driver's memory.
Transaction_pd_df, Customer_pd_df, Employee_pd_df, Fact_table_pd_df = (
    spark_table.toPandas()
    for spark_table in (Transaction, Customer, Employee, Fact_table)
)


In [None]:
# Define database parameters including the database name.
# Settings are read from environment variables (optionally populated from a local
# .env file by load_dotenv) so that the password is not stored in the notebook.
# Non-secret settings fall back to the values this notebook used before.
load_dotenv()
db_params = {
    'username': os.getenv('DB_USERNAME', 'postgres'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST', 'localhost'),
    'port': os.getenv('DB_PORT', '5432'),
    'database': os.getenv('DB_NAME', 'nuga_bank'),
}
if not db_params['password']:
    # Fail early with a clear message instead of attempting a connection without a password.
    raise RuntimeError('DB_PASSWORD is not set; define it in the environment or in a .env file')

# URL-escape the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
db_url = (
    f"postgresql://{quote_plus(db_params['username'])}:{quote_plus(db_params['password'])}"
    f"@{db_params['host']}:{db_params['port']}/{db_params['database']}"
)
# create the database engine with the db url
engine = create_engine(db_url)

# Connect to the PostgreSQL server inside an explicit transaction: engine.begin()
# commits when the block finishes without error and rolls back otherwise, so the
# four tables are written together or not at all.
with engine.begin() as connection:
    # create table and load the data (an existing table of the same name is replaced)
    Transaction_pd_df.to_sql('Transaction', connection, index=False, if_exists='replace')
    Customer_pd_df.to_sql('Customer', connection, index=False, if_exists='replace')
    Employee_pd_df.to_sql('Employee', connection, index=False, if_exists='replace')
    Fact_table_pd_df.to_sql('Fact_table', connection, index=False, if_exists='replace')
print('successfull')