In [17]:
# Import
from pyspark.sql import SparkSession
import os

# Obtain the Spark session for this notebook (an already-active session is reused)
spark = (
    SparkSession.builder
    .appName("Bankstatementsprocessing")
    .getOrCreate()
)


In [18]:
# Folder containing the bank-statement CSV files
folder_path = "C:/Projects/Finances/Bank statements/hdfc"

# Collect the CSV files in that folder. os.listdir returns entries in
# arbitrary order, so sort them to keep the input list stable between runs.
csv_files = sorted(
    os.path.join(folder_path, name)
    for name in os.listdir(folder_path)
    if name.endswith('.csv')
)

# Fail early with a clear message instead of handing Spark an empty path list.
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {folder_path!r}")

# Load all files into a single DataFrame: the first line of each file supplies
# the column names, and column types are inferred from the data.
df = spark.read.csv(csv_files, header=True, inferSchema=True)

In [19]:
# Total number of rows in the DataFrame
df.count()

1757

In [20]:
# Print each column's name, data type and nullability
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Value_date: string (nullable = true)
 |-- Debit: double (nullable = true)
 |-- Credit: double (nullable = true)
 |-- Ref_No: string (nullable = true)
 |-- Balance: double (nullable = true)



Checking the column metadata and viewing the rows where the issue occurs

In [12]:
# Get the DataFrame schema as a StructType object
schema = df.schema

# Look up the StructField describing the "Debit" column
# (indexing the schema by name returns that column's field definition)
column_metadata = schema["Debit"]

# Print the field (name, data type, nullability) to see which type was inferred.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours and its recorded output reports StringType, while printSchema()
# reports double for "Debit" -- re-run the notebook top to bottom to refresh it.
print("Column Metadata:", column_metadata)

Column Metadata: StructField('Debit', StringType(), True)


In [21]:
from pyspark.sql.functions import col

# Cast the "Debit" column to double so it is handled as a numeric column
df = df.withColumn("Debit", col("Debit").cast("double"))

# Confirm the resulting data type by printing the schema
df.printSchema()

# Count the rows whose "Debit" value is null and report the total
na_count = df.where(col("Debit").isNull()).count()
print("Number of NA values in 'Debit':", na_count)


root
 |-- Date: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Value_date: string (nullable = true)
 |-- Debit: double (nullable = true)
 |-- Credit: double (nullable = true)
 |-- Ref_No: string (nullable = true)
 |-- Balance: double (nullable = true)

Number of NA values in 'Debit': 0


In [22]:
# Display the rows whose "Debit" value is null
df.where(col("Debit").isNull()).show()

+----+-----------+----------+-----+------+------+-------+
|Date|Description|Value_date|Debit|Credit|Ref_No|Balance|
+----+-----------+----------+-----+------+------+-------+
+----+-----------+----------+-----+------+------+-------+

