In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import os

In [317]:
spark = SparkSession.builder \
    .appName("Prac3") \
    .master("local[*]") \
    .getOrCreate()

In [318]:
my_schema = "row_id INT,name STRING, age INT, dept STRING, CorruptedRecord STRING"

In [319]:
file_path = "/home/somnath/my_vscode_project/input_data/corrupted_data1.csv"


In [320]:
!ls /home/somnath/my_vscode_project/

 README.md				        input_data
 basics					        output_data
'besic airflow project'			        psql.txt
'besic project of integration'		        test_ecommerce.sql
 databrick-mini-course-end-to-end-project.zip  'txt to voice.py'
'databricks_project 2'			        venv
'directory path.py'


READ MODE

In [321]:
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("mode", "PERMISSIVE") \
    .option("inferSchema", "false") \
    .schema(my_schema) \
    .load(file_path)

In [322]:
df.collect()

26/01/24 16:09:54 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 5
CSV file: file:///home/somnath/my_vscode_project/input_data/corrupted_data1.csv


[Row(row_id=1, name='Alice', age=None, dept='30', CorruptedRecord='IT'),
 Row(row_id=2, name='Bob', age=None, dept='HR', CorruptedRecord=None),
 Row(row_id=3, name='Charlie', age=28, dept='12121', CorruptedRecord='IT'),
 Row(row_id=4, name='Alice', age=30, dept='IT', CorruptedRecord='Charlie'),
 Row(row_id=5, name='Eve', age=35, dept=None, CorruptedRecord=None),
 Row(row_id=6, name='Frank', age=None, dept='40', CorruptedRecord='Finance'),
 Row(row_id=7, name='Grace', age=None, dept='HR', CorruptedRecord=None),
 Row(row_id=8, name='Bob', age=None, dept='HR', CorruptedRecord=None),
 Row(row_id=9, name='Heidi', age=29, dept='IT', CorruptedRecord='Charlie'),
 Row(row_id=10, name='Ivan', age=31, dept='Security', CorruptedRecord=None),
 Row(row_id=11, name='Judy', age=27, dept='IT', CorruptedRecord=None),
 Row(row_id=12, name='Mallory', age=45, dept='Finance', CorruptedRecord=None),
 Row(row_id=13, name='Oscar', age=None, dept='HR', CorruptedRecord=None),
 Row(row_id=14, name='Peggy', age=34

In [323]:
df.count()

40

In [324]:
df.printSchema()

root
 |-- row_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- dept: string (nullable = true)
 |-- CorruptedRecord: string (nullable = true)



In [325]:
df.show(40,truncate=False)


+------+----------------+----+--------+---------------+
|row_id|name            |age |dept    |CorruptedRecord|
+------+----------------+----+--------+---------------+
|1     |Alice           |NULL|30      |IT             |
|2     |Bob             |NULL|HR      |NULL           |
|3     |Charlie         |28  |12121   |IT             |
|4     |Alice           |30  |IT      |Charlie        |
|5     |Eve             |35  |NULL    |NULL           |
|6     |Frank           |NULL|40      |Finance        |
|7     |Grace           |NULL|HR      |NULL           |
|8     |Bob             |NULL|HR      |NULL           |
|9     |Heidi           |29  |IT      |Charlie        |
|10    |Ivan            |31  |Security|NULL           |
|11    |Judy            |27  |IT      |NULL           |
|12    |Mallory         |45  |Finance |NULL           |
|13    |Oscar           |NULL|HR      |NULL           |
|14    |Peggy           |34  |323     |IT             |
|15    |Trent           |38  |Security|NULL     

26/01/24 16:09:54 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 5
CSV file: file:///home/somnath/my_vscode_project/input_data/corrupted_data1.csv


In [326]:
output_dir = "/home/somnath/my_vscode_project/output_data/bad_rows"
os.makedirs(output_dir, exist_ok=True)
print("\nðŸ’¾ Writing to CSV...")
(df.coalesce(1)
   .write
   .mode("overwrite")
   .option("header", "true") # You can use .option() or .csv(header=True)
   .json(f"{output_dir}") 
)
print(f"âœ… JSON folder created at: {output_dir}")


ðŸ’¾ Writing to CSV...


26/01/24 16:09:55 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 5
CSV file: file:///home/somnath/my_vscode_project/input_data/corrupted_data1.csv


âœ… JSON folder created at: /home/somnath/my_vscode_project/output_data/bad_rows


In [327]:
bad_rows_df = spark.read.format("json").load(output_dir)
bad_rows_df.show(truncate=False)

+---------------+----+--------+-------+------+
|CorruptedRecord|age |dept    |name   |row_id|
+---------------+----+--------+-------+------+
|IT             |NULL|30      |Alice  |1     |
|NULL           |NULL|HR      |Bob    |2     |
|IT             |28  |12121   |Charlie|3     |
|Charlie        |30  |IT      |Alice  |4     |
|NULL           |35  |NULL    |Eve    |5     |
|Finance        |NULL|40      |Frank  |6     |
|NULL           |NULL|HR      |Grace  |7     |
|NULL           |NULL|HR      |Bob    |8     |
|Charlie        |29  |IT      |Heidi  |9     |
|NULL           |31  |Security|Ivan   |10    |
|NULL           |27  |IT      |Judy   |11    |
|NULL           |45  |Finance |Mallory|12    |
|NULL           |NULL|HR      |Oscar  |13    |
|IT             |34  |323     |Peggy  |14    |
|NULL           |38  |Security|Trent  |15    |
|NULL           |41  |Finance |Victor |16    |
|NULL           |NULL|NULL    |Walter |17    |
|NULL           |30  |IT      |Alice  |18    |
|NULL        

In [328]:
spark.stop()