In [0]:
# handlining multiline records in csv file
# define schema
from pyspark.sql.types import *
schema = StructType([StructField("ID",IntegerType(),True),
                     StructField("Name",StringType(),True),
                     StructField("Description",StringType(),True) 
                     ])




In [0]:
df = spark.read.format("csv").schema(schema).load("/FileStore/tables/multiline.csv")

display(df)


ID,Name,Description
,Name,Description
1.0,Apple,A red fruit
,,
,,
2.0,Banana,A yellow fruit
,,
,,
3.0,Cherry,A small red fruit
,,
,,


In [0]:
# in the above sample csv file, the descrption column was in multine,hence the complete description was trimmed and was showing as null records...which is not the correct data. Am rading the csv file with multiline True option.

In [0]:
# Path to the CSV file
csv_file_path = '/FileStore/tables/multiline.csv'

# Read the multiline CSV file into a DataFrame
df = spark.read.format("csv") \
               .option("header", "true") \
               .option("multiline", "true") \
               .option("escape", "\"") \
               .option("quote", "\"") \
               .option("inferSchema", "true") \
               .load(csv_file_path)

In [0]:
display(df)

ID,Name,Description
1,Apple,A red fruit that is sweet and crisp.
2,Banana,A yellow fruit that is soft and sweet.
3,Cherry,A small red fruit that is tart and juicy.


In [0]:
#Path to the CSV file: Define the path to the CSV file.
#Read the multiline CSV file into a DataFrame:
#Use read.format("csv") to specify that we are reading a CSV file.
#Use the option method to set various options for reading the CSV file:
#"header", "true": Indicates that the first line of the CSV file contains the header.
#"multiline", "true": Enables reading multiline records.
#"escape", "\"": Specifies the escape character to handle double quotes in the text.
#"quote", "\"": Specifies the quote character.
#"inferSchema", "true": Infers the schema of the DataFrame.
#Use the load method to read the CSV file into a DataFrame.
#Display the DataFrame: 

In [0]:
csv_file_path = '/FileStore/tables/Category.csv'

# Read the multiline CSV file into a DataFrame
new_df = spark.read.format("csv") \
               .option("header", "true") \
               .option("multiline", "true") \
               .option("escape", "\"") \
               .option("quote", "\"") \
               .option("inferSchema", "true") \
               .load(csv_file_path)

In [0]:
new_df.show()

+---+---------+
| ID| Category|
+---+---------+
|  1|    Fruit|
|  2|    Fruit|
|  3|    Fruit|
|  4|Vegetable|
+---+---------+



In [0]:
## Perform the join operation
joined_df = df.join(new_df, on='ID', how='inner')

In [0]:
joined_df.show()

+---+------+--------------------+--------+
| ID|  Name|         Description|Category|
+---+------+--------------------+--------+
|  1| Apple|A red fruit\nthat...|   Fruit|
|  2|Banana|A yellow fruit\nt...|   Fruit|
|  3|Cherry|A small red fruit...|   Fruit|
+---+------+--------------------+--------+



In [0]:
display(joined_df)

ID,Name,Description,Category
1,Apple,A red fruit that is sweet and crisp.,Fruit
2,Banana,A yellow fruit that is soft and sweet.,Fruit
3,Cherry,A small red fruit that is tart and juicy.,Fruit


In [0]:
full_outer_join_df = df.join(new_df, on='ID', how='outer')
print("Full Outer Join:")
full_outer_join_df.show(truncate=False)

Full Outer Join:
+---+------+-------------------------------------------+---------+
|ID |Name  |Description                                |Category |
+---+------+-------------------------------------------+---------+
|1  |Apple |A red fruit\nthat is sweet and\ncrisp.     |Fruit    |
|2  |Banana|A yellow fruit\nthat is soft and\nsweet.   |Fruit    |
|3  |Cherry|A small red fruit\nthat is tart and\njuicy.|Fruit    |
|4  |null  |null                                       |Vegetable|
+---+------+-------------------------------------------+---------+



In [0]:
# Right join
right_join_df = df.join(new_df, on='ID', how='right')
print("Right Join:")
right_join_df.show(truncate=False)

Right Join:
+---+------+-------------------------------------------+---------+
|ID |Name  |Description                                |Category |
+---+------+-------------------------------------------+---------+
|1  |Apple |A red fruit\nthat is sweet and\ncrisp.     |Fruit    |
|2  |Banana|A yellow fruit\nthat is soft and\nsweet.   |Fruit    |
|3  |Cherry|A small red fruit\nthat is tart and\njuicy.|Fruit    |
|4  |null  |null                                       |Vegetable|
+---+------+-------------------------------------------+---------+



In [0]:
from pyspark.sql.functions import col, length, upper, concat

In [0]:
# Concatenate Name and Description into a new column
concat_df = df.withColumn('Name_Description', concat(col('Name'), col('Description')))
print("DataFrame with Concatenated Name and Description:")
concat_df.show(truncate=False)

DataFrame with Concatenated Name and Description:
+---+------+-------------------------------------------+-------------------------------------------------+
|ID |Name  |Description                                |Name_Description                                 |
+---+------+-------------------------------------------+-------------------------------------------------+
|1  |Apple |A red fruit\nthat is sweet and\ncrisp.     |AppleA red fruit\nthat is sweet and\ncrisp.      |
|2  |Banana|A yellow fruit\nthat is soft and\nsweet.   |BananaA yellow fruit\nthat is soft and\nsweet.   |
|3  |Cherry|A small red fruit\nthat is tart and\njuicy.|CherryA small red fruit\nthat is tart and\njuicy.|
+---+------+-------------------------------------------+-------------------------------------------------+



In [0]:
# Filter rows where the Name column is 'Apple'
filtered_df = df.filter(col('Name') == 'Apple')
print("Filtered DataFrame (Name == 'Apple'):")
filtered_df.show(truncate=False)

Filtered DataFrame (Name == 'Apple'):
+---+-----+--------------------------------------+
|ID |Name |Description                           |
+---+-----+--------------------------------------+
|1  |Apple|A red fruit\nthat is sweet and\ncrisp.|
+---+-----+--------------------------------------+



In [0]:
# Add a new column with the length of the Description
length_df = df.withColumn('Description_Length', length(col('Description')))
print("DataFrame with Description Length:")
length_df.show(truncate=False)

DataFrame with Description Length:
+---+------+-------------------------------------------+------------------+
|ID |Name  |Description                                |Description_Length|
+---+------+-------------------------------------------+------------------+
|1  |Apple |A red fruit\nthat is sweet and\ncrisp.     |36                |
|2  |Banana|A yellow fruit\nthat is soft and\nsweet.   |38                |
|3  |Cherry|A small red fruit\nthat is tart and\njuicy.|41                |
+---+------+-------------------------------------------+------------------+



In [0]:
# Group by Category and count the number of rows in each group
joined_df = df.join(new_df, on='ID', how='inner')
grouped_df = joined_df.groupBy('Category').count()
print("Grouped by Category and Count:")
grouped_df.show(truncate=False)

Grouped by Category and Count:
+--------+-----+
|Category|count|
+--------+-----+
|Fruit   |3    |
+--------+-----+

