#Creating External Stage

In [0]:
access_key =  ""
secret_key = ""
encoded_secret_key = secret_key.replace("/", "%2F")
aws_bucket_name = "{}"
mount_name = "databricks_external_stage"

dbutils.fs.mount(f"s3a://{access_key}:{encoded_secret_key}@{aws_bucket_name}", f"/mnt/{mount_name}")

True

## **List files in external stage**

In [0]:
display(dbutils.fs.ls(f"/mnt/databricks_external_stage"))

path,name,size,modificationTime
dbfs:/mnt/databricks_external_stage/input_folder/,input_folder/,0,1741672769989
dbfs:/mnt/databricks_external_stage/output_folder/,output_folder/,0,1741672769989
dbfs:/mnt/databricks_external_stage/product_reviews_input/,product_reviews_input/,0,1741672769989
dbfs:/mnt/databricks_external_stage/product_reviews_output/,product_reviews_output/,0,1741672769989


## **Recursive File Listing**

In [0]:
def list_files_recursive(path):
    files = dbutils.fs.ls(path)
    for file in files:
        print(file.path)  # Print current file/folder
        if file.isDir():  # If it's a directory, recurse into it
            list_files_recursive(file.path)

# Call function for the root path
list_files_recursive("/mnt/databricks_external_stage")

dbfs:/mnt/databricks_external_stage/input_folder/
dbfs:/mnt/databricks_external_stage/input_folder/Versicolor.csv
dbfs:/mnt/databricks_external_stage/input_folder/Virginica.csv
dbfs:/mnt/databricks_external_stage/output_folder/
dbfs:/mnt/databricks_external_stage/product_reviews_input/
dbfs:/mnt/databricks_external_stage/product_reviews_input/product_reviews.csv
dbfs:/mnt/databricks_external_stage/product_reviews_output/


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
df=spark.read.format('csv').option('header',True).option('inferSchema',True).load('/mnt/databricks_external_stage/input_folder/')
df.show(truncate=False)

+---+------------+-----------+------------+-----------+---------------+
|Id |SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|CLASS_NAME     |
+---+------------+-----------+------------+-----------+---------------+
|1  |7.0         |3.2        |4.7         |1.4        |Iris-versicolor|
|2  |6.4         |3.2        |4.5         |1.5        |Iris-versicolor|
|3  |6.9         |3.1        |4.9         |1.5        |Iris-versicolor|
|4  |5.5         |2.3        |4.0         |1.3        |Iris-versicolor|
|5  |6.5         |2.8        |4.6         |1.5        |Iris-versicolor|
|6  |5.7         |2.8        |4.5         |1.3        |Iris-versicolor|
|7  |6.3         |3.3        |4.7         |1.6        |Iris-versicolor|
|8  |4.9         |2.4        |3.3         |1.0        |Iris-versicolor|
|9  |6.6         |2.9        |4.6         |1.3        |Iris-versicolor|
|10 |5.2         |2.7        |3.9         |1.4        |Iris-versicolor|
|11 |5.0         |2.0        |3.5         |1.0        |Iris-vers

In [0]:
df.select(input_file_name()).distinct().show(truncate=False)

+---------------------------------------------------------------+
|input_file_name()                                              |
+---------------------------------------------------------------+
|dbfs:/mnt/databricks_external_stage/input_folder/Versicolor.csv|
|dbfs:/mnt/databricks_external_stage/input_folder/Virginica.csv |
+---------------------------------------------------------------+



In [0]:
df.write.format("delta").mode("append").option("path","/mnt/databricks_external_stage/output_folder/").saveAsTable(f"hive_metastore.default.delta_table_1")


In [0]:
%sql
select * from hive_metastore.default.delta_table_1 limit 10;

Id,SEPAL_LENGTH,SEPAL_WIDTH,PETAL_LENGTH,PETAL_WIDTH,CLASS_NAME
1,7.0,3.2,4.7,1.4,Iris-versicolor
2,6.4,3.2,4.5,1.5,Iris-versicolor
3,6.9,3.1,4.9,1.5,Iris-versicolor
4,5.5,2.3,4.0,1.3,Iris-versicolor
5,6.5,2.8,4.6,1.5,Iris-versicolor
6,5.7,2.8,4.5,1.3,Iris-versicolor
7,6.3,3.3,4.7,1.6,Iris-versicolor
8,4.9,2.4,3.3,1.0,Iris-versicolor
9,6.6,2.9,4.6,1.3,Iris-versicolor
10,5.2,2.7,3.9,1.4,Iris-versicolor


In [0]:
%sql
select count(*) from hive_metastore.default.delta_table_1;

count(1)
101


In [0]:
%sql
update hive_metastore.default.delta_table_1 set CLASS_NAME=upper(CLASS_NAME);

num_affected_rows
101


In [0]:
%sql
select * from hive_metastore.default.delta_table_1 limit 10;

Id,SEPAL_LENGTH,SEPAL_WIDTH,PETAL_LENGTH,PETAL_WIDTH,CLASS_NAME
1,6.3,3.3,6.0,2.5,IRIS-VIRGINICA
2,5.8,2.7,5.1,1.9,IRIS-VIRGINICA
3,7.1,3.0,5.9,2.1,IRIS-VIRGINICA
4,6.3,2.9,5.6,1.8,IRIS-VIRGINICA
5,6.5,3.0,5.8,2.2,IRIS-VIRGINICA
6,7.6,3.0,6.6,2.1,IRIS-VIRGINICA
7,4.9,2.5,4.5,1.7,IRIS-VIRGINICA
8,7.3,2.9,6.3,1.8,IRIS-VIRGINICA
9,6.7,2.5,5.8,1.8,IRIS-VIRGINICA
10,7.2,3.6,6.1,2.5,IRIS-VIRGINICA


In [0]:
%sql
--drop table hive_metastore.default.reviews;
CREATE EXTERNAL TABLE hive_metastore.default.reviews (
    ID STRING,
    Product_Name STRING,
    Product_ID STRING,
    Reviewer_Name STRING,
    Review_Date DATE,
    Review STRING
)
USING com.databricks.spark.csv
OPTIONS (header "true")
LOCATION '/mnt/databricks_external_stage/product_reviews_input/';


In [0]:
%sql
select * from hive_metastore.default.reviews limit 10

ID,Product_Name,Product_ID,Reviewer_Name,Review_Date,Review
1,Wireless Mouse,P001,Alice Johnson,2024-03-01,Great mouse! Smooth performance and excellent battery life. Highly recommend.
2,Bluetooth Speaker,P002,John Doe,2024-03-05,"Decent sound quality, but the bass is weak. Expected better for the price."
3,Mechanical Keyboard,P003,Sarah Lee,2024-03-10,Fantastic keyboard! The tactile feedback is amazing. Perfect for gaming.
4,Noise Cancelling Headphones,P004,Mike Brown,2024-03-15,"Noise cancellation is good, but the ear cups are a bit uncomfortable for long use."
5,Smartwatch,P005,Emma Wilson,2024-03-20,"Very stylish and feature-rich. Battery life could be better, but still a great buy!"


In [0]:
%sql
SELECT ai_analyze_sentiment('I am happy');

ai_analyze_sentiment(I am happy)
positive


In [0]:
%sql
SELECT ai_analyze_sentiment("It's very bad weather outside");

ai_analyze_sentiment(It's very bad weather outside)
negative


In [0]:
%sql
select *,ai_analyze_sentiment(Review) as sentiment from hive_metastore.default.reviews;

ID,Product_Name,Product_ID,Reviewer_Name,Review_Date,Review,sentiment
1,Wireless Mouse,P001,Alice Johnson,2024-03-01,Great mouse! Smooth performance and excellent battery life. Highly recommend.,positive
2,Bluetooth Speaker,P002,John Doe,2024-03-05,"Decent sound quality, but the bass is weak. Expected better for the price.",mixed
3,Mechanical Keyboard,P003,Sarah Lee,2024-03-10,Fantastic keyboard! The tactile feedback is amazing. Perfect for gaming.,positive
4,Noise Cancelling Headphones,P004,Mike Brown,2024-03-15,"Noise cancellation is good, but the ear cups are a bit uncomfortable for long use.",mixed
5,Smartwatch,P005,Emma Wilson,2024-03-20,"Very stylish and feature-rich. Battery life could be better, but still a great buy!",mixed


In [0]:
# Run SQL query and store the result in a DataFrame
df = spark.sql("""
    SELECT *, ai_analyze_sentiment(Review) AS sentiment
    FROM hive_metastore.default.reviews
""")

# Define the S3 path where the Delta table will be stored
s3_path = "/mnt/databricks_external_stage/product_reviews_output/"

# Save the DataFrame as Delta format in S3
df.coalesce(1).write.format("delta").mode("overwrite").save(s3_path)